• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "./vp8enci.h"
19 #include "./cost.h"
20 
21 #define DO_TRELLIS_I4  1
22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
25 
26 #define MID_ALPHA 64      // neutral value for susceptibility
27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
29 
30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
31                           // power-law modulation. Must be strictly less than 1.
32 
33 #define I4_PENALTY 14000  // Rate-penalty for quick i4/i16 decision
34 
35 // number of non-zero coeffs below which we consider the block very flat
36 // (and apply a penalty to complex predictions)
37 #define FLATNESS_LIMIT_I16 10      // I16 mode
38 #define FLATNESS_LIMIT_I4  3       // I4 mode
39 #define FLATNESS_LIMIT_UV  2       // UV mode
40 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
41 
42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
43 
44 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
45 
46 // #define DEBUG_BLOCK
47 
48 //------------------------------------------------------------------------------
49 
#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

// Debug-only helper: dumps the source samples, the reconstructed samples and
// their absolute difference for the macroblock currently pointed to by 'it',
// followed by the quantized levels stored in 'rd'.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    // U samples occupy columns [0,8), V samples columns [8,16) of the cache.
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // in i16 mode, coeff #0 (DC) is stored in y_dc_levels, so skip it here.
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK
124 
125 //------------------------------------------------------------------------------
126 
clip(int v,int m,int M)127 static WEBP_INLINE int clip(int v, int m, int M) {
128   return v < m ? m : v > M ? M : v;
129 }
130 
// Zigzag scan order: maps scan position -> raster index within a 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step, indexed by the quantizer index in [0, 127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step, indexed by the quantizer index in [0, 127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// Alternate AC quantizer step used for the Y2 (luma-DC / WHT) block.
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
205 
206 //------------------------------------------------------------------------------
207 // Initialize quantization parameters in VP8Matrix
208 
209 // Returns the average quantizer
ExpandMatrix(VP8Matrix * const m,int type)210 static int ExpandMatrix(VP8Matrix* const m, int type) {
211   int i, sum;
212   for (i = 0; i < 2; ++i) {
213     const int is_ac_coeff = (i > 0);
214     const int bias = kBiasMatrices[type][is_ac_coeff];
215     m->iq_[i] = (1 << QFIX) / m->q_[i];
216     m->bias_[i] = BIAS(bias);
217     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
218     //   * zero if coeff <= zthresh
219     //   * non-zero if coeff > zthresh
220     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
221   }
222   for (i = 2; i < 16; ++i) {
223     m->q_[i] = m->q_[1];
224     m->iq_[i] = m->iq_[1];
225     m->bias_[i] = m->bias_[1];
226     m->zthresh_[i] = m->zthresh_[1];
227   }
228   for (sum = 0, i = 0; i < 16; ++i) {
229     if (type == 0) {  // we only use sharpening for AC luma coeffs
230       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
231     } else {
232       m->sharpen_[i] = 0;
233     }
234     sum += m->q_[i];
235   }
236   return (sum + 8) >> 4;
237 }
238 
// Computes, for every active segment, the y1/y2/uv quantization matrices and
// the lambda values used by the rate-distortion decisions.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // note: the UV-DC quantizer index saturates at 117, not 127.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    // lambdas scale roughly with the square of the average quantizer step.
    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv * quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}
275 
276 //------------------------------------------------------------------------------
277 // Initialize filtering parameters
278 
279 // Very small filter-strength values have close to no visual effect. So we can
280 // save a little decoding-CPU by turning filtering off for these.
281 #define FSTRENGTH_CUTOFF 2
282 
SetupFilterStrength(VP8Encoder * const enc)283 static void SetupFilterStrength(VP8Encoder* const enc) {
284   int i;
285   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
286   const int level0 = 5 * enc->config_->filter_strength;
287   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
288     VP8SegmentInfo* const m = &enc->dqm_[i];
289     // We focus on the quantization of AC coeffs.
290     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
291     const int base_strength =
292         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
293     // Segments with lower complexity ('beta') will be less filtered.
294     const int f = base_strength * level0 / (256 + m->beta_);
295     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
296   }
297   // We record the initial strength (mainly for the case of 1-segment only).
298   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
299   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
300   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
301 }
302 
//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)
309 
310 // We want to emulate jpeg-like behaviour where the expected "good" quality
311 // is around q=75. Internally, our "good" middle is around c=50. So we
312 // map accordingly using linear piece-wise function
QualityToCompression(double c)313 static double QualityToCompression(double c) {
314   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
315   // The file size roughly scales as pow(quantizer, 3.). Actually, the
316   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
317   // in the mid-quant range. So we scale the compressibility inversely to
318   // this power-law: quant ~= compression ^ 1/3. This law holds well for
319   // low quant. Finer modeling for high-quant would make use of kAcTable[]
320   // more explicitly.
321   const double v = pow(linear_c, 1 / 3.);
322   return v;
323 }
324 
QualityToJPEGCompression(double c,double alpha)325 static double QualityToJPEGCompression(double c, double alpha) {
326   // We map the complexity 'alpha' and quality setting 'c' to a compression
327   // exponent empirically matched to the compression curve of libjpeg6b.
328   // On average, the WebP output size will be roughly similar to that of a
329   // JPEG file compressed with same quality factor.
330   const double amin = 0.30;
331   const double amax = 0.85;
332   const double exp_min = 0.4;
333   const double exp_max = 0.9;
334   const double slope = (exp_min - exp_max) / (amax - amin);
335   // Linearly interpolate 'expn' from exp_min to exp_max
336   // in the [amin, amax] range.
337   const double expn = (alpha > amax) ? exp_min
338                     : (alpha < amin) ? exp_max
339                     : exp_max + slope * (alpha - amin);
340   const double v = pow(c, expn);
341   return v;
342 }
343 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)344 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
345                                  const VP8SegmentInfo* const S2) {
346   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
347 }
348 
// Merges segments that ended up with equivalent parameters (see
// SegmentsAreEquivalent) and remaps the per-macroblock segment ids.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;   // 's2' is the surviving segment 's1' is folded into
    if (!found) {
      if (num_final_segments != s1) {   // compact the surviving segments
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
383 
// Computes the per-segment quantizers from the target 'quality' in [0, 100]
// (modulated by each segment's susceptibility 'alpha_'), derives the UV
// quantizer deltas, then finalizes filter strengths and quant matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
438 
439 //------------------------------------------------------------------------------
440 // Form the predictions in cache
441 
442 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
443 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
444 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
445 
446 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
447 const int VP8I4ModeOffsets[NUM_BMODES] = {
448   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
449 };
450 
VP8MakeLuma16Preds(const VP8EncIterator * const it)451 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
452   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
453   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
454   VP8EncPredLuma16(it->yuv_p_, left, top);
455 }
456 
VP8MakeChroma8Preds(const VP8EncIterator * const it)457 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
458   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
459   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
460   VP8EncPredChroma8(it->yuv_p_, left, top);
461 }
462 
VP8MakeIntra4Preds(const VP8EncIterator * const it)463 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
464   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
465 }
466 
467 //------------------------------------------------------------------------------
468 // Quantize
469 
470 // Layout:
471 // +----+----+
472 // |YYYY|UUVV| 0
473 // |YYYY|UUVV| 4
474 // |YYYY|....| 8
475 // |YYYY|....| 12
476 // +----+----+
477 
478 const int VP8Scan[16] = {  // Luma
479   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
480   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
481   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
482   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
483 };
484 
485 static const int VP8ScanUV[4 + 4] = {
486   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
487   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
488 };
489 
//------------------------------------------------------------------------------
// Distortion measurement

// Per-frequency weights for the texture-distortion metric.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-frequency weights used by the trellis distortion term; flat (all 16)
// when the texture-distortion feature is compiled out.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
507 
508 // Init/Copy the common fields in score.
InitScore(VP8ModeScore * const rd)509 static void InitScore(VP8ModeScore* const rd) {
510   rd->D  = 0;
511   rd->SD = 0;
512   rd->R  = 0;
513   rd->H  = 0;
514   rd->nz = 0;
515   rd->score = MAX_COST;
516 }
517 
// Copies all score fields from 'src' into 'dst'.
static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}
526 
// Accumulates the score fields of 'src' into 'dst'.
static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}
535 
536 //------------------------------------------------------------------------------
537 // Performs trellis-optimized quantization.
538 
539 // Trellis node
540 typedef struct {
541   int8_t prev;            // best previous node
542   int8_t sign;            // sign of coeff_i
543   int16_t level;          // level
544 } Node;
545 
546 // Score state
547 typedef struct {
548   score_t score;          // partial RD score
549   const uint16_t* costs;  // shortcut to cost tables
550 } ScoreState;
551 
552 // If a coefficient was quantized to a value Q (using a neutral bias),
553 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
554 // We don't test negative values though.
555 #define MIN_DELTA 0   // how much lower level to try
556 #define MAX_DELTA 1   // how much higher
557 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
558 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
559 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
560 
SetRDScore(int lambda,VP8ModeScore * const rd)561 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
562   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
563 }
564 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)565 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
566                                           score_t distortion) {
567   return rate * lambda + RD_DISTO_MULT * distortion;
568 }
569 
TrellisQuantizeBlock(const VP8Encoder * const enc,int16_t in[16],int16_t out[16],int ctx0,int coeff_type,const VP8Matrix * const mtx,int lambda)570 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
571                                 int16_t in[16], int16_t out[16],
572                                 int ctx0, int coeff_type,
573                                 const VP8Matrix* const mtx,
574                                 int lambda) {
575   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
576   CostArrayPtr const costs =
577       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
578   const int first = (coeff_type == 0) ? 1 : 0;
579   Node nodes[16][NUM_NODES];
580   ScoreState score_states[2][NUM_NODES];
581   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
582   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
583   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
584   score_t best_score;
585   int n, m, p, last;
586 
587   {
588     score_t cost;
589     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
590     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
591 
592     // compute the position of the last interesting coefficient
593     last = first - 1;
594     for (n = 15; n >= first; --n) {
595       const int j = kZigzag[n];
596       const int err = in[j] * in[j];
597       if (err > thresh) {
598         last = n;
599         break;
600       }
601     }
602     // we don't need to go inspect up to n = 16 coeffs. We can just go up
603     // to last + 1 (inclusive) without losing much.
604     if (last < 15) ++last;
605 
606     // compute 'skip' score. This is the max score one can do.
607     cost = VP8BitCost(0, last_proba);
608     best_score = RDScoreTrellis(lambda, cost, 0);
609 
610     // initialize source node.
611     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
612       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
613       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
614       ss_cur[m].costs = costs[first][ctx0];
615     }
616   }
617 
618   // traverse trellis.
619   for (n = first; n <= last; ++n) {
620     const int j = kZigzag[n];
621     const uint32_t Q  = mtx->q_[j];
622     const uint32_t iQ = mtx->iq_[j];
623     const uint32_t B = BIAS(0x00);     // neutral bias
624     // note: it's important to take sign of the _original_ coeff,
625     // so we don't have to consider level < 0 afterward.
626     const int sign = (in[j] < 0);
627     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
628     int level0 = QUANTDIV(coeff0, iQ, B);
629     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
630 
631     {   // Swap current and previous score states
632       ScoreState* const tmp = ss_cur;
633       ss_cur = ss_prev;
634       ss_prev = tmp;
635     }
636 
637     // test all alternate level values around level0.
638     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
639       Node* const cur = &NODE(n, m);
640       int level = level0 + m;
641       const int ctx = (level > 2) ? 2 : level;
642       const int band = VP8EncBands[n + 1];
643       score_t base_score, last_pos_score;
644       score_t best_cur_score = MAX_COST;
645       int best_prev = 0;   // default, in case
646 
647       ss_cur[m].score = MAX_COST;
648       ss_cur[m].costs = costs[n + 1][ctx];
649       if (level > MAX_LEVEL || level < 0) {   // node is dead?
650         continue;
651       }
652 
653       // Compute extra rate cost if last coeff's position is < 15
654       {
655         const score_t last_pos_cost =
656             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
657         last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
658       }
659 
660       {
661         // Compute delta_error = how much coding this level will
662         // subtract to max_error as distortion.
663         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
664         const int new_error = coeff0 - level * Q;
665         const int delta_error =
666             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
667         base_score = RDScoreTrellis(lambda, 0, delta_error);
668       }
669 
670       // Inspect all possible non-dead predecessors. Retain only the best one.
671       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
672         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
673         // eliminated since their score can't be better than the current best.
674         const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
675         // Examine node assuming it's a non-terminal one.
676         const score_t score =
677             base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
678         if (score < best_cur_score) {
679           best_cur_score = score;
680           best_prev = p;
681         }
682       }
683       // Store best finding in current node.
684       cur->sign = sign;
685       cur->level = level;
686       cur->prev = best_prev;
687       ss_cur[m].score = best_cur_score;
688 
689       // Now, record best terminal node (and thus best entry in the graph).
690       if (level != 0) {
691         const score_t score = best_cur_score + last_pos_score;
692         if (score < best_score) {
693           best_score = score;
694           best_path[0] = n;                     // best eob position
695           best_path[1] = m;                     // best node index
696           best_path[2] = best_prev;             // best predecessor
697         }
698       }
699     }
700   }
701 
702   // Fresh start
703   memset(in + first, 0, (16 - first) * sizeof(*in));
704   memset(out + first, 0, (16 - first) * sizeof(*out));
705   if (best_path[0] == -1) {
706     return 0;   // skip!
707   }
708 
709   {
710     // Unwind the best path.
711     // Note: best-prev on terminal node is not necessarily equal to the
712     // best_prev for non-terminal. So we patch best_path[2] in.
713     int nz = 0;
714     int best_node = best_path[1];
715     n = best_path[0];
716     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
717 
718     for (; n >= first; --n) {
719       const Node* const node = &NODE(n, best_node);
720       const int j = kZigzag[n];
721       out[n] = node->sign ? -node->level : node->level;
722       nz |= node->level;
723       in[j] = out[n] * mtx->q_[j];
724       best_node = node->prev;
725     }
726     return (nz != 0);
727   }
728 }
729 
730 #undef NODE
731 
732 //------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

// Reconstructs an intra-16x16 luma macroblock for prediction mode 'mode'.
// Returns the non-zero bitmap: bits 0..15 flag the sixteen 4x4 AC sub-blocks,
// bit 24 flags the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform the 16 residual sub-blocks, two at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The DC coefficients are coded separately through a Walsh-Hadamard
  // transform; its non-zero flag goes into bit 24.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    // Trellis quantization needs the left/top non-zero context per sub-block.
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by the WHT block above
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
788 
// Reconstructs one 4x4 luma sub-block (index it->i4_) for intra4 mode 'mode':
// transform the residual, quantize, then inverse-transform and add the
// prediction back. Quantized coefficients are stored in 'levels'.
// Returns the block's non-zero flag.
static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int16_t coeffs[16];
  int non_zero;

  VP8FTransform(src, ref, coeffs);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // Trellis path: derive the coding context from the neighbors' nz flags.
    const int x = it->i4_ & 3;
    const int y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    non_zero = TrellisQuantizeBlock(enc, coeffs, levels, ctx, 3, &dqm->y1_,
                                    dqm->lambda_trellis_i4_);
  } else {
    non_zero = VP8EncQuantizeBlock(coeffs, levels, &dqm->y1_);
  }
  VP8ITransform(ref, coeffs, yuv_out, 0);
  return non_zero;
}
812 
// Reconstructs both 8x8 chroma planes for prediction mode 'mode' into
// 'yuv_out', storing the quantized coefficients in rd->uv_levels.
// Returns the non-zero flags shifted into bits 16..23.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int b;
  int16_t coeffs[8][16];

  // Forward-transform the 8 chroma sub-blocks, pairwise.
  for (b = 0; b < 8; b += 2) {
    VP8FTransform2(src + VP8ScanUV[b], ref + VP8ScanUV[b], coeffs[b]);
  }

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    b = 0;
    for (ch = 0; ch <= 2; ch += 2) {   // ch = 0: U plane, ch = 2: V plane
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, coeffs[b], rd->uv_levels[b], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << b;
          ++b;
        }
      }
    }
  } else {
    for (b = 0; b < 8; b += 2) {
      nz |= VP8EncQuantize2Blocks(coeffs[b], rd->uv_levels[b], &dqm->uv_) << b;
    }
  }

  // Inverse-transform and add the prediction to form the reconstruction.
  for (b = 0; b < 8; b += 2) {
    VP8ITransform(ref + VP8ScanUV[b], coeffs[b], yuv_out + VP8ScanUV[b], 1);
  }
  return (nz << 16);
}
851 
852 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
855 
// Records the largest of a few leading AC coefficients for this segment's
// macroblocks. The caller uses the recorded maximum to later adjust the
// minimal filtering strength for blocky (DC-only) macroblocks.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine what is the average
  // delta between each sub-4x4 block.
  // NOTE(review): the choice of sampled coefficients (DCs[1]/DCs[4]/DCs[5])
  // is a tuning decision and left unchanged here.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[4]);
  const int v2 = abs(DCs[5]);
  // Bug fix: previous code used '(v0 > v1) ? v1 : v0', which selects the
  // MINIMUM of v0/v1 and under-reports the edge strength. We want the maximum.
  int max_v = (v0 > v1) ? v0 : v1;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}
866 
// Exchanges the two VP8ModeScore pointers.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const saved = *b;
  *b = *a;
  *a = saved;
}
872 
// Exchanges the two byte-buffer pointers.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const saved = *b;
  *b = *a;
  *a = saved;
}
878 
// Makes the scratch reconstruction buffer (yuv_out2_) the current output by
// swapping it with yuv_out_.
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}
882 
IsFlat(const int16_t * levels,int num_blocks,score_t thresh)883 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
884   score_t score = 0;
885   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
886     int i;
887     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
888       score += (levels[i] != 0);
889       if (score > thresh) return 0;
890     }
891     levels += 16;
892   }
893   return 1;
894 }
895 
// Evaluates all intra-16x16 prediction modes by RD-score, stores the winner
// in *rd, and leaves its reconstruction in it->yuv_out_.
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;   // candidate score holder
  VP8ModeScore* rd_best = rd;       // best-so-far score holder
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // New best: swap the score holders and the output buffers so that the
      // winning reconstruction always ends up in it->yuv_out_.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {   // the winner landed in rd_tmp: copy it back to *rd
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
947 
948 //------------------------------------------------------------------------------
949 
950 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * const it,const uint8_t modes[16])951 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
952                                      const uint8_t modes[16]) {
953   const int preds_w = it->enc_->preds_w_;
954   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
955   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
956   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
957   return VP8FixedCostsI4[top][left];
958 }
959 
// Tries intra-4x4 coding for the whole macroblock and compares it against the
// intra-16x16 score already stored in *rd by PickBestIntra16().
// Returns 1 if intra4x4 wins (updating *rd with its modes/levels/score),
// 0 if intra16x16 remains the better choice.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;   // i4 mode disabled by configuration
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {   // loop over the 16 4x4 sub-blocks, in coding order
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check: skip the costly VP8GetCostLuma4() call if even the
      // partial score can't beat the current best.
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // keep the winning reconstruction by swapping scratch/output pointers
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // accumulated i4 score already worse than i16: bail out
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // mode-signaling budget exceeded: keep i16
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1052 
1053 //------------------------------------------------------------------------------
1054 
// Selects the best chroma prediction mode by RD-score and leaves the winning
// 16x8 reconstruction in it->yuv_out_; the chosen mode, levels, and score are
// accumulated into *rd.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore trial;

    // Reconstruct the chroma planes for this mode into the scratch buffer.
    trial.nz = ReconstructUV(it, &trial, tmp_dst, mode);

    // Evaluate the rate-distortion score.
    trial.D = VP8SSE16x8(src, tmp_dst);
    trial.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    trial.H = VP8FixedCostsUV[mode];
    trial.R = VP8GetCostUV(it, &trial);
    if (mode > 0 &&
        IsFlat(trial.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      // penalty to avoid flat areas being mispredicted by a complex mode
      trial.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &trial);
    if (mode == 0 || trial.score < rd_best.score) {
      // New best so far: keep its score, mode, levels and reconstruction.
      CopyScore(&rd_best, &trial);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, trial.uv_levels, sizeof(rd->uv_levels));
      SwapPtr(&dst, &tmp_dst);   // the winner now lives behind 'dst'
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
}
1097 
1098 //------------------------------------------------------------------------------
1099 // Final reconstruction and quantization.
1100 
// Final quantization/reconstruction pass for the modes already selected in
// it->mb_. Writes the reconstruction into it->yuv_out_ and stores the
// resulting non-zero bitmap in rd->nz.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  int nz = 0;

  if (it->mb_->type_ == 1) {   // intra16x16 macroblock
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {                     // intra4x4: handle the 16 sub-blocks in turn
    VP8IteratorStartI4(it);
    do {
      const int block = it->i4_;
      const int sub_mode =
          it->preds_[(block & 3) + (block >> 2) * enc->preds_w_];
      const uint8_t* const in = it->yuv_in_ + Y_OFF_ENC + VP8Scan[block];
      uint8_t* const out = it->yuv_out_ + Y_OFF_ENC + VP8Scan[block];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[block], in, out, sub_mode)
            << block;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
1124 
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// 'try_both_modes': if true, evaluate both i16 and i4 and pick the cheaper
// (by SSE plus a fixed i4 penalty); otherwise honor the pre-set mb type.
// 'refine_uv_mode': if true, also re-pick the chroma mode by distortion.
// The final reconstruction lands in it->yuv_out_; nz flags and the winning
// score are stored in *rd.
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  score_t score_i4 = (score_t)I4_PENALTY;   // fixed rate-penalty handicap
  int16_t tmp_levels[16][16];
  uint8_t modes_i4[16];
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {   // per 4x4 sub-block: pick the mode with the lowest SSE
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, modes_i4);
    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
    SwapOut(it);   // i4 reconstruction lives in yuv_out2_: make it current
    best_score = score_i4;
  } else {
    // i16 path: reconstruct now, directly into yuv_out_.
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref);
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
1216 
1217 //------------------------------------------------------------------------------
1218 // Entry point
1219 
// Entry point: decides the intra modes for the current macroblock,
// reconstructs/quantizes it, and returns whether the macroblock can be
// skipped (all residuals are zero).
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  const int method = it->enc_->method_;
  int is_skipped;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt <= RD_OPT_NONE) {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  } else {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  }

  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
1254