1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // This code is licensed under the same terms as WebM:
4 // Software License Agreement: http://www.webmproject.org/license/software/
5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/
6 // -----------------------------------------------------------------------------
7 //
8 // Speed-critical encoding functions.
9 //
10 // Author: Skal (pascal.massimino@gmail.com)
11
#include <stdlib.h>  // for abs()
#include <string.h>  // for memset(), memcpy()
#include "./dsp.h"
#include "../enc/vp8enci.h"
15
16 #if defined(__cplusplus) || defined(c_plusplus)
17 extern "C" {
18 #endif
19
20 //------------------------------------------------------------------------------
21 // Compute susceptibility based on DCT-coeff histograms:
22 // the higher, the "easier" the macroblock is to compress.
23
// Saturates 'alpha' to the [0, 255] range.
static int ClipAlpha(int alpha) {
  if (alpha < 0) return 0;
  if (alpha > 255) return 255;
  return alpha;
}
27
VP8GetAlpha(const int histo[MAX_COEFF_THRESH+1])28 int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) {
29 int num = 0, den = 0, val = 0;
30 int k;
31 int alpha;
32 // note: changing this loop to avoid the numerous "k + 1" slows things down.
33 for (k = 0; k < MAX_COEFF_THRESH; ++k) {
34 if (histo[k + 1]) {
35 val += histo[k + 1];
36 num += val * (k + 1);
37 den += (k + 1) * (k + 1);
38 }
39 }
40 // we scale the value to a usable [0..255] range
41 alpha = den ? 10 * num / den - 5 : 0;
42 return ClipAlpha(alpha);
43 }
44
45 const int VP8DspScan[16 + 4 + 4] = {
46 // Luma
47 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
48 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
49 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
50 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
51
52 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
53 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
54 };
55
CollectHistogram(const uint8_t * ref,const uint8_t * pred,int start_block,int end_block)56 static int CollectHistogram(const uint8_t* ref, const uint8_t* pred,
57 int start_block, int end_block) {
58 int histo[MAX_COEFF_THRESH + 1] = { 0 };
59 int16_t out[16];
60 int j, k;
61 for (j = start_block; j < end_block; ++j) {
62 VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
63
64 // Convert coefficients to bin (within out[]).
65 for (k = 0; k < 16; ++k) {
66 const int v = abs(out[k]) >> 2;
67 out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
68 }
69
70 // Use bin to update histogram.
71 for (k = 0; k < 16; ++k) {
72 histo[out[k]]++;
73 }
74 }
75
76 return VP8GetAlpha(histo);
77 }
78
79 //------------------------------------------------------------------------------
80 // run-time tables (~4k)
81
// clip1[255 + i] holds 'i' clipped to [0,255], for i in [-255,510].
static uint8_t clip1[255 + 510 + 1];

// Set to 1 only after clip1[] has been completely filled. It is declared
// 'volatile' so that the store is not moved ahead of the table writes.
// NOTE(review): 'volatile' alone is not a synchronization primitive in
// standard C; confirm how concurrent first-time calls are handled.
static volatile int tables_ok = 0;

// Fills clip1[] the first time it is called; later calls return immediately.
static void InitTables(void) {
  int v;
  if (tables_ok) return;
  for (v = -255; v <= 510; ++v) {
    uint8_t clipped;
    if (v < 0) {
      clipped = 0;
    } else if (v > 255) {
      clipped = 255;
    } else {
      clipped = (uint8_t)v;
    }
    clip1[v + 255] = clipped;
  }
  tables_ok = 1;   // must remain the last store
}
97
clip_8b(int v)98 static WEBP_INLINE uint8_t clip_8b(int v) {
99 return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255;
100 }
101
102 //------------------------------------------------------------------------------
103 // Transforms (Paragraph 14.4)
104
105 #define STORE(x, y, v) \
106 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
107
108 static const int kC1 = 20091 + (1 << 16);
109 static const int kC2 = 35468;
110 #define MUL(a, b) (((a) * (b)) >> 16)
111
ITransformOne(const uint8_t * ref,const int16_t * in,uint8_t * dst)112 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
113 uint8_t* dst) {
114 int C[4 * 4], *tmp;
115 int i;
116 tmp = C;
117 for (i = 0; i < 4; ++i) { // vertical pass
118 const int a = in[0] + in[8];
119 const int b = in[0] - in[8];
120 const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
121 const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
122 tmp[0] = a + d;
123 tmp[1] = b + c;
124 tmp[2] = b - c;
125 tmp[3] = a - d;
126 tmp += 4;
127 in++;
128 }
129
130 tmp = C;
131 for (i = 0; i < 4; ++i) { // horizontal pass
132 const int dc = tmp[0] + 4;
133 const int a = dc + tmp[8];
134 const int b = dc - tmp[8];
135 const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
136 const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
137 STORE(0, i, a + d);
138 STORE(1, i, b + c);
139 STORE(2, i, b - c);
140 STORE(3, i, a - d);
141 tmp++;
142 }
143 }
144
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block uses the next 16 coefficients and
// is located 4 samples to the right in 'ref' and 'dst'.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
152
FTransform(const uint8_t * src,const uint8_t * ref,int16_t * out)153 static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
154 int i;
155 int tmp[16];
156 for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
157 const int d0 = src[0] - ref[0];
158 const int d1 = src[1] - ref[1];
159 const int d2 = src[2] - ref[2];
160 const int d3 = src[3] - ref[3];
161 const int a0 = (d0 + d3) << 3;
162 const int a1 = (d1 + d2) << 3;
163 const int a2 = (d1 - d2) << 3;
164 const int a3 = (d0 - d3) << 3;
165 tmp[0 + i * 4] = (a0 + a1);
166 tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12;
167 tmp[2 + i * 4] = (a0 - a1);
168 tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12;
169 }
170 for (i = 0; i < 4; ++i) {
171 const int a0 = (tmp[0 + i] + tmp[12 + i]);
172 const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
173 const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
174 const int a3 = (tmp[0 + i] - tmp[12 + i]);
175 out[0 + i] = (a0 + a1 + 7) >> 4;
176 out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
177 out[8 + i] = (a0 - a1 + 7) >> 4;
178 out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
179 }
180 }
181
// Inverse Walsh-Hadamard transform of the 16 values in 'in'.
// The 16 results are scattered into 'out' with a stride of 16 entries
// (out[0], out[16], ..., out[240]); the other entries of 'out' are untouched.
static void ITransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int col, row;

  // First pass: combines in[col], in[col + 4], in[col + 8], in[col + 12].
  for (col = 0; col < 4; ++col) {
    const int sum03 = in[col + 0] + in[col + 12];
    const int sum12 = in[col + 4] + in[col + 8];
    const int dif12 = in[col + 4] - in[col + 8];
    const int dif03 = in[col + 0] - in[col + 12];
    tmp[col + 0]  = sum03 + sum12;
    tmp[col + 4]  = dif03 + dif12;
    tmp[col + 8]  = sum03 - sum12;
    tmp[col + 12] = dif03 - dif12;
  }

  // Second pass: each row of 'tmp' yields four outputs, 16 entries apart;
  // consecutive rows are 64 entries apart.
  for (row = 0; row < 4; ++row) {
    const int* const t = tmp + 4 * row;
    int16_t* const dst = out + 64 * row;
    const int dc = t[0] + 3;    // w/ rounder
    const int sum03 = dc + t[3];
    const int sum12 = t[1] + t[2];
    const int dif12 = t[1] - t[2];
    const int dif03 = dc - t[3];
    dst[ 0] = (sum03 + sum12) >> 3;
    dst[16] = (dif03 + dif12) >> 3;
    dst[32] = (sum03 - sum12) >> 3;
    dst[48] = (dif03 - dif12) >> 3;
  }
}
208
// Forward Walsh-Hadamard transform.
// Reads 16 input values spaced 16 entries apart in 'in' (in[0], in[16], ...,
// in[240]) and writes the 16 transformed values contiguously to 'out'.
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  // First pass: each iteration consumes four inputs and moves 64 entries on.
  for (i = 0; i < 4; ++i, in += 64) {
    // Scale by 4 with a multiplication rather than '<< 2': the operands can
    // be negative, and left-shifting a negative value is undefined in C.
    const int a0 = (in[0 * 16] + in[2 * 16]) * 4;
    const int a1 = (in[1 * 16] + in[3 * 16]) * 4;
    const int a2 = (in[1 * 16] - in[3 * 16]) * 4;
    const int a3 = (in[0 * 16] - in[2 * 16]) * 4;
    tmp[0 + i * 4] = (a0 + a1) + (a0 != 0);
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second pass, one column of 'tmp' per iteration.
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[8 + i]);
    const int a1 = (tmp[4 + i] + tmp[12+ i]);
    const int a2 = (tmp[4 + i] - tmp[12+ i]);
    const int a3 = (tmp[0 + i] - tmp[8 + i]);
    const int b0 = a0 + a1;
    const int b1 = a3 + a2;
    const int b2 = a3 - a2;
    const int b3 = a0 - a1;
    // Final descaling by 8; positive values get an extra +1 before the shift.
    out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3;
    out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3;
    out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3;
    out[12 + i] = (b3 + (b3 > 0) + 3) >> 3;
  }
}
237
238 #undef MUL
239 #undef STORE
240
241 //------------------------------------------------------------------------------
242 // Intra predictions
243
244 #define DST(x, y) dst[(x) + (y) * BPS]
245
Fill(uint8_t * dst,int value,int size)246 static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
247 int j;
248 for (j = 0; j < size; ++j) {
249 memset(dst + j * BPS, value, size);
250 }
251 }
252
VerticalPred(uint8_t * dst,const uint8_t * top,int size)253 static WEBP_INLINE void VerticalPred(uint8_t* dst,
254 const uint8_t* top, int size) {
255 int j;
256 if (top) {
257 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
258 } else {
259 Fill(dst, 127, size);
260 }
261 }
262
HorizontalPred(uint8_t * dst,const uint8_t * left,int size)263 static WEBP_INLINE void HorizontalPred(uint8_t* dst,
264 const uint8_t* left, int size) {
265 if (left) {
266 int j;
267 for (j = 0; j < size; ++j) {
268 memset(dst + j * BPS, left[j], size);
269 }
270 } else {
271 Fill(dst, 129, size);
272 }
273 }
274
TrueMotion(uint8_t * dst,const uint8_t * left,const uint8_t * top,int size)275 static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
276 const uint8_t* top, int size) {
277 int y;
278 if (left) {
279 if (top) {
280 const uint8_t* const clip = clip1 + 255 - left[-1];
281 for (y = 0; y < size; ++y) {
282 const uint8_t* const clip_table = clip + left[y];
283 int x;
284 for (x = 0; x < size; ++x) {
285 dst[x] = clip_table[top[x]];
286 }
287 dst += BPS;
288 }
289 } else {
290 HorizontalPred(dst, left, size);
291 }
292 } else {
293 // true motion without left samples (hence: with default 129 value)
294 // is equivalent to VE prediction where you just copy the top samples.
295 // Note that if top samples are not available, the default value is
296 // then 129, and not 127 as in the VerticalPred case.
297 if (top) {
298 VerticalPred(dst, top, size);
299 } else {
300 Fill(dst, 129, size);
301 }
302 }
303 }
304
DCMode(uint8_t * dst,const uint8_t * left,const uint8_t * top,int size,int round,int shift)305 static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
306 const uint8_t* top,
307 int size, int round, int shift) {
308 int DC = 0;
309 int j;
310 if (top) {
311 for (j = 0; j < size; ++j) DC += top[j];
312 if (left) { // top and left present
313 for (j = 0; j < size; ++j) DC += left[j];
314 } else { // top, but no left
315 DC += DC;
316 }
317 DC = (DC + round) >> shift;
318 } else if (left) { // left but no top
319 for (j = 0; j < size; ++j) DC += left[j];
320 DC += DC;
321 DC = (DC + round) >> shift;
322 } else { // no top, no left, nothing.
323 DC = 0x80;
324 }
325 Fill(dst, DC, size);
326 }
327
328 //------------------------------------------------------------------------------
329 // Chroma 8x8 prediction (paragraph 12.2)
330
IntraChromaPreds(uint8_t * dst,const uint8_t * left,const uint8_t * top)331 static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
332 const uint8_t* top) {
333 // U block
334 DCMode(C8DC8 + dst, left, top, 8, 8, 4);
335 VerticalPred(C8VE8 + dst, top, 8);
336 HorizontalPred(C8HE8 + dst, left, 8);
337 TrueMotion(C8TM8 + dst, left, top, 8);
338 // V block
339 dst += 8;
340 if (top) top += 8;
341 if (left) left += 16;
342 DCMode(C8DC8 + dst, left, top, 8, 8, 4);
343 VerticalPred(C8VE8 + dst, top, 8);
344 HorizontalPred(C8HE8 + dst, left, 8);
345 TrueMotion(C8TM8 + dst, left, top, 8);
346 }
347
348 //------------------------------------------------------------------------------
349 // luma 16x16 prediction (paragraph 12.3)
350
Intra16Preds(uint8_t * dst,const uint8_t * left,const uint8_t * top)351 static void Intra16Preds(uint8_t* dst,
352 const uint8_t* left, const uint8_t* top) {
353 DCMode(I16DC16 + dst, left, top, 16, 16, 5);
354 VerticalPred(I16VE16 + dst, top, 16);
355 HorizontalPred(I16HE16 + dst, left, 16);
356 TrueMotion(I16TM16 + dst, left, top, 16);
357 }
358
359 //------------------------------------------------------------------------------
360 // luma 4x4 prediction
361
362 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
363 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
364
VE4(uint8_t * dst,const uint8_t * top)365 static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
366 const uint8_t vals[4] = {
367 AVG3(top[-1], top[0], top[1]),
368 AVG3(top[ 0], top[1], top[2]),
369 AVG3(top[ 1], top[2], top[3]),
370 AVG3(top[ 2], top[3], top[4])
371 };
372 int i;
373 for (i = 0; i < 4; ++i) {
374 memcpy(dst + i * BPS, vals, 4);
375 }
376 }
377
HE4(uint8_t * dst,const uint8_t * top)378 static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
379 const int X = top[-1];
380 const int I = top[-2];
381 const int J = top[-3];
382 const int K = top[-4];
383 const int L = top[-5];
384 *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
385 *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
386 *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
387 *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
388 }
389
// DC 4x4 prediction: fills the block with the rounded average of the eight
// samples top[0..3] and top[-5..-2].
static void DC4(uint8_t* dst, const uint8_t* top) {
  uint32_t sum = 4;   // rounding term for the '>> 3' below
  int i;
  for (i = 0; i < 4; ++i) sum += top[i];
  for (i = -5; i < -1; ++i) sum += top[i];
  Fill(dst, sum >> 3, 4);
}
396
RD4(uint8_t * dst,const uint8_t * top)397 static void RD4(uint8_t* dst, const uint8_t* top) {
398 const int X = top[-1];
399 const int I = top[-2];
400 const int J = top[-3];
401 const int K = top[-4];
402 const int L = top[-5];
403 const int A = top[0];
404 const int B = top[1];
405 const int C = top[2];
406 const int D = top[3];
407 DST(0, 3) = AVG3(J, K, L);
408 DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
409 DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
410 DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
411 DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
412 DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
413 DST(3, 0) = AVG3(D, C, B);
414 }
415
LD4(uint8_t * dst,const uint8_t * top)416 static void LD4(uint8_t* dst, const uint8_t* top) {
417 const int A = top[0];
418 const int B = top[1];
419 const int C = top[2];
420 const int D = top[3];
421 const int E = top[4];
422 const int F = top[5];
423 const int G = top[6];
424 const int H = top[7];
425 DST(0, 0) = AVG3(A, B, C);
426 DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
427 DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
428 DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
429 DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
430 DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
431 DST(3, 3) = AVG3(G, H, H);
432 }
433
VR4(uint8_t * dst,const uint8_t * top)434 static void VR4(uint8_t* dst, const uint8_t* top) {
435 const int X = top[-1];
436 const int I = top[-2];
437 const int J = top[-3];
438 const int K = top[-4];
439 const int A = top[0];
440 const int B = top[1];
441 const int C = top[2];
442 const int D = top[3];
443 DST(0, 0) = DST(1, 2) = AVG2(X, A);
444 DST(1, 0) = DST(2, 2) = AVG2(A, B);
445 DST(2, 0) = DST(3, 2) = AVG2(B, C);
446 DST(3, 0) = AVG2(C, D);
447
448 DST(0, 3) = AVG3(K, J, I);
449 DST(0, 2) = AVG3(J, I, X);
450 DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
451 DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
452 DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
453 DST(3, 1) = AVG3(B, C, D);
454 }
455
VL4(uint8_t * dst,const uint8_t * top)456 static void VL4(uint8_t* dst, const uint8_t* top) {
457 const int A = top[0];
458 const int B = top[1];
459 const int C = top[2];
460 const int D = top[3];
461 const int E = top[4];
462 const int F = top[5];
463 const int G = top[6];
464 const int H = top[7];
465 DST(0, 0) = AVG2(A, B);
466 DST(1, 0) = DST(0, 2) = AVG2(B, C);
467 DST(2, 0) = DST(1, 2) = AVG2(C, D);
468 DST(3, 0) = DST(2, 2) = AVG2(D, E);
469
470 DST(0, 1) = AVG3(A, B, C);
471 DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
472 DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
473 DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
474 DST(3, 2) = AVG3(E, F, G);
475 DST(3, 3) = AVG3(F, G, H);
476 }
477
HU4(uint8_t * dst,const uint8_t * top)478 static void HU4(uint8_t* dst, const uint8_t* top) {
479 const int I = top[-2];
480 const int J = top[-3];
481 const int K = top[-4];
482 const int L = top[-5];
483 DST(0, 0) = AVG2(I, J);
484 DST(2, 0) = DST(0, 1) = AVG2(J, K);
485 DST(2, 1) = DST(0, 2) = AVG2(K, L);
486 DST(1, 0) = AVG3(I, J, K);
487 DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
488 DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
489 DST(3, 2) = DST(2, 2) =
490 DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
491 }
492
HD4(uint8_t * dst,const uint8_t * top)493 static void HD4(uint8_t* dst, const uint8_t* top) {
494 const int X = top[-1];
495 const int I = top[-2];
496 const int J = top[-3];
497 const int K = top[-4];
498 const int L = top[-5];
499 const int A = top[0];
500 const int B = top[1];
501 const int C = top[2];
502
503 DST(0, 0) = DST(2, 1) = AVG2(I, X);
504 DST(0, 1) = DST(2, 2) = AVG2(J, I);
505 DST(0, 2) = DST(2, 3) = AVG2(K, J);
506 DST(0, 3) = AVG2(L, K);
507
508 DST(3, 0) = AVG3(A, B, C);
509 DST(2, 0) = AVG3(X, A, B);
510 DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
511 DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
512 DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
513 DST(1, 3) = AVG3(L, K, J);
514 }
515
TM4(uint8_t * dst,const uint8_t * top)516 static void TM4(uint8_t* dst, const uint8_t* top) {
517 int x, y;
518 const uint8_t* const clip = clip1 + 255 - top[-1];
519 for (y = 0; y < 4; ++y) {
520 const uint8_t* const clip_table = clip + top[-2 - y];
521 for (x = 0; x < 4; ++x) {
522 dst[x] = clip_table[top[x]];
523 }
524 dst += BPS;
525 }
526 }
527
528 #undef DST
529 #undef AVG3
530 #undef AVG2
531
532 // Left samples are top[-5 .. -2], top_left is top[-1], top are
533 // located at top[0..3], and top right is top[4..7]
Intra4Preds(uint8_t * dst,const uint8_t * top)534 static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
535 DC4(I4DC4 + dst, top);
536 TM4(I4TM4 + dst, top);
537 VE4(I4VE4 + dst, top);
538 HE4(I4HE4 + dst, top);
539 RD4(I4RD4 + dst, top);
540 VR4(I4VR4 + dst, top);
541 LD4(I4LD4 + dst, top);
542 VL4(I4VL4 + dst, top);
543 HD4(I4HD4 + dst, top);
544 HU4(I4HU4 + dst, top);
545 }
546
547 //------------------------------------------------------------------------------
548 // Metric
549
GetSSE(const uint8_t * a,const uint8_t * b,int w,int h)550 static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
551 int w, int h) {
552 int count = 0;
553 int y, x;
554 for (y = 0; y < h; ++y) {
555 for (x = 0; x < w; ++x) {
556 const int diff = (int)a[x] - b[x];
557 count += diff * diff;
558 }
559 a += BPS;
560 b += BPS;
561 }
562 return count;
563 }
564
// Sum of squared differences over a 16 (wide) x 16 (high) area.
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  const int width = 16;
  const int height = 16;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over a 16 (wide) x 8 (high) area.
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  const int width = 16;
  const int height = 8;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over an 8 (wide) x 8 (high) area.
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const int width = 8;
  const int height = 8;
  return GetSSE(a, b, width, height);
}
// Sum of squared differences over a 4 (wide) x 4 (high) area.
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const int width = 4;
  const int height = 4;
  return GetSSE(a, b, width, height);
}
577
578 //------------------------------------------------------------------------------
579 // Texture distortion
580 //
581 // We try to match the spectral content (weighted) between source and
582 // reconstructed samples.
583
584 // Hadamard transform
585 // Returns the weighted sum of the absolute value of transformed coefficients.
TTransform(const uint8_t * in,const uint16_t * w)586 static int TTransform(const uint8_t* in, const uint16_t* w) {
587 int sum = 0;
588 int tmp[16];
589 int i;
590 // horizontal pass
591 for (i = 0; i < 4; ++i, in += BPS) {
592 const int a0 = (in[0] + in[2]) << 2;
593 const int a1 = (in[1] + in[3]) << 2;
594 const int a2 = (in[1] - in[3]) << 2;
595 const int a3 = (in[0] - in[2]) << 2;
596 tmp[0 + i * 4] = a0 + a1 + (a0 != 0);
597 tmp[1 + i * 4] = a3 + a2;
598 tmp[2 + i * 4] = a3 - a2;
599 tmp[3 + i * 4] = a0 - a1;
600 }
601 // vertical pass
602 for (i = 0; i < 4; ++i, ++w) {
603 const int a0 = (tmp[0 + i] + tmp[8 + i]);
604 const int a1 = (tmp[4 + i] + tmp[12+ i]);
605 const int a2 = (tmp[4 + i] - tmp[12+ i]);
606 const int a3 = (tmp[0 + i] - tmp[8 + i]);
607 const int b0 = a0 + a1;
608 const int b1 = a3 + a2;
609 const int b2 = a3 - a2;
610 const int b3 = a0 - a1;
611 // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
612 sum += w[ 0] * ((abs(b0) + 3) >> 3);
613 sum += w[ 4] * ((abs(b1) + 3) >> 3);
614 sum += w[ 8] * ((abs(b2) + 3) >> 3);
615 sum += w[12] * ((abs(b3) + 3) >> 3);
616 }
617 return sum;
618 }
619
// Texture distortion between the 4x4 blocks 'a' and 'b': the absolute
// difference of their weighted TTransform() sums, divided by 16 with
// rounding.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  const int energy_a = TTransform(a, w);
  const int energy_b = TTransform(b, w);
  const int delta = abs(energy_b - energy_a);
  return (delta + 8) >> 4;
}
626
Disto16x16(const uint8_t * const a,const uint8_t * const b,const uint16_t * const w)627 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
628 const uint16_t* const w) {
629 int D = 0;
630 int x, y;
631 for (y = 0; y < 16 * BPS; y += 4 * BPS) {
632 for (x = 0; x < 16; x += 4) {
633 D += Disto4x4(a + x + y, b + x + y, w);
634 }
635 }
636 return D;
637 }
638
639 //------------------------------------------------------------------------------
640 // Quantization
641 //
642
// kZigzag[n] is the position, within a 4x4 coefficient block stored in
// raster order, of the n-th coefficient in scan order.
static const uint8_t kZigzag[16] = {
   0,  1,  4,  8,
   5,  2,  3,  6,
   9, 12, 13, 10,
   7, 11, 14, 15
};
646
647 // Simple quantization
QuantizeBlock(int16_t in[16],int16_t out[16],int n,const VP8Matrix * const mtx)648 static int QuantizeBlock(int16_t in[16], int16_t out[16],
649 int n, const VP8Matrix* const mtx) {
650 int last = -1;
651 for (; n < 16; ++n) {
652 const int j = kZigzag[n];
653 const int sign = (in[j] < 0);
654 int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
655 if (coeff > 2047) coeff = 2047;
656 if (coeff > mtx->zthresh_[j]) {
657 const int Q = mtx->q_[j];
658 const int iQ = mtx->iq_[j];
659 const int B = mtx->bias_[j];
660 out[n] = QUANTDIV(coeff, iQ, B);
661 if (sign) out[n] = -out[n];
662 in[j] = out[n] * Q;
663 if (out[n]) last = n;
664 } else {
665 out[n] = 0;
666 in[j] = 0;
667 }
668 }
669 return (last >= 0);
670 }
671
672 //------------------------------------------------------------------------------
673 // Block copy
674
Copy(const uint8_t * src,uint8_t * dst,int size)675 static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
676 int y;
677 for (y = 0; y < size; ++y) {
678 memcpy(dst, src, size);
679 src += BPS;
680 dst += BPS;
681 }
682 }
683
// Copies one 4x4 block of samples from 'src' to 'dst'.
static void Copy4x4(const uint8_t* src, uint8_t* dst) {
  Copy(src, dst, 4);
}
685
686 //------------------------------------------------------------------------------
687 // Initialization
688
689 // Speed-critical function pointers. We have to initialize them to the default
690 // implementations within VP8EncDspInit().
691 VP8CHisto VP8CollectHistogram;
692 VP8Idct VP8ITransform;
693 VP8Fdct VP8FTransform;
694 VP8WHT VP8ITransformWHT;
695 VP8WHT VP8FTransformWHT;
696 VP8Intra4Preds VP8EncPredLuma4;
697 VP8IntraPreds VP8EncPredLuma16;
698 VP8IntraPreds VP8EncPredChroma8;
699 VP8Metric VP8SSE16x16;
700 VP8Metric VP8SSE8x8;
701 VP8Metric VP8SSE16x8;
702 VP8Metric VP8SSE4x4;
703 VP8WMetric VP8TDisto4x4;
704 VP8WMetric VP8TDisto16x16;
705 VP8QuantizeBlock VP8EncQuantizeBlock;
706 VP8BlockCopy VP8Copy4x4;
707
708 extern void VP8EncDspInitSSE2(void);
709
VP8EncDspInit(void)710 void VP8EncDspInit(void) {
711 InitTables();
712
713 // default C implementations
714 VP8CollectHistogram = CollectHistogram;
715 VP8ITransform = ITransform;
716 VP8FTransform = FTransform;
717 VP8ITransformWHT = ITransformWHT;
718 VP8FTransformWHT = FTransformWHT;
719 VP8EncPredLuma4 = Intra4Preds;
720 VP8EncPredLuma16 = Intra16Preds;
721 VP8EncPredChroma8 = IntraChromaPreds;
722 VP8SSE16x16 = SSE16x16;
723 VP8SSE8x8 = SSE8x8;
724 VP8SSE16x8 = SSE16x8;
725 VP8SSE4x4 = SSE4x4;
726 VP8TDisto4x4 = Disto4x4;
727 VP8TDisto16x16 = Disto16x16;
728 VP8EncQuantizeBlock = QuantizeBlock;
729 VP8Copy4x4 = Copy4x4;
730
731 // If defined, use CPUInfo() to overwrite some pointers with faster versions.
732 if (VP8GetCPUInfo) {
733 #if defined(WEBP_USE_SSE2)
734 if (VP8GetCPUInfo(kSSE2)) {
735 VP8EncDspInitSSE2();
736 }
737 #endif
738 }
739 }
740
741 #if defined(__cplusplus) || defined(c_plusplus)
742 } // extern "C"
743 #endif
744