1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Speed-critical encoding functions.
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13
14 #include <assert.h>
15 #include <stdlib.h> // for abs()
16
17 #include "src/dsp/dsp.h"
18 #include "src/enc/vp8i_enc.h"
19
// Saturates 'v' to the unsigned 8-bit range: values below 0 become 0,
// values above 255 become 255, everything else is returned unchanged.
static WEBP_INLINE uint8_t clip_8b(int v) {
  if ((v & ~0xff) == 0) {
    return (uint8_t)v;   // no bit set outside the low byte: already in range
  }
  return (v < 0) ? 0 : 255;
}
23
24 #if !WEBP_NEON_OMIT_C_CODE
// Upper clamp only: returns 'max' when 'v' exceeds it, 'v' otherwise.
// There is no lower bound, so negative values pass through unchanged.
static WEBP_INLINE int clip_max(int v, int max) {
  if (v > max) {
    return max;
  }
  return v;
}
28 #endif // !WEBP_NEON_OMIT_C_CODE
29
30 //------------------------------------------------------------------------------
31 // Compute susceptibility based on DCT-coeff histograms:
32 // the higher, the "easier" the macroblock is to compress.
33
// Offset of the top-left sample of each 4x4 block inside a work buffer whose
// row stride is BPS. Entries 0..15 are the sixteen luma blocks in raster
// order (x and y stepping by 4), entries 16..19 the four U blocks (columns
// 0..7) and entries 20..23 the four V blocks (columns 8..15, same rows as U).
// CollectHistogram_C() indexes this table by block number.
const int VP8DspScan[16 + 4 + 4] = {
  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS   // V
};
44
45 // general-purpose util function
// Summarizes a histogram of MAX_COEFF_THRESH + 1 bins into 'histo':
//  - histo->max_value:     the largest positive bin count (0 if there is none)
//  - histo->last_non_zero: the index of the highest bin with a positive count;
//                          stays at 1 when no bin has a positive count.
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
                         VP8Histogram* const histo) {
  int peak = 0;
  int last = 1;
  int bin;
  for (bin = 0; bin <= MAX_COEFF_THRESH; ++bin) {
    const int count = distribution[bin];
    if (count <= 0) continue;   // empty bins contribute nothing
    last = bin;
    if (peak < count) peak = count;
  }
  histo->max_value = peak;
  histo->last_non_zero = last;
}
60
61 #if !WEBP_NEON_OMIT_C_CODE
// Builds a histogram of transformed-residual magnitudes for the 4x4 blocks
// numbered [start_block, end_block) and stores its summary in 'histo'.
// For each block, VP8FTransform() is applied to 'ref' and 'pred' at the offset
// given by VP8DspScan[]; each of the 16 outputs is binned as |coeff| >> 3,
// with the bin index capped at MAX_COEFF_THRESH.
static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
                               VP8Histogram* WEBP_RESTRICT const histo) {
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  int block;
  for (block = start_block; block < end_block; ++block) {
    const int offset = VP8DspScan[block];
    int16_t coeffs[16];
    int n;

    VP8FTransform(ref + offset, pred + offset, coeffs);

    // Convert coefficients to bin.
    for (n = 0; n < 16; ++n) {
      const int bin = clip_max(abs(coeffs[n]) >> 3, MAX_COEFF_THRESH);
      distribution[bin] += 1;
    }
  }
  VP8SetHistogramData(distribution, histo);
}
83 #endif // !WEBP_NEON_OMIT_C_CODE
84
85 //------------------------------------------------------------------------------
// run-time tables (clip1[]: 766 bytes)
87
// Saturation lookup table: InitTables() stores clip_8b(i) at index [255 + i]
// for every i in [-255, 510].
static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]

// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
// NOTE(review): 'volatile' by itself is not a synchronization primitive in
// ISO C; presumably the caller serializes calls to InitTables() -- confirm.
static volatile int tables_ok = 0;
93
InitTables(void)94 static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
95 if (!tables_ok) {
96 int i;
97 for (i = -255; i <= 255 + 255; ++i) {
98 clip1[255 + i] = clip_8b(i);
99 }
100 tables_ok = 1;
101 }
102 }
103
104
105 //------------------------------------------------------------------------------
106 // Transforms (Paragraph 14.4)
107
108 #if !WEBP_NEON_OMIT_C_CODE
109
110 #define STORE(x, y, v) \
111 dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
112
ITransformOne(const uint8_t * WEBP_RESTRICT ref,const int16_t * WEBP_RESTRICT in,uint8_t * WEBP_RESTRICT dst)113 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
114 const int16_t* WEBP_RESTRICT in,
115 uint8_t* WEBP_RESTRICT dst) {
116 int C[4 * 4], *tmp;
117 int i;
118 tmp = C;
119 for (i = 0; i < 4; ++i) { // vertical pass
120 const int a = in[0] + in[8];
121 const int b = in[0] - in[8];
122 const int c =
123 WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
124 const int d =
125 WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
126 tmp[0] = a + d;
127 tmp[1] = b + c;
128 tmp[2] = b - c;
129 tmp[3] = a - d;
130 tmp += 4;
131 in++;
132 }
133
134 tmp = C;
135 for (i = 0; i < 4; ++i) { // horizontal pass
136 const int dc = tmp[0] + 4;
137 const int a = dc + tmp[8];
138 const int b = dc - tmp[8];
139 const int c =
140 WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
141 const int d =
142 WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
143 STORE(0, i, a + d);
144 STORE(1, i, b + c);
145 STORE(2, i, b - c);
146 STORE(3, i, a - d);
147 tmp++;
148 }
149 }
150
// Reconstructs one 4x4 block with ITransformOne(). When 'do_two' is non-zero
// a second block is also reconstructed: 4 samples to the right in 'ref' and
// 'dst', using the next 16 coefficients of 'in'.
static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
                         const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
160
// Forward transform of the 4x4 difference block (src - ref).
// 'src' and 'ref' are read with a row stride of BPS; the 16 resulting
// coefficients are written to out[0..15].
static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
                         const uint8_t* WEBP_RESTRICT ref,
                         int16_t* WEBP_RESTRICT out) {
  int tmp[16];
  int row, col;
  // First pass: one iteration per row of the difference block.
  for (row = 0; row < 4; ++row) {
    const uint8_t* const s = src + row * BPS;
    const uint8_t* const r = ref + row * BPS;
    int* const t = tmp + 4 * row;
    const int d0 = s[0] - r[0];   // 9bit dynamic range ([-255,255])
    const int d1 = s[1] - r[1];
    const int d2 = s[2] - r[2];
    const int d3 = s[3] - r[3];
    const int sum03 = d0 + d3;    // 10b [-510,510]
    const int sum12 = d1 + d2;
    const int dif12 = d1 - d2;
    const int dif03 = d0 - d3;
    t[0] = (sum03 + sum12) * 8;   // 14b [-8160,8160]
    t[1] = (dif12 * 2217 + dif03 * 5352 + 1812) >> 9;   // [-7536,7542]
    t[2] = (sum03 - sum12) * 8;
    t[3] = (dif03 * 2217 - dif12 * 5352 + 937) >> 9;
  }
  // Second pass: one iteration per column of the intermediate result.
  for (col = 0; col < 4; ++col) {
    const int t0 = tmp[0 + col];
    const int t1 = tmp[4 + col];
    const int t2 = tmp[8 + col];
    const int t3 = tmp[12 + col];
    const int sum03 = t0 + t3;    // 15b
    const int sum12 = t1 + t2;
    const int dif12 = t1 - t2;
    const int dif03 = t0 - t3;
    out[0 + col] = (sum03 + sum12 + 7) >> 4;   // 12b
    out[4 + col] =
        ((dif12 * 2217 + dif03 * 5352 + 12000) >> 16) + (dif03 != 0);
    out[8 + col] = (sum03 - sum12 + 7) >> 4;
    out[12 + col] = ((dif03 * 2217 - dif12 * 5352 + 51000) >> 16);
  }
}
191 #endif // !WEBP_NEON_OMIT_C_CODE
192
// Forward-transforms two side-by-side 4x4 blocks through the VP8FTransform
// function pointer: the second block sits 4 samples to the right in 'src' and
// 'ref', and its coefficients are stored at out[16..31].
static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
                          const uint8_t* WEBP_RESTRICT ref,
                          int16_t* WEBP_RESTRICT out) {
  int n;
  for (n = 0; n < 2; ++n) {
    VP8FTransform(src + 4 * n, ref + 4 * n, out + 16 * n);
  }
}
199
200 #if !WEBP_NEON_OMIT_C_CODE
// 4x4 Walsh-Hadamard-style transform of 16 values gathered from 'in'.
// Only every 16th element of 'in' is read (in[0], in[16], ..., in[240]);
// the 16 results, each halved with '>> 1', are written to out[0..15].
static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int row, col;
  for (row = 0; row < 4; ++row) {
    const int16_t* const src = in + 64 * row;   // 4 inputs, 16 apart, per row
    const int s02 = src[0 * 16] + src[2 * 16];  // 13b
    const int s13 = src[1 * 16] + src[3 * 16];
    const int d13 = src[1 * 16] - src[3 * 16];
    const int d02 = src[0 * 16] - src[2 * 16];
    tmp[4 * row + 0] = s02 + s13;               // 14b
    tmp[4 * row + 1] = d02 + d13;
    tmp[4 * row + 2] = d02 - d13;
    tmp[4 * row + 3] = s02 - s13;
  }
  for (col = 0; col < 4; ++col) {
    const int s02 = tmp[0 + col] + tmp[8 + col];    // 15b
    const int s13 = tmp[4 + col] + tmp[12 + col];
    const int d13 = tmp[4 + col] - tmp[12 + col];
    const int d02 = tmp[0 + col] - tmp[8 + col];
    out[0 + col] = (s02 + s13) >> 1;                // 16b sum, 15b result
    out[4 + col] = (d02 + d13) >> 1;
    out[8 + col] = (d02 - d13) >> 1;
    out[12 + col] = (s02 - s13) >> 1;
  }
}
231 #endif // !WEBP_NEON_OMIT_C_CODE
232
233 #undef STORE
234
235 //------------------------------------------------------------------------------
236 // Intra predictions
237
// Sets every sample of the size x size square starting at 'dst' to 'value'.
// Consecutive rows are BPS bytes apart.
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int row = 0;
  while (row < size) {
    memset(&dst[row * BPS], value, size);
    ++row;
  }
}
244
VerticalPred(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top,int size)245 static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
246 const uint8_t* WEBP_RESTRICT top,
247 int size) {
248 int j;
249 if (top != NULL) {
250 for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
251 } else {
252 Fill(dst, 127, size);
253 }
254 }
255
HorizontalPred(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,int size)256 static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
257 const uint8_t* WEBP_RESTRICT left,
258 int size) {
259 if (left != NULL) {
260 int j;
261 for (j = 0; j < size; ++j) {
262 memset(dst + j * BPS, left[j], size);
263 }
264 } else {
265 Fill(dst, 129, size);
266 }
267 }
268
TrueMotion(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top,int size)269 static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
270 const uint8_t* WEBP_RESTRICT left,
271 const uint8_t* WEBP_RESTRICT top, int size) {
272 int y;
273 if (left != NULL) {
274 if (top != NULL) {
275 const uint8_t* const clip = clip1 + 255 - left[-1];
276 for (y = 0; y < size; ++y) {
277 const uint8_t* const clip_table = clip + left[y];
278 int x;
279 for (x = 0; x < size; ++x) {
280 dst[x] = clip_table[top[x]];
281 }
282 dst += BPS;
283 }
284 } else {
285 HorizontalPred(dst, left, size);
286 }
287 } else {
288 // true motion without left samples (hence: with default 129 value)
289 // is equivalent to VE prediction where you just copy the top samples.
290 // Note that if top samples are not available, the default value is
291 // then 129, and not 127 as in the VerticalPred case.
292 if (top != NULL) {
293 VerticalPred(dst, top, size);
294 } else {
295 Fill(dst, 129, size);
296 }
297 }
298 }
299
DCMode(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top,int size,int round,int shift)300 static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
301 const uint8_t* WEBP_RESTRICT left,
302 const uint8_t* WEBP_RESTRICT top,
303 int size, int round, int shift) {
304 int DC = 0;
305 int j;
306 if (top != NULL) {
307 for (j = 0; j < size; ++j) DC += top[j];
308 if (left != NULL) { // top and left present
309 for (j = 0; j < size; ++j) DC += left[j];
310 } else { // top, but no left
311 DC += DC;
312 }
313 DC = (DC + round) >> shift;
314 } else if (left != NULL) { // left but no top
315 for (j = 0; j < size; ++j) DC += left[j];
316 DC += DC;
317 DC = (DC + round) >> shift;
318 } else { // no top, no left, nothing.
319 DC = 0x80;
320 }
321 Fill(dst, DC, size);
322 }
323
324 //------------------------------------------------------------------------------
325 // Chroma 8x8 prediction (paragraph 12.2)
326
// Generates the four 8x8 chroma predictions (DC, vertical, horizontal and
// true-motion) for the U plane and then for the V plane. Relative to U, the
// V data is located 8 samples further in 'dst' and 'top' and 16 samples
// further in 'left'. NULL 'top' / 'left' pointers are passed through as NULL.
static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top) {
  int plane;
  for (plane = 0; plane < 2; ++plane) {   // 0: U block, 1: V block
    uint8_t* const out = dst + 8 * plane;
    const uint8_t* const t = (top != NULL) ? top + 8 * plane : NULL;
    const uint8_t* const l = (left != NULL) ? left + 16 * plane : NULL;
    DCMode(C8DC8 + out, l, t, 8, 8, 4);
    VerticalPred(C8VE8 + out, t, 8);
    HorizontalPred(C8HE8 + out, l, 8);
    TrueMotion(C8TM8 + out, l, t, 8);
  }
}
344
345 //------------------------------------------------------------------------------
346 // luma 16x16 prediction (paragraph 12.3)
347
348 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
// Generates the four 16x16 luma predictions (DC, vertical, horizontal and
// true-motion), each written at its own I16xx16 offset from 'dst'.
static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
                           const uint8_t* WEBP_RESTRICT left,
                           const uint8_t* WEBP_RESTRICT top) {
  const int size = 16;
  DCMode(dst + I16DC16, left, top, size, 16, 5);
  VerticalPred(dst + I16VE16, top, size);
  HorizontalPred(dst + I16HE16, left, size);
  TrueMotion(dst + I16TM16, left, top, size);
}
357 #endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
358
359 //------------------------------------------------------------------------------
360 // luma 4x4 prediction
361
362 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
363
// Sample at column x, row y of the block being written (row stride BPS).
// Expects a pointer named 'dst' to be in scope.
#define DST(x, y) dst[(x) + (y) * BPS]
// Weighted 3-tap average (a + 2*b + c + 2) >> 2, narrowed to uint8_t.
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
// Rounded average of two values: (a + b + 1) >> 1.
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
367
368 // vertical
// vertical
// Each column x of the 4x4 block is set to AVG3(top[x-1], top[x], top[x+1]),
// so top[-1] .. top[4] are read. All four rows are identical.
static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint8_t row[4];
  int x, y;
  for (x = 0; x < 4; ++x) {
    row[x] = AVG3(top[x - 1], top[x], top[x + 1]);
  }
  for (y = 0; y < 4; ++y) {
    memcpy(dst + y * BPS, row, 4);
  }
}
381
382 // horizontal
// horizontal
// Row y of the 4x4 block is filled with one value: the AVG3 of three
// consecutive samples walking from top[-1] towards top[-5]; the last row
// repeats top[-5] as its third tap.
static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  // top[-1], top[-2], top[-3], top[-4], top[-5], and top[-5] once more.
  const int taps[6] = {
    top[-1], top[-2], top[-3], top[-4], top[-5], top[-5]
  };
  int y;
  for (y = 0; y < 4; ++y) {
    // Multiplying by 0x01010101 replicates the byte into all four lanes.
    WebPUint32ToMem(dst + y * BPS,
                    0x01010101U * AVG3(taps[y], taps[y + 1], taps[y + 2]));
  }
}
394
// DC prediction for a 4x4 block: the rounded mean of the eight samples
// top[0..3] and top[-5..-2] is written to every position.
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint32_t sum = 4;   // rounding term for the '>> 3' below
  int i;
  for (i = 0; i < 4; ++i) sum += top[i];
  for (i = -5; i < -1; ++i) sum += top[i];
  Fill(dst, sum >> 3, 4);
}
401
// Diagonal predictor in which all samples with the same (x - y) share one
// value: DST(x, y) = AVG3(top[d - 2], top[d - 1], top[d]) with d = x - y.
// Reads top[-5] .. top[3].
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const uint8_t* const e = top + (x - y);
      DST(x, y) = AVG3(e[-2], e[-1], e[0]);
    }
  }
}
420
// Diagonal predictor in which all samples with the same (x + y) share one
// value: DST(x, y) = AVG3(top[s], top[s + 1], top[s + 2]) with s = x + y.
// For the last sample (s == 6) the third tap is top[7] again, so only
// top[0] .. top[7] are read.
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  int x, y;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int s = x + y;
      const int last = (s + 2 > 7) ? 7 : s + 2;
      DST(x, y) = AVG3(top[s], top[s + 1], top[last]);
    }
  }
}
438
// 4x4 directional predictor built from the corner sample top[-1], the
// samples top[-2..-4] and the samples top[0..3]. Row 0 holds 2-tap averages
// of neighbouring samples, row 1 holds 3-tap averages; rows 2 and 3 repeat
// rows 0 and 1 shifted one column to the right, with column 0 filled from
// the top[-1..-4] side.
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];   // corner sample
  const int I = top[-2];   // samples stored before the corner
  const int J = top[-3];
  const int K = top[-4];
  const int A = top[0];    // samples stored from top[0] onwards
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  // 2-tap averages (rows 0 and 2).
  DST(0, 0) = DST(1, 2) = AVG2(X, A);
  DST(1, 0) = DST(2, 2) = AVG2(A, B);
  DST(2, 0) = DST(3, 2) = AVG2(B, C);
  DST(3, 0) = AVG2(C, D);

  // 3-tap averages (rows 1 and 3, plus column 0 of rows 2 and 3).
  DST(0, 3) = AVG3(K, J, I);
  DST(0, 2) = AVG3(J, I, X);
  DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
  DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
  DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
  DST(3, 1) = AVG3(B, C, D);
}
460
// 4x4 directional predictor that only uses the eight samples top[0..7].
// Rows 0 and 2 hold 2-tap averages, rows 1 and 3 hold 3-tap averages; rows
// 2 and 3 are rows 0 and 1 shifted one column to the left, except for the
// last column, which always uses a 3-tap average.
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  const int D = top[3];
  const int E = top[4];
  const int F = top[5];
  const int G = top[6];
  const int H = top[7];
  // 2-tap averages.
  DST(0, 0) = AVG2(A, B);
  DST(1, 0) = DST(0, 2) = AVG2(B, C);
  DST(2, 0) = DST(1, 2) = AVG2(C, D);
  DST(3, 0) = DST(2, 2) = AVG2(D, E);

  // 3-tap averages.
  DST(0, 1) = AVG3(A, B, C);
  DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
  DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
  DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
  DST(3, 2) = AVG3(E, F, G);
  DST(3, 3) = AVG3(F, G, H);
}
482
// 4x4 directional predictor that only uses the four samples top[-2..-5].
// Values alternate between 2-tap and 3-tap averages along each row; every
// position past the last available sample is set to top[-5] unfiltered.
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  DST(0, 0) = AVG2(I, J);
  DST(2, 0) = DST(0, 1) = AVG2(J, K);
  DST(2, 1) = DST(0, 2) = AVG2(K, L);
  DST(1, 0) = AVG3(I, J, K);
  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);   // L repeated as the third tap
  // Remaining six positions: plain copy of L.
  DST(3, 2) = DST(2, 2) =
      DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
497
// 4x4 directional predictor built from the corner sample top[-1], the
// samples top[-2..-5] and the samples top[0..2]. Column 0 holds 2-tap
// averages and column 1 holds 3-tap averages; columns 2 and 3 repeat columns
// 0 and 1 shifted one row down, with row 0 filled from the top[0..2] side.
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];   // corner sample
  const int I = top[-2];   // samples stored before the corner
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];    // samples stored from top[0] onwards
  const int B = top[1];
  const int C = top[2];

  // 2-tap averages (columns 0 and 2).
  DST(0, 0) = DST(2, 1) = AVG2(I, X);
  DST(0, 1) = DST(2, 2) = AVG2(J, I);
  DST(0, 2) = DST(2, 3) = AVG2(K, J);
  DST(0, 3) = AVG2(L, K);

  // 3-tap averages (columns 1 and 3, plus row 0 of columns 2 and 3).
  DST(3, 0) = AVG3(A, B, C);
  DST(2, 0) = AVG3(X, A, B);
  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
  DST(1, 3) = AVG3(L, K, J);
}
520
// True-motion prediction for a 4x4 block:
//   dst(x, y) = clip(top[x] + top[-2 - y] - top[-1])
// where the clipping is performed through the clip1[] lookup table.
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int corner = top[-1];
  int x, y;
  for (y = 0; y < 4; ++y) {
    // Table base for this row: index it with top[x] to get the clipped sum.
    const uint8_t* const row_clip = clip1 + 255 - corner + top[-2 - y];
    uint8_t* const out = dst + y * BPS;
    for (x = 0; x < 4; ++x) {
      out[x] = row_clip[top[x]];
    }
  }
}
532
533 #undef DST
534 #undef AVG3
535 #undef AVG2
536
537 // Left samples are top[-5 .. -2], top_left is top[-1], top are
538 // located at top[0..3], and top right is top[4..7]
// Computes all ten 4x4 luma predictions from the same neighbourhood 'top'
// and writes each one at its own I4xx4 offset from 'dst'.
static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
                          const uint8_t* WEBP_RESTRICT top) {
  DC4(dst + I4DC4, top);
  TM4(dst + I4TM4, top);
  VE4(dst + I4VE4, top);
  HE4(dst + I4HE4, top);
  RD4(dst + I4RD4, top);
  VR4(dst + I4VR4, top);
  LD4(dst + I4LD4, top);
  VL4(dst + I4VL4, top);
  HD4(dst + I4HD4, top);
  HU4(dst + I4HU4, top);
}
552
553 #endif // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
554
555 //------------------------------------------------------------------------------
556 // Metric
557
558 #if !WEBP_NEON_OMIT_C_CODE
// Returns the sum of squared differences between the w x h blocks at 'a' and
// 'b'. Both blocks use a row stride of BPS.
static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int sse = 0;
  int row, col;
  for (row = 0; row < h; ++row) {
    const uint8_t* const pa = a + row * BPS;
    const uint8_t* const pb = b + row * BPS;
    for (col = 0; col < w; ++col) {
      const int d = (int)pa[col] - pb[col];
      sse += d * d;
    }
  }
  return sse;
}

// Sum of squared differences over a 16x16 block.
static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  const int width = 16, height = 16;
  return GetSSE(a, b, width, height);
}

// Sum of squared differences over a block 16 wide and 8 high.
static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
                     const uint8_t* WEBP_RESTRICT b) {
  const int width = 16, height = 8;
  return GetSSE(a, b, width, height);
}

// Sum of squared differences over an 8x8 block.
static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  const int width = 8, height = 8;
  return GetSSE(a, b, width, height);
}

// Sum of squared differences over a 4x4 block.
static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  const int width = 4, height = 4;
  return GetSSE(a, b, width, height);
}
591 #endif // !WEBP_NEON_OMIT_C_CODE
592
// For each of the four side-by-side 4x4 blocks that make up a 16x4 strip of
// 'ref' (row stride BPS), stores the plain sum of its 16 samples in dc[k].
// The sum is not divided by the sample count.
static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k;
  for (k = 0; k < 4; ++k) {
    const uint8_t* const block = ref + 4 * k;   // k-th 4x4 block of the strip
    uint32_t sum = 0;
    int x, y;
    for (y = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x) {
        sum += block[x + y * BPS];
      }
    }
    dc[k] = sum;
  }
}
606
607 //------------------------------------------------------------------------------
608 // Texture distortion
609 //
610 // We try to match the spectral content (weighted) between source and
611 // reconstructed samples.
612
613 #if !WEBP_NEON_OMIT_C_CODE
614 // Hadamard transform
615 // Returns the weighted sum of the absolute value of transformed coefficients.
616 // w[] contains a row-major 4 by 4 symmetric matrix.
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
// 'in' is a 4x4 block of samples with a row stride of BPS.
static int TTransform(const uint8_t* WEBP_RESTRICT in,
                      const uint16_t* WEBP_RESTRICT w) {
  int tmp[16];
  int acc = 0;
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i) {
    const uint8_t* const p = in + i * BPS;
    int* const t = tmp + 4 * i;
    const int s02 = p[0] + p[2];
    const int s13 = p[1] + p[3];
    const int d13 = p[1] - p[3];
    const int d02 = p[0] - p[2];
    t[0] = s02 + s13;
    t[1] = d02 + d13;
    t[2] = d02 - d13;
    t[3] = s02 - s13;
  }
  // vertical pass
  for (i = 0; i < 4; ++i) {
    const int s02 = tmp[0 + i] + tmp[8 + i];
    const int s13 = tmp[4 + i] + tmp[12 + i];
    const int d13 = tmp[4 + i] - tmp[12 + i];
    const int d02 = tmp[0 + i] - tmp[8 + i];

    acc += w[0 + i] * abs(s02 + s13);
    acc += w[4 + i] * abs(d02 + d13);
    acc += w[8 + i] * abs(d02 - d13);
    acc += w[12 + i] * abs(s02 - s13);
  }
  return acc;
}

// Texture distortion between the 4x4 blocks 'a' and 'b': the absolute
// difference of their weighted transform sums, divided by 32 ('>> 5').
static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
                      const uint8_t* WEBP_RESTRICT const b,
                      const uint16_t* WEBP_RESTRICT const w) {
  const int sum_a = TTransform(a, w);
  const int sum_b = TTransform(b, w);
  return abs(sum_b - sum_a) >> 5;
}

// Texture distortion of a 16x16 block: the sum of Disto4x4_C() over its
// sixteen 4x4 sub-blocks, all using the same weights 'w'.
static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  int total = 0;
  int bx, by;
  for (by = 0; by < 4; ++by) {
    for (bx = 0; bx < 4; ++bx) {
      const int offset = 4 * bx + 4 * by * BPS;
      total += Disto4x4_C(a + offset, b + offset, w);
    }
  }
  return total;
}
672 #endif // !WEBP_NEON_OMIT_C_CODE
673
674 //------------------------------------------------------------------------------
675 // Quantization
676 //
677
678 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// Scan order used by QuantizeBlock_C(): kZigzag[n] is the index in the
// 16-entry input block of the coefficient that is emitted at output
// position n.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
682
683 // Simple quantization
// Simple quantization
// Quantizes the 16 coefficients of 'in' using the per-coefficient tables in
// 'mtx'. On return:
//  - out[n] holds the signed level of coefficient kZigzag[n] (zigzag order),
//  - in[j] is overwritten with the de-quantized value level * q_[j].
// Coefficients whose biased magnitude does not exceed zthresh_[j] become 0.
// Returns 1 if at least one level is non-zero, 0 otherwise.
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
                           const VP8Matrix* WEBP_RESTRICT const mtx) {
  int last = -1;
  int n;
  for (n = 0; n < 16; ++n) {
    const int j = kZigzag[n];
    const int sign = (in[j] < 0);
    const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level = 0;
    int dequantized = 0;
    if (coeff > mtx->zthresh_[j]) {
      const uint32_t Q = mtx->q_[j];
      const uint32_t iQ = mtx->iq_[j];
      const uint32_t B = mtx->bias_[j];
      level = QUANTDIV(coeff, iQ, B);
      if (level > MAX_LEVEL) level = MAX_LEVEL;
      if (sign) level = -level;
      dequantized = level * (int)Q;
      if (level) last = n;   // remember the last non-zero position
    }
    in[j] = dequantized;
    out[n] = level;
  }
  return (last >= 0);
}
709
// Quantizes two consecutive 16-coefficient blocks through the
// VP8EncQuantizeBlock function pointer. Bit 0 of the result is the return
// value for the first block, bit 1 the return value for the second one.
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int nz0 = VP8EncQuantizeBlock(in, out, mtx);
  const int nz1 = VP8EncQuantizeBlock(in + 16, out + 16, mtx);
  return nz0 | (nz1 << 1);
}
717 #endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
718
719 //------------------------------------------------------------------------------
720 // Block copy
721
// Copies a block of 'w' bytes by 'h' rows from 'src' to 'dst'. Both buffers
// use a row stride of BPS.
static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int row;
  for (row = 0; row < h; ++row) {
    memcpy(&dst[row * BPS], &src[row * BPS], w);
  }
}

// Copies a 4x4 block.
static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst) {
  const int width = 4, height = 4;
  Copy(src, dst, width, height);
}

// Copies a block 16 bytes wide and 8 rows high.
static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
                       uint8_t* WEBP_RESTRICT dst) {
  const int width = 16, height = 8;
  Copy(src, dst, width, height);
}
741
742 //------------------------------------------------------------------------------
743 // Initialization
744
745 // Speed-critical function pointers. We have to initialize them to the default
746 // implementations within VP8EncDspInit().
// Definitions of the encoder's dispatch pointers. They are zero-initialized
// as file-scope objects and receive their actual targets in VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;

// CPU-feature query hook (may be NULL; VP8EncDspInit() checks before use).
extern VP8CPUInfo VP8GetCPUInfo;
// Platform-specific initializers, defined in other translation units. Each
// one is only called from VP8EncDspInit() when the matching WEBP_HAVE_* /
// WEBP_USE_* macro is defined.
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
775
// Initializes the encoder's dispatch pointers:
//  1. runs VP8DspInit() and fills the local lookup table,
//  2. installs the C implementations that are compiled in,
//  3. lets the platform-specific initializers run when they are compiled in
//     and the CPU reports the feature,
//  4. asserts that every pointer ended up non-NULL.
// NOTE(review): the function header is produced by WEBP_DSP_INIT_FUNC, which
// is defined outside this file; presumably it makes the body run only once
// per process -- confirm against its definition.
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
  VP8DspInit();  // common inverse transforms
  InitTables();

  // default C implementations
  // (skipped when WEBP_NEON_OMIT_C_CODE is set: the functions are not built)
#if !WEBP_NEON_OMIT_C_CODE
  VP8ITransform = ITransform_C;
  VP8FTransform = FTransform_C;
  VP8FTransformWHT = FTransformWHT_C;
  VP8TDisto4x4 = Disto4x4_C;
  VP8TDisto16x16 = Disto16x16_C;
  VP8CollectHistogram = CollectHistogram_C;
  VP8SSE16x16 = SSE16x16_C;
  VP8SSE16x8 = SSE16x8_C;
  VP8SSE8x8 = SSE8x8_C;
  VP8SSE4x4 = SSE4x4_C;
#endif

  // The same C quantizer serves both the regular and the WHT entry point.
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
  VP8EncQuantizeBlock = QuantizeBlock_C;
  VP8EncQuantize2Blocks = Quantize2Blocks_C;
  VP8EncQuantizeBlockWHT = QuantizeBlock_C;
#endif

#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
  VP8EncPredLuma4 = Intra4Preds_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
  VP8EncPredLuma16 = Intra16Preds_C;
#endif

  // These C versions are always compiled and always installed first.
  VP8FTransform2 = FTransform2_C;
  VP8EncPredChroma8 = IntraChromaPreds_C;
  VP8Mean16x4 = Mean16x4_C;
  VP8Copy4x4 = Copy4x4_C;
  VP8Copy16x8 = Copy16x8_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_HAVE_SSE2)
    if (VP8GetCPUInfo(kSSE2)) {
      VP8EncDspInitSSE2();
      // SSE4.1 is only probed once SSE2 has been confirmed.
#if defined(WEBP_HAVE_SSE41)
      if (VP8GetCPUInfo(kSSE4_1)) {
        VP8EncDspInitSSE41();
      }
#endif
    }
#endif
#if defined(WEBP_USE_MIPS32)
    if (VP8GetCPUInfo(kMIPS32)) {
      VP8EncDspInitMIPS32();
    }
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
    if (VP8GetCPUInfo(kMIPSdspR2)) {
      VP8EncDspInitMIPSdspR2();
    }
#endif
#if defined(WEBP_USE_MSA)
    if (VP8GetCPUInfo(kMSA)) {
      VP8EncDspInitMSA();
    }
#endif
  }

  // When the C code is omitted, the NEON initializer runs unconditionally;
  // otherwise it runs only if the CPU query reports NEON.
#if defined(WEBP_HAVE_NEON)
  if (WEBP_NEON_OMIT_C_CODE ||
      (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
    VP8EncDspInitNEON();
  }
#endif

  // Sanity checks: every dispatch pointer must have been assigned above.
  assert(VP8ITransform != NULL);
  assert(VP8FTransform != NULL);
  assert(VP8FTransformWHT != NULL);
  assert(VP8TDisto4x4 != NULL);
  assert(VP8TDisto16x16 != NULL);
  assert(VP8CollectHistogram != NULL);
  assert(VP8SSE16x16 != NULL);
  assert(VP8SSE16x8 != NULL);
  assert(VP8SSE8x8 != NULL);
  assert(VP8SSE4x4 != NULL);
  assert(VP8EncQuantizeBlock != NULL);
  assert(VP8EncQuantize2Blocks != NULL);
  assert(VP8FTransform2 != NULL);
  assert(VP8EncPredLuma4 != NULL);
  assert(VP8EncPredLuma16 != NULL);
  assert(VP8EncPredChroma8 != NULL);
  assert(VP8Mean16x4 != NULL);
  assert(VP8EncQuantizeBlockWHT != NULL);
  assert(VP8Copy4x4 != NULL);
  assert(VP8Copy16x8 != NULL);
}
870