• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Speed-critical encoding functions.
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include <assert.h>
15 #include <stdlib.h>  // for abs()
16 
17 #include "src/dsp/dsp.h"
18 #include "src/enc/vp8i_enc.h"
19 
// Saturates 'v' to the [0, 255] range and returns it as a byte.
static WEBP_INLINE uint8_t clip_8b(int v) {
  if (v < 0) return 0;
  if (v > 255) return 255;
  return (uint8_t)v;
}
23 
24 #if !WEBP_NEON_OMIT_C_CODE
// Returns 'v' capped from above at 'max'. No lower bound is applied.
static WEBP_INLINE int clip_max(int v, int max) {
  if (v > max) return max;
  return v;
}
28 #endif  // !WEBP_NEON_OMIT_C_CODE
29 
30 //------------------------------------------------------------------------------
31 // Compute susceptibility based on DCT-coeff histograms:
32 // the higher, the "easier" the macroblock is to compress.
33 
// Offsets (x + y * BPS) of the top-left sample of each 4x4 block:
// 16 luma entries, followed by 4 entries for U and 4 entries for V.
const int VP8DspScan[16 + 4 + 4] = {
  // Luma: x in {0, 4, 8, 12}, one line per y in {0, 4, 8, 12}.
   0 * BPS + 0,   0 * BPS + 4,   0 * BPS + 8,   0 * BPS + 12,
   4 * BPS + 0,   4 * BPS + 4,   4 * BPS + 8,   4 * BPS + 12,
   8 * BPS + 0,   8 * BPS + 4,   8 * BPS + 8,   8 * BPS + 12,
  12 * BPS + 0,  12 * BPS + 4,  12 * BPS + 8,  12 * BPS + 12,

  // U: x in {0, 4}, y in {0, 4}.
  0 * BPS + 0,  0 * BPS + 4,   4 * BPS + 0,  4 * BPS + 4,
  // V: x in {8, 12}, y in {0, 4}.
  0 * BPS + 8,  0 * BPS + 12,  4 * BPS + 8,  4 * BPS + 12
};
44 
45 // general-purpose util function
VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH+1],VP8Histogram * const histo)46 void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
47                          VP8Histogram* const histo) {
48   int max_value = 0, last_non_zero = 1;
49   int k;
50   for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
51     const int value = distribution[k];
52     if (value > 0) {
53       if (value > max_value) max_value = value;
54       last_non_zero = k;
55     }
56   }
57   histo->max_value = max_value;
58   histo->last_non_zero = last_non_zero;
59 }
60 
61 #if !WEBP_NEON_OMIT_C_CODE
// Accumulates a histogram of transform-coefficient magnitudes over the
// blocks [start_block, end_block) and stores its summary in 'histo'.
// Block 'j' is located at offset VP8DspScan[j] in both 'ref' and 'pred';
// the transform itself goes through the VP8FTransform function pointer.
static void CollectHistogram_C(const uint8_t* WEBP_RESTRICT ref,
                               const uint8_t* WEBP_RESTRICT pred,
                               int start_block, int end_block,
                               VP8Histogram* WEBP_RESTRICT const histo) {
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  int block;
  for (block = start_block; block < end_block; ++block) {
    const int offset = VP8DspScan[block];
    int16_t coeffs[16];
    int n;

    VP8FTransform(ref + offset, pred + offset, coeffs);

    // Bin index is |coeff| >> 3, saturated at MAX_COEFF_THRESH.
    for (n = 0; n < 16; ++n) {
      const int bin = clip_max(abs(coeffs[n]) >> 3, MAX_COEFF_THRESH);
      ++distribution[bin];
    }
  }
  VP8SetHistogramData(distribution, histo);
}
83 #endif  // !WEBP_NEON_OMIT_C_CODE
84 
85 //------------------------------------------------------------------------------
// run-time tables (under 1k: clip1[] below is 766 bytes)
87 
// Clipping lookup table: once filled by InitTables(), clip1[255 + v] holds
// 'v' clipped to [0,255], for any v in [-255,510].
static uint8_t clip1[255 + 510 + 1];

// Non-zero once clip1[] has been filled. It is declared 'volatile' with the
// intent of preventing instruction reordering, so that it is only set to
// true _last_ (so as to be thread-safe).
static volatile int tables_ok = 0;
93 
InitTables(void)94 static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
95   if (!tables_ok) {
96     int i;
97     for (i = -255; i <= 255 + 255; ++i) {
98       clip1[255 + i] = clip_8b(i);
99     }
100     tables_ok = 1;
101   }
102 }
103 
104 
105 //------------------------------------------------------------------------------
106 // Transforms (Paragraph 14.4)
107 
108 #if !WEBP_NEON_OMIT_C_CODE
109 
110 #define STORE(x, y, v) \
111   dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
112 
ITransformOne(const uint8_t * WEBP_RESTRICT ref,const int16_t * WEBP_RESTRICT in,uint8_t * WEBP_RESTRICT dst)113 static WEBP_INLINE void ITransformOne(const uint8_t* WEBP_RESTRICT ref,
114                                       const int16_t* WEBP_RESTRICT in,
115                                       uint8_t* WEBP_RESTRICT dst) {
116   int C[4 * 4], *tmp;
117   int i;
118   tmp = C;
119   for (i = 0; i < 4; ++i) {    // vertical pass
120     const int a = in[0] + in[8];
121     const int b = in[0] - in[8];
122     const int c =
123         WEBP_TRANSFORM_AC3_MUL2(in[4]) - WEBP_TRANSFORM_AC3_MUL1(in[12]);
124     const int d =
125         WEBP_TRANSFORM_AC3_MUL1(in[4]) + WEBP_TRANSFORM_AC3_MUL2(in[12]);
126     tmp[0] = a + d;
127     tmp[1] = b + c;
128     tmp[2] = b - c;
129     tmp[3] = a - d;
130     tmp += 4;
131     in++;
132   }
133 
134   tmp = C;
135   for (i = 0; i < 4; ++i) {    // horizontal pass
136     const int dc = tmp[0] + 4;
137     const int a = dc + tmp[8];
138     const int b = dc - tmp[8];
139     const int c =
140         WEBP_TRANSFORM_AC3_MUL2(tmp[4]) - WEBP_TRANSFORM_AC3_MUL1(tmp[12]);
141     const int d =
142         WEBP_TRANSFORM_AC3_MUL1(tmp[4]) + WEBP_TRANSFORM_AC3_MUL2(tmp[12]);
143     STORE(0, i, a + d);
144     STORE(1, i, b + c);
145     STORE(2, i, b - c);
146     STORE(3, i, a - d);
147     tmp++;
148   }
149 }
150 
// Reconstructs one 4x4 block and, when 'do_two' is non-zero, also the block
// located 4 samples to its right (using the next 16 coefficients of 'in').
static void ITransform_C(const uint8_t* WEBP_RESTRICT ref,
                         const int16_t* WEBP_RESTRICT in,
                         uint8_t* WEBP_RESTRICT dst,
                         int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
160 
// Forward 4x4 transform of the residual (src - ref). Both inputs are read
// with stride BPS; the 16 resulting coefficients are written to 'out'.
static void FTransform_C(const uint8_t* WEBP_RESTRICT src,
                         const uint8_t* WEBP_RESTRICT ref,
                         int16_t* WEBP_RESTRICT out) {
  int tmp[16];
  int row, col;

  // First pass: one residual row in, four intermediate values out.
  for (row = 0; row < 4; ++row) {
    const uint8_t* const s = src + row * BPS;
    const uint8_t* const r = ref + row * BPS;
    int* const t = tmp + 4 * row;
    const int d0 = s[0] - r[0];       // each difference lies in [-255,255]
    const int d1 = s[1] - r[1];
    const int d2 = s[2] - r[2];
    const int d3 = s[3] - r[3];
    const int sum03 = d0 + d3;        // [-510,510]
    const int sum12 = d1 + d2;
    const int dif12 = d1 - d2;
    const int dif03 = d0 - d3;
    t[0] = (sum03 + sum12) * 8;       // [-8160,8160]
    t[1] = (dif12 * 2217 + dif03 * 5352 + 1812) >> 9;
    t[2] = (sum03 - sum12) * 8;
    t[3] = (dif03 * 2217 - dif12 * 5352 + 937) >> 9;
  }

  // Second pass: combines tmp[col], tmp[col + 4], tmp[col + 8], tmp[col + 12].
  for (col = 0; col < 4; ++col) {
    const int* const t = tmp + col;
    const int sum03 = t[0] + t[12];
    const int sum12 = t[4] + t[8];
    const int dif12 = t[4] - t[8];
    const int dif03 = t[0] - t[12];
    out[col + 0] = (sum03 + sum12 + 7) >> 4;
    out[col + 4] =
        ((dif12 * 2217 + dif03 * 5352 + 12000) >> 16) + (dif03 != 0);
    out[col + 8] = (sum03 - sum12 + 7) >> 4;
    out[col + 12] = (dif03 * 2217 - dif12 * 5352 + 51000) >> 16;
  }
}
191 #endif  // !WEBP_NEON_OMIT_C_CODE
192 
// Forward-transforms two 4x4 blocks lying side by side, through the
// VP8FTransform function pointer. The second block starts 4 samples to the
// right of the first one and its coefficients are stored at out + 16.
static void FTransform2_C(const uint8_t* WEBP_RESTRICT src,
                          const uint8_t* WEBP_RESTRICT ref,
                          int16_t* WEBP_RESTRICT out) {
  int n;
  for (n = 0; n < 2; ++n) {
    VP8FTransform(src + 4 * n, ref + 4 * n, out + 16 * n);
  }
}
199 
200 #if !WEBP_NEON_OMIT_C_CODE
// Forward Walsh-Hadamard transform. Reads the first coefficient of 16
// consecutive 16-coefficient blocks of 'in' (in[0], in[16], ..., in[240])
// and writes 16 transformed values to 'out'.
static void FTransformWHT_C(const int16_t* WEBP_RESTRICT in,
                            int16_t* WEBP_RESTRICT out) {
  // input is 12b signed
  int32_t tmp[16];
  int row, col;

  // First pass: each step handles four blocks, i.e. 64 coefficients of 'in'.
  for (row = 0; row < 4; ++row) {
    const int16_t* const dc = in + 64 * row;
    int32_t* const t = tmp + 4 * row;
    const int sum02 = dc[0 * 16] + dc[2 * 16];   // 13b
    const int sum13 = dc[1 * 16] + dc[3 * 16];
    const int dif13 = dc[1 * 16] - dc[3 * 16];
    const int dif02 = dc[0 * 16] - dc[2 * 16];
    t[0] = sum02 + sum13;   // 14b
    t[1] = dif02 + dif13;
    t[2] = dif02 - dif13;
    t[3] = sum02 - sum13;
  }

  // Second pass over tmp[col], tmp[col + 4], tmp[col + 8], tmp[col + 12];
  // the result is halved before being stored.
  for (col = 0; col < 4; ++col) {
    const int32_t* const t = tmp + col;
    const int sum02 = t[0] + t[8];    // 15b
    const int sum13 = t[4] + t[12];
    const int dif13 = t[4] - t[12];
    const int dif02 = t[0] - t[8];
    out[col +  0] = (sum02 + sum13) >> 1;   // 15b
    out[col +  4] = (dif02 + dif13) >> 1;
    out[col +  8] = (dif02 - dif13) >> 1;
    out[col + 12] = (sum02 - sum13) >> 1;
  }
}
231 #endif  // !WEBP_NEON_OMIT_C_CODE
232 
233 #undef STORE
234 
235 //------------------------------------------------------------------------------
236 // Intra predictions
237 
// Sets every sample of the size x size block at 'dst' (rows are BPS bytes
// apart) to 'value'.
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
  int y = 0;
  while (y < size) {
    memset(dst + y * BPS, value, size);
    ++y;
  }
}
244 
VerticalPred(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top,int size)245 static WEBP_INLINE void VerticalPred(uint8_t* WEBP_RESTRICT dst,
246                                      const uint8_t* WEBP_RESTRICT top,
247                                      int size) {
248   int j;
249   if (top != NULL) {
250     for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
251   } else {
252     Fill(dst, 127, size);
253   }
254 }
255 
HorizontalPred(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,int size)256 static WEBP_INLINE void HorizontalPred(uint8_t* WEBP_RESTRICT dst,
257                                        const uint8_t* WEBP_RESTRICT left,
258                                        int size) {
259   if (left != NULL) {
260     int j;
261     for (j = 0; j < size; ++j) {
262       memset(dst + j * BPS, left[j], size);
263     }
264   } else {
265     Fill(dst, 129, size);
266   }
267 }
268 
TrueMotion(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top,int size)269 static WEBP_INLINE void TrueMotion(uint8_t* WEBP_RESTRICT dst,
270                                    const uint8_t* WEBP_RESTRICT left,
271                                    const uint8_t* WEBP_RESTRICT top, int size) {
272   int y;
273   if (left != NULL) {
274     if (top != NULL) {
275       const uint8_t* const clip = clip1 + 255 - left[-1];
276       for (y = 0; y < size; ++y) {
277         const uint8_t* const clip_table = clip + left[y];
278         int x;
279         for (x = 0; x < size; ++x) {
280           dst[x] = clip_table[top[x]];
281         }
282         dst += BPS;
283       }
284     } else {
285       HorizontalPred(dst, left, size);
286     }
287   } else {
288     // true motion without left samples (hence: with default 129 value)
289     // is equivalent to VE prediction where you just copy the top samples.
290     // Note that if top samples are not available, the default value is
291     // then 129, and not 127 as in the VerticalPred case.
292     if (top != NULL) {
293       VerticalPred(dst, top, size);
294     } else {
295       Fill(dst, 129, size);
296     }
297   }
298 }
299 
DCMode(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT left,const uint8_t * WEBP_RESTRICT top,int size,int round,int shift)300 static WEBP_INLINE void DCMode(uint8_t* WEBP_RESTRICT dst,
301                                const uint8_t* WEBP_RESTRICT left,
302                                const uint8_t* WEBP_RESTRICT top,
303                                int size, int round, int shift) {
304   int DC = 0;
305   int j;
306   if (top != NULL) {
307     for (j = 0; j < size; ++j) DC += top[j];
308     if (left != NULL) {   // top and left present
309       for (j = 0; j < size; ++j) DC += left[j];
310     } else {      // top, but no left
311       DC += DC;
312     }
313     DC = (DC + round) >> shift;
314   } else if (left != NULL) {   // left but no top
315     for (j = 0; j < size; ++j) DC += left[j];
316     DC += DC;
317     DC = (DC + round) >> shift;
318   } else {   // no top, no left, nothing.
319     DC = 0x80;
320   }
321   Fill(dst, DC, size);
322 }
323 
324 //------------------------------------------------------------------------------
325 // Chroma 8x8 prediction (paragraph 12.2)
326 
// Writes the DC, vertical, horizontal and TrueMotion 8x8 predictions for
// both chroma planes. The V data sits 8 bytes after the U data in 'dst' and
// 'top', and 16 bytes after it in 'left'. NULL borders stay NULL.
static void IntraChromaPreds_C(uint8_t* WEBP_RESTRICT dst,
                               const uint8_t* WEBP_RESTRICT left,
                               const uint8_t* WEBP_RESTRICT top) {
  int plane;   // 0: U block, 1: V block
  for (plane = 0; plane < 2; ++plane) {
    uint8_t* const out = dst + 8 * plane;
    const uint8_t* const t = (top != NULL) ? top + 8 * plane : NULL;
    const uint8_t* const l = (left != NULL) ? left + 16 * plane : NULL;
    DCMode(C8DC8 + out, l, t, 8, 8, 4);
    VerticalPred(C8VE8 + out, t, 8);
    HorizontalPred(C8HE8 + out, l, 8);
    TrueMotion(C8TM8 + out, l, t, 8);
  }
}
344 
345 //------------------------------------------------------------------------------
346 // luma 16x16 prediction (paragraph 12.3)
347 
348 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
// Writes the four 16x16 luma predictions (DC, vertical, horizontal,
// TrueMotion) at their respective offsets inside 'dst'.
static void Intra16Preds_C(uint8_t* WEBP_RESTRICT dst,
                           const uint8_t* WEBP_RESTRICT left,
                           const uint8_t* WEBP_RESTRICT top) {
  uint8_t* const dc_dst = dst + I16DC16;
  uint8_t* const ve_dst = dst + I16VE16;
  uint8_t* const he_dst = dst + I16HE16;
  uint8_t* const tm_dst = dst + I16TM16;
  DCMode(dc_dst, left, top, 16, 16, 5);
  VerticalPred(ve_dst, top, 16);
  HorizontalPred(he_dst, left, 16);
  TrueMotion(tm_dst, left, top, 16);
}
357 #endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
358 
359 //------------------------------------------------------------------------------
360 // luma 4x4 prediction
361 
362 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
363 
// Sample at column 'x', row 'y' of the block pointed to by a local 'dst'.
#define DST(x, y) dst[(y) * BPS + (x)]
// Rounded weighted average (a + 2 * b + c) / 4, cast to uint8_t.
#define AVG3(a, b, c) ((uint8_t)((2 + (a) + (c) + 2 * (b)) >> 2))
// Rounded average (a + b) / 2. The result is not cast.
#define AVG2(a, b) ((1 + (a) + (b)) >> 1)
367 
368 // vertical
VE4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)369 static void VE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
370   const uint8_t vals[4] = {
371     AVG3(top[-1], top[0], top[1]),
372     AVG3(top[ 0], top[1], top[2]),
373     AVG3(top[ 1], top[2], top[3]),
374     AVG3(top[ 2], top[3], top[4])
375   };
376   int i;
377   for (i = 0; i < 4; ++i) {
378     memcpy(dst + i * BPS, vals, 4);
379   }
380 }
381 
382 // horizontal
HE4(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)383 static void HE4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
384   const int X = top[-1];
385   const int I = top[-2];
386   const int J = top[-3];
387   const int K = top[-4];
388   const int L = top[-5];
389   WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
390   WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
391   WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
392   WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
393 }
394 
// DC: fills the 4x4 block with the rounded average of the eight samples
// top[0..3] and top[-5..-2].
static void DC4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint32_t sum = 4;   // rounding term for the final >> 3
  int k;
  for (k = 0; k < 4; ++k) {
    sum += top[k];
    sum += top[k - 5];
  }
  Fill(dst, sum >> 3, 4);
}
401 
// Diagonal prediction where all samples with the same (x - y) share one
// value. The nine border samples top[-5] .. top[3] are read as one
// contiguous edge; value 'k' is the AVG3 of edge samples k, k + 1, k + 2.
static void RD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const uint8_t* const edge = top - 5;
  uint8_t diag[7];
  int k, x, y;
  for (k = 0; k < 7; ++k) {
    diag[k] = AVG3(edge[k], edge[k + 1], edge[k + 2]);
  }
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[3 + x - y];
    }
  }
}
420 
// Diagonal prediction where all samples with the same (x + y) share one
// value, computed from top[0..7]. Value 'k' is AVG3(top[k], top[k + 1],
// top[k + 2]); the last one repeats top[7] since there is no top[8].
static void LD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint8_t diag[7];
  int k, x, y;
  for (k = 0; k < 6; ++k) {
    diag[k] = AVG3(top[k], top[k + 1], top[k + 2]);
  }
  diag[6] = AVG3(top[6], top[7], top[7]);
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = diag[x + y];
    }
  }
}
438 
// Prediction built from top[-4..3]:
//  - row 0 holds the 2-tap averages AVG2(top[x - 1], top[x]),
//  - row 1 holds the 3-tap averages AVG3(top[x - 2], top[x - 1], top[x]),
//  - rows 2 and 3 are rows 0 and 1 shifted right by one sample, with a
//    3-tap average of border samples entering in column 0.
static void VR4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint8_t avg2[4];
  uint8_t avg3[4];
  int x;
  for (x = 0; x < 4; ++x) {
    avg2[x] = AVG2(top[x - 1], top[x]);
    avg3[x] = AVG3(top[x - 2], top[x - 1], top[x]);
  }
  for (x = 0; x < 4; ++x) {
    DST(x, 0) = avg2[x];
    DST(x, 1) = avg3[x];
  }
  DST(0, 2) = AVG3(top[-3], top[-2], top[-1]);
  DST(0, 3) = AVG3(top[-4], top[-3], top[-2]);
  for (x = 1; x < 4; ++x) {
    DST(x, 2) = avg2[x - 1];
    DST(x, 3) = avg3[x - 1];
  }
}
460 
// Prediction built from top[0..7]:
//  - row 0 holds the 2-tap averages AVG2(top[x], top[x + 1]),
//  - row 1 holds the 3-tap averages AVG3(top[x], top[x + 1], top[x + 2]),
//  - rows 2 and 3 are rows 0 and 1 shifted left by one sample; their last
//    column receives a 3-tap average taken further right in 'top'.
static void VL4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  uint8_t avg2[4];
  uint8_t avg3[4];
  int x;
  for (x = 0; x < 4; ++x) {
    avg2[x] = AVG2(top[x], top[x + 1]);
    avg3[x] = AVG3(top[x], top[x + 1], top[x + 2]);
  }
  for (x = 0; x < 4; ++x) {
    DST(x, 0) = avg2[x];
    DST(x, 1) = avg3[x];
  }
  for (x = 0; x < 3; ++x) {
    DST(x, 2) = avg2[x + 1];
    DST(x, 3) = avg3[x + 1];
  }
  DST(3, 2) = AVG3(top[4], top[5], top[6]);
  DST(3, 3) = AVG3(top[5], top[6], top[7]);
}
482 
// Prediction built from the four samples top[-2], top[-3], top[-4], top[-5].
// The output is a sequence alternating 2-tap and 3-tap averages and ending
// with the plain last sample; DST(x, y) takes element x + 2 * y of it.
static void HU4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int l0 = top[-2];
  const int l1 = top[-3];
  const int l2 = top[-4];
  const int l3 = top[-5];
  uint8_t seq[10];
  int x, y;
  seq[0] = AVG2(l0, l1);
  seq[1] = AVG3(l0, l1, l2);
  seq[2] = AVG2(l1, l2);
  seq[3] = AVG3(l1, l2, l3);
  seq[4] = AVG2(l2, l3);
  seq[5] = AVG3(l2, l3, l3);
  seq[6] = seq[7] = seq[8] = seq[9] = l3;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = seq[x + 2 * y];
    }
  }
}
497 
// Prediction built from top[-5..2]. A 10-entry sequence of 2-tap and 3-tap
// averages is computed along the border; row 'y' of the block is the
// 4-sample window of that sequence starting at index 6 - 2 * y.
static void HD4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  const int X = top[-1];
  const int I = top[-2];
  const int J = top[-3];
  const int K = top[-4];
  const int L = top[-5];
  const int A = top[0];
  const int B = top[1];
  const int C = top[2];
  uint8_t seq[10];
  int x, y;
  seq[0] = AVG2(L, K);
  seq[1] = AVG3(L, K, J);
  seq[2] = AVG2(K, J);
  seq[3] = AVG3(K, J, I);
  seq[4] = AVG2(J, I);
  seq[5] = AVG3(J, I, X);
  seq[6] = AVG2(I, X);
  seq[7] = AVG3(I, X, A);
  seq[8] = AVG3(X, A, B);
  seq[9] = AVG3(A, B, C);
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      DST(x, y) = seq[6 - 2 * y + x];
    }
  }
}
520 
// 4x4 TrueMotion: sample (x, y) is top[x] + top[-2 - y] - top[-1], clipped
// to [0,255] through the clip1[] table.
static void TM4(uint8_t* WEBP_RESTRICT dst, const uint8_t* WEBP_RESTRICT top) {
  // Clip table shifted by the top-left sample top[-1].
  const uint8_t* const base = clip1 + 255 - top[-1];
  int x, y;
  for (y = 0; y < 4; ++y) {
    const uint8_t* const row_clip = base + top[-2 - y];
    uint8_t* const out = dst + y * BPS;
    for (x = 0; x < 4; ++x) {
      out[x] = row_clip[top[x]];
    }
  }
}
532 
533 #undef DST
534 #undef AVG3
535 #undef AVG2
536 
537 // Left samples are top[-5 .. -2], top_left is top[-1], top are
538 // located at top[0..3], and top right is top[4..7]
Intra4Preds_C(uint8_t * WEBP_RESTRICT dst,const uint8_t * WEBP_RESTRICT top)539 static void Intra4Preds_C(uint8_t* WEBP_RESTRICT dst,
540                           const uint8_t* WEBP_RESTRICT top) {
541   DC4(I4DC4 + dst, top);
542   TM4(I4TM4 + dst, top);
543   VE4(I4VE4 + dst, top);
544   HE4(I4HE4 + dst, top);
545   RD4(I4RD4 + dst, top);
546   VR4(I4VR4 + dst, top);
547   LD4(I4LD4 + dst, top);
548   VL4(I4VL4 + dst, top);
549   HD4(I4HD4 + dst, top);
550   HU4(I4HU4 + dst, top);
551 }
552 
553 #endif  // !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
554 
555 //------------------------------------------------------------------------------
556 // Metric
557 
558 #if !WEBP_NEON_OMIT_C_CODE
// Returns the sum of squared differences between the w x h areas at 'a'
// and 'b'. Both areas are read with a row stride of BPS.
static WEBP_INLINE int GetSSE(const uint8_t* WEBP_RESTRICT a,
                              const uint8_t* WEBP_RESTRICT b,
                              int w, int h) {
  int sse = 0;
  int row, col;
  for (row = 0; row < h; ++row) {
    const uint8_t* const pa = a + row * BPS;
    const uint8_t* const pb = b + row * BPS;
    for (col = 0; col < w; ++col) {
      const int delta = (int)pa[col] - pb[col];
      sse += delta * delta;
    }
  }
  return sse;
}
574 
// Sum of squared differences over a 16 (wide) x 16 (high) area.
static int SSE16x16_C(const uint8_t* WEBP_RESTRICT a,
                      const uint8_t* WEBP_RESTRICT b) {
  enum { kWidth = 16, kHeight = 16 };
  return GetSSE(a, b, kWidth, kHeight);
}
// Sum of squared differences over a 16 (wide) x 8 (high) area.
static int SSE16x8_C(const uint8_t* WEBP_RESTRICT a,
                     const uint8_t* WEBP_RESTRICT b) {
  enum { kWidth = 16, kHeight = 8 };
  return GetSSE(a, b, kWidth, kHeight);
}
// Sum of squared differences over an 8 (wide) x 8 (high) area.
static int SSE8x8_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  enum { kWidth = 8, kHeight = 8 };
  return GetSSE(a, b, kWidth, kHeight);
}
// Sum of squared differences over a 4 (wide) x 4 (high) area.
static int SSE4x4_C(const uint8_t* WEBP_RESTRICT a,
                    const uint8_t* WEBP_RESTRICT b) {
  enum { kWidth = 4, kHeight = 4 };
  return GetSSE(a, b, kWidth, kHeight);
}
591 #endif  // !WEBP_NEON_OMIT_C_CODE
592 
// For each of the four 4x4 blocks lying side by side in a 16x4 area of
// 'ref' (stride BPS), stores the sum of the block's 16 samples in dc[k].
// Note that the stored value is the plain sum: no division is applied.
static void Mean16x4_C(const uint8_t* WEBP_RESTRICT ref, uint32_t dc[4]) {
  int k;
  for (k = 0; k < 4; ++k) {
    const uint8_t* const block = ref + 4 * k;
    uint32_t sum = 0;
    int y;
    for (y = 0; y < 4; ++y) {
      const uint8_t* const row = block + y * BPS;
      sum += row[0] + row[1] + row[2] + row[3];
    }
    dc[k] = sum;
  }
}
606 
607 //------------------------------------------------------------------------------
608 // Texture distortion
609 //
610 // We try to match the spectral content (weighted) between source and
611 // reconstructed samples.
612 
613 #if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform of the 4x4 block at 'in' (stride BPS).
// Returns the sum of the absolute transformed coefficients, each one
// multiplied by its weight from w[].
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* WEBP_RESTRICT in,
                      const uint16_t* WEBP_RESTRICT w) {
  int tmp[16];
  int total = 0;
  int k;

  // horizontal pass: row k of 'in' -> tmp[4 * k .. 4 * k + 3]
  for (k = 0; k < 4; ++k) {
    const uint8_t* const row = in + k * BPS;
    int* const t = tmp + 4 * k;
    const int sum02 = row[0] + row[2];
    const int sum13 = row[1] + row[3];
    const int dif13 = row[1] - row[3];
    const int dif02 = row[0] - row[2];
    t[0] = sum02 + sum13;
    t[1] = dif02 + dif13;
    t[2] = dif02 - dif13;
    t[3] = sum02 - sum13;
  }

  // vertical pass: weights the results with w[k], w[k + 4], w[k + 8],
  // w[k + 12]
  for (k = 0; k < 4; ++k) {
    const int* const t = tmp + k;
    const uint16_t* const wk = w + k;
    const int sum02 = t[0] + t[8];
    const int sum13 = t[4] + t[12];
    const int dif13 = t[4] - t[12];
    const int dif02 = t[0] - t[8];

    total += wk[ 0] * abs(sum02 + sum13);
    total += wk[ 4] * abs(dif02 + dif13);
    total += wk[ 8] * abs(dif02 - dif13);
    total += wk[12] * abs(sum02 - sum13);
  }
  return total;
}
651 
// Texture distortion of one 4x4 block: absolute difference between the
// weighted transform sums of 'a' and 'b', scaled down by >> 5.
static int Disto4x4_C(const uint8_t* WEBP_RESTRICT const a,
                      const uint8_t* WEBP_RESTRICT const b,
                      const uint16_t* WEBP_RESTRICT const w) {
  const int energy_a = TTransform(a, w);
  const int energy_b = TTransform(b, w);
  const int delta = energy_b - energy_a;
  return abs(delta) >> 5;
}
659 
// Texture distortion of a 16x16 area: sum of Disto4x4_C() over its sixteen
// 4x4 blocks, visited row by row. The same weights 'w' serve every block.
static int Disto16x16_C(const uint8_t* WEBP_RESTRICT const a,
                        const uint8_t* WEBP_RESTRICT const b,
                        const uint16_t* WEBP_RESTRICT const w) {
  int total = 0;
  int bx, by;
  for (by = 0; by < 4; ++by) {
    for (bx = 0; bx < 4; ++bx) {
      const int offset = 4 * bx + 4 * by * BPS;
      total += Disto4x4_C(a + offset, b + offset, w);
    }
  }
  return total;
}
672 #endif  // !WEBP_NEON_OMIT_C_CODE
673 
674 //------------------------------------------------------------------------------
675 // Quantization
676 //
677 
678 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// Scan order: kZigzag[n] is the index, in the 16-coefficient input array,
// of the n-th coefficient to be quantized and emitted.
static const uint8_t kZigzag[16] = {
  0,  1,  4,  8,
  5,  2,  3,  6,
  9, 12, 13, 10,
  7, 11, 14, 15
};
682 
683 // Simple quantization
QuantizeBlock_C(int16_t in[16],int16_t out[16],const VP8Matrix * WEBP_RESTRICT const mtx)684 static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
685                            const VP8Matrix* WEBP_RESTRICT const mtx) {
686   int last = -1;
687   int n;
688   for (n = 0; n < 16; ++n) {
689     const int j = kZigzag[n];
690     const int sign = (in[j] < 0);
691     const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
692     if (coeff > mtx->zthresh_[j]) {
693       const uint32_t Q = mtx->q_[j];
694       const uint32_t iQ = mtx->iq_[j];
695       const uint32_t B = mtx->bias_[j];
696       int level = QUANTDIV(coeff, iQ, B);
697       if (level > MAX_LEVEL) level = MAX_LEVEL;
698       if (sign) level = -level;
699       in[j] = level * (int)Q;
700       out[n] = level;
701       if (level) last = n;
702     } else {
703       out[n] = 0;
704       in[j] = 0;
705     }
706   }
707   return (last >= 0);
708 }
709 
// Quantizes two consecutive 16-coefficient blocks through the
// VP8EncQuantizeBlock function pointer. Bit 0 of the returned value is the
// result for the first block, bit 1 the result for the second one.
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
                             const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int nz0 = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx);
  const int nz1 = VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx);
  return (nz0 << 0) | (nz1 << 1);
}
717 #endif  // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
718 
719 //------------------------------------------------------------------------------
720 // Block copy
721 
// Copies a w x h area from 'src' to 'dst'. Both are addressed with a row
// stride of BPS.
static WEBP_INLINE void Copy(const uint8_t* WEBP_RESTRICT src,
                             uint8_t* WEBP_RESTRICT dst, int w, int h) {
  int row = 0;
  while (row < h) {
    memcpy(dst + row * BPS, src + row * BPS, w);
    ++row;
  }
}
731 
// Copies a 4 (wide) x 4 (high) block from 'src' to 'dst'.
static void Copy4x4_C(const uint8_t* WEBP_RESTRICT src,
                      uint8_t* WEBP_RESTRICT dst) {
  enum { kWidth = 4, kHeight = 4 };
  Copy(src, dst, kWidth, kHeight);
}
736 
// Copies a 16 (wide) x 8 (high) block from 'src' to 'dst'.
static void Copy16x8_C(const uint8_t* WEBP_RESTRICT src,
                       uint8_t* WEBP_RESTRICT dst) {
  enum { kWidth = 16, kHeight = 8 };
  Copy(src, dst, kWidth, kHeight);
}
741 
742 //------------------------------------------------------------------------------
743 // Initialization
744 
745 // Speed-critical function pointers. We have to initialize them to the default
746 // implementations within VP8EncDspInit().
747 VP8CHisto VP8CollectHistogram;
748 VP8Idct VP8ITransform;
749 VP8Fdct VP8FTransform;
750 VP8Fdct VP8FTransform2;
751 VP8WHT VP8FTransformWHT;
752 VP8Intra4Preds VP8EncPredLuma4;
753 VP8IntraPreds VP8EncPredLuma16;
754 VP8IntraPreds VP8EncPredChroma8;
755 VP8Metric VP8SSE16x16;
756 VP8Metric VP8SSE8x8;
757 VP8Metric VP8SSE16x8;
758 VP8Metric VP8SSE4x4;
759 VP8WMetric VP8TDisto4x4;
760 VP8WMetric VP8TDisto16x16;
761 VP8MeanMetric VP8Mean16x4;
762 VP8QuantizeBlock VP8EncQuantizeBlock;
763 VP8Quantize2Blocks VP8EncQuantize2Blocks;
764 VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
765 VP8BlockCopy VP8Copy4x4;
766 VP8BlockCopy VP8Copy16x8;
767 
768 extern VP8CPUInfo VP8GetCPUInfo;
769 extern void VP8EncDspInitSSE2(void);
770 extern void VP8EncDspInitSSE41(void);
771 extern void VP8EncDspInitNEON(void);
772 extern void VP8EncDspInitMIPS32(void);
773 extern void VP8EncDspInitMIPSdspR2(void);
774 extern void VP8EncDspInitMSA(void);
775 
WEBP_DSP_INIT_FUNC(VP8EncDspInit)776 WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
777   VP8DspInit();  // common inverse transforms
778   InitTables();
779 
780   // default C implementations
781 #if !WEBP_NEON_OMIT_C_CODE
782   VP8ITransform = ITransform_C;
783   VP8FTransform = FTransform_C;
784   VP8FTransformWHT = FTransformWHT_C;
785   VP8TDisto4x4 = Disto4x4_C;
786   VP8TDisto16x16 = Disto16x16_C;
787   VP8CollectHistogram = CollectHistogram_C;
788   VP8SSE16x16 = SSE16x16_C;
789   VP8SSE16x8 = SSE16x8_C;
790   VP8SSE8x8 = SSE8x8_C;
791   VP8SSE4x4 = SSE4x4_C;
792 #endif
793 
794 #if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
795   VP8EncQuantizeBlock = QuantizeBlock_C;
796   VP8EncQuantize2Blocks = Quantize2Blocks_C;
797   VP8EncQuantizeBlockWHT = QuantizeBlock_C;
798 #endif
799 
800 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64 || BPS != 32
801   VP8EncPredLuma4 = Intra4Preds_C;
802 #endif
803 #if !WEBP_NEON_OMIT_C_CODE || !WEBP_AARCH64
804   VP8EncPredLuma16 = Intra16Preds_C;
805 #endif
806 
807   VP8FTransform2 = FTransform2_C;
808   VP8EncPredChroma8 = IntraChromaPreds_C;
809   VP8Mean16x4 = Mean16x4_C;
810   VP8Copy4x4 = Copy4x4_C;
811   VP8Copy16x8 = Copy16x8_C;
812 
813   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
814   if (VP8GetCPUInfo != NULL) {
815 #if defined(WEBP_HAVE_SSE2)
816     if (VP8GetCPUInfo(kSSE2)) {
817       VP8EncDspInitSSE2();
818 #if defined(WEBP_HAVE_SSE41)
819       if (VP8GetCPUInfo(kSSE4_1)) {
820         VP8EncDspInitSSE41();
821       }
822 #endif
823     }
824 #endif
825 #if defined(WEBP_USE_MIPS32)
826     if (VP8GetCPUInfo(kMIPS32)) {
827       VP8EncDspInitMIPS32();
828     }
829 #endif
830 #if defined(WEBP_USE_MIPS_DSP_R2)
831     if (VP8GetCPUInfo(kMIPSdspR2)) {
832       VP8EncDspInitMIPSdspR2();
833     }
834 #endif
835 #if defined(WEBP_USE_MSA)
836     if (VP8GetCPUInfo(kMSA)) {
837       VP8EncDspInitMSA();
838     }
839 #endif
840   }
841 
842 #if defined(WEBP_HAVE_NEON)
843   if (WEBP_NEON_OMIT_C_CODE ||
844       (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
845     VP8EncDspInitNEON();
846   }
847 #endif
848 
849   assert(VP8ITransform != NULL);
850   assert(VP8FTransform != NULL);
851   assert(VP8FTransformWHT != NULL);
852   assert(VP8TDisto4x4 != NULL);
853   assert(VP8TDisto16x16 != NULL);
854   assert(VP8CollectHistogram != NULL);
855   assert(VP8SSE16x16 != NULL);
856   assert(VP8SSE16x8 != NULL);
857   assert(VP8SSE8x8 != NULL);
858   assert(VP8SSE4x4 != NULL);
859   assert(VP8EncQuantizeBlock != NULL);
860   assert(VP8EncQuantize2Blocks != NULL);
861   assert(VP8FTransform2 != NULL);
862   assert(VP8EncPredLuma4 != NULL);
863   assert(VP8EncPredLuma16 != NULL);
864   assert(VP8EncPredChroma8 != NULL);
865   assert(VP8Mean16x4 != NULL);
866   assert(VP8EncQuantizeBlockWHT != NULL);
867   assert(VP8Copy4x4 != NULL);
868   assert(VP8Copy16x8 != NULL);
869 }
870