• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // NEON variant of alpha filters
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include "src/dsp/dsp.h"
15 
16 #if defined(WEBP_USE_NEON)
17 
18 #include <assert.h>
19 #include "src/dsp/neon.h"
20 
21 //------------------------------------------------------------------------------
22 // Helpful macros.
23 
// Debug-only sanity checks shared by the Do*Filter_NEON() helpers.
// 'in' and 'out' are the source and destination pointers to validate. The
// macro additionally reads 'width', 'height', 'stride', 'row' and 'num_rows'
// from the scope in which it is expanded, so those names must be visible there.
// The arguments are parenthesized so that any pointer expression may be passed.
#define DCHECK(in, out)                                                        \
  do {                                                                         \
    assert((in) != NULL);                                                      \
    assert((out) != NULL);                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
    (void)height;  /* Silence unused warning. */                               \
  } while (0)
34 
// Widen eight unsigned bytes (uint8x8_t) to eight 16-bit lanes, reinterpreted
// as signed (int16x8_t).
#define U8_TO_S16(v) vreinterpretq_s16_u16(vmovl_u8(v))
// Same as above, with the eight bytes read from memory at 'ptr'.
#define LOAD_U8_TO_S16(ptr) U8_TO_S16(vld1_u8(ptr))
38 
// Shift a 16-byte vector left or right by 'n' bytes, inserting zeros.
// Both macros reference a variable named 'zero' (expected to be an all-zero
// uint8x16_t) that must exist in the scope where they are expanded.
// NOTE: SHIFT_LEFT_N_Q is meant for n in [1, 15]; with n == 0 the extraction
// index becomes 0 and the result is 'zero' rather than 'v'.
#define SHIFT_RIGHT_N_Q(v, n) vextq_u8((v), zero, (n))
#define SHIFT_LEFT_N_Q(v, n) vextq_u8(zero, (v), (16 - (n)) % 16)
42 
// Rotate the eight bytes of a uint8x8_t left by 'n' bytes.
#define ROTATE_LEFT_N(v, n)   vext_u8((v), (v), (n))
// Rotate the eight bytes of a uint8x8_t right by 'n' bytes
// (expressed as a left rotation by (8 - n) modulo 8).
#define ROTATE_RIGHT_N(v, n)   vext_u8((v), (v), (8 - (n)) % 8)
47 
PredictLine_NEON(const uint8_t * src,const uint8_t * pred,uint8_t * dst,int length)48 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
49                              uint8_t* dst, int length) {
50   int i;
51   assert(length >= 0);
52   for (i = 0; i + 16 <= length; i += 16) {
53     const uint8x16_t A = vld1q_u8(&src[i]);
54     const uint8x16_t B = vld1q_u8(&pred[i]);
55     const uint8x16_t C = vsubq_u8(A, B);
56     vst1q_u8(&dst[i], C);
57   }
58   for (; i < length; ++i) dst[i] = src[i] - pred[i];
59 }
60 
// Left-based prediction: every sample is predicted by the byte stored
// immediately before it in 'src', so src[-1] must be readable.
static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
  const uint8_t* const left = src - 1;
  PredictLine_NEON(src, left, dst, length);
}
65 
66 //------------------------------------------------------------------------------
67 // Horizontal filter.
68 
DoHorizontalFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)69 static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
70                                                 int width, int height,
71                                                 int stride,
72                                                 int row, int num_rows,
73                                                 uint8_t* out) {
74   const size_t start_offset = row * stride;
75   const int last_row = row + num_rows;
76   DCHECK(in, out);
77   in += start_offset;
78   out += start_offset;
79 
80   if (row == 0) {
81     // Leftmost pixel is the same as input for topmost scanline.
82     out[0] = in[0];
83     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
84     row = 1;
85     in += stride;
86     out += stride;
87   }
88 
89   // Filter line-by-line.
90   while (row < last_row) {
91     // Leftmost pixel is predicted from above.
92     out[0] = in[0] - in[-stride];
93     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
94     ++row;
95     in += stride;
96     out += stride;
97   }
98 }
99 
// Horizontal filter over the whole plane: rows 0 to height - 1 of 'data' are
// filtered into 'filtered_data'.
static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
                                  int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoHorizontalFilter_NEON(data, width, height, stride, first_row, row_count,
                          filtered_data);
}
105 
106 //------------------------------------------------------------------------------
107 // Vertical filter.
108 
DoVerticalFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)109 static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
110                                               int width, int height, int stride,
111                                               int row, int num_rows,
112                                               uint8_t* out) {
113   const size_t start_offset = row * stride;
114   const int last_row = row + num_rows;
115   DCHECK(in, out);
116   in += start_offset;
117   out += start_offset;
118 
119   if (row == 0) {
120     // Very first top-left pixel is copied.
121     out[0] = in[0];
122     // Rest of top scan-line is left-predicted.
123     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
124     row = 1;
125     in += stride;
126     out += stride;
127   }
128 
129   // Filter line-by-line.
130   while (row < last_row) {
131     PredictLine_NEON(in, in - stride, out, width);
132     ++row;
133     in += stride;
134     out += stride;
135   }
136 }
137 
// Vertical filter over the whole plane: rows 0 to height - 1 of 'data' are
// filtered into 'filtered_data'.
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
                                int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoVerticalFilter_NEON(data, width, height, stride, first_row, row_count,
                        filtered_data);
}
143 
144 //------------------------------------------------------------------------------
145 // Gradient filter.
146 
// Scalar gradient predictor: returns a + b - c clamped to the range [0, 255].
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
  const int gradient = a + b - c;
  if (gradient < 0) return 0;
  if (gradient > 255) return 255;
  return gradient;
}
151 
GradientPredictDirect_NEON(const uint8_t * const row,const uint8_t * const top,uint8_t * const out,int length)152 static void GradientPredictDirect_NEON(const uint8_t* const row,
153                                        const uint8_t* const top,
154                                        uint8_t* const out, int length) {
155   int i;
156   for (i = 0; i + 8 <= length; i += 8) {
157     const uint8x8_t A = vld1_u8(&row[i - 1]);
158     const uint8x8_t B = vld1_u8(&top[i + 0]);
159     const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
160     const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);
161     const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));
162     const uint8x8_t F = vld1_u8(&row[i + 0]);
163     vst1_u8(&out[i], vsub_u8(F, E));
164   }
165   for (; i < length; ++i) {
166     out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
167   }
168 }
169 
DoGradientFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)170 static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
171                                               int width, int height,
172                                               int stride,
173                                               int row, int num_rows,
174                                               uint8_t* out) {
175   const size_t start_offset = row * stride;
176   const int last_row = row + num_rows;
177   DCHECK(in, out);
178   in += start_offset;
179   out += start_offset;
180 
181   // left prediction for top scan-line
182   if (row == 0) {
183     out[0] = in[0];
184     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
185     row = 1;
186     in += stride;
187     out += stride;
188   }
189 
190   // Filter line-by-line.
191   while (row < last_row) {
192     out[0] = in[0] - in[-stride];
193     GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
194     ++row;
195     in += stride;
196     out += stride;
197   }
198 }
199 
// Gradient filter over the whole plane: rows 0 to height - 1 of 'data' are
// filtered into 'filtered_data'.
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
                                int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoGradientFilter_NEON(data, width, height, stride, first_row, row_count,
                        filtered_data);
}
205 
206 #undef DCHECK
207 
208 //------------------------------------------------------------------------------
209 // Inverse transforms
210 
HorizontalUnfilter_NEON(const uint8_t * prev,const uint8_t * in,uint8_t * out,int width)211 static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
212                                     uint8_t* out, int width) {
213   int i;
214   const uint8x16_t zero = vdupq_n_u8(0);
215   uint8x16_t last;
216   out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
217   if (width <= 1) return;
218   last = vsetq_lane_u8(out[0], zero, 0);
219   for (i = 1; i + 16 <= width; i += 16) {
220     const uint8x16_t A0 = vld1q_u8(&in[i]);
221     const uint8x16_t A1 = vaddq_u8(A0, last);
222     const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);
223     const uint8x16_t A3 = vaddq_u8(A1, A2);
224     const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
225     const uint8x16_t A5 = vaddq_u8(A3, A4);
226     const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
227     const uint8x16_t A7 = vaddq_u8(A5, A6);
228     const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
229     const uint8x16_t A9 = vaddq_u8(A7, A8);
230     vst1q_u8(&out[i], A9);
231     last = SHIFT_RIGHT_N_Q(A9, 15);
232   }
233   for (; i < width; ++i) out[i] = in[i] + out[i - 1];
234 }
235 
VerticalUnfilter_NEON(const uint8_t * prev,const uint8_t * in,uint8_t * out,int width)236 static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
237                                   uint8_t* out, int width) {
238   if (prev == NULL) {
239     HorizontalUnfilter_NEON(NULL, in, out, width);
240   } else {
241     int i;
242     assert(width >= 0);
243     for (i = 0; i + 16 <= width; i += 16) {
244       const uint8x16_t A = vld1q_u8(&in[i]);
245       const uint8x16_t B = vld1q_u8(&prev[i]);
246       const uint8x16_t C = vaddq_u8(A, B);
247       vst1q_u8(&out[i], C);
248     }
249     for (; i < width; ++i) out[i] = in[i] + prev[i];
250   }
251 }
252 
// GradientUnfilter_NEON produces correct output but is slower than the
// C-version, at least on ARM64; on armv7 there is no clear winner.
// It is therefore disabled by default and only kept around as an idea.
// Define USE_GRADIENT_UNFILTER to 1 before this point to compile it in.
#ifndef USE_GRADIENT_UNFILTER
#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
#endif
259 
260 #if (USE_GRADIENT_UNFILTER == 1)
261 #define GRAD_PROCESS_LANE(L)  do {                                             \
262   const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
263   const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
264   const uint8x8_t delta = vqmovun_s16(tmp2);                                   \
265   pred = vadd_u8(D, delta);                                                    \
266   out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1);                             \
267 } while (0)
268 
GradientPredictInverse_NEON(const uint8_t * const in,const uint8_t * const top,uint8_t * const row,int length)269 static void GradientPredictInverse_NEON(const uint8_t* const in,
270                                         const uint8_t* const top,
271                                         uint8_t* const row, int length) {
272   if (length > 0) {
273     int i;
274     uint8x8_t pred = vdup_n_u8(row[-1]);   // left sample
275     uint8x8_t out = vdup_n_u8(0);
276     for (i = 0; i + 8 <= length; i += 8) {
277       const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
278       const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
279       const int16x8_t BC = vsubq_s16(B, C);  // unclipped gradient basis B - C
280       const uint8x8_t D = vld1_u8(&in[i]);   // base input
281       GRAD_PROCESS_LANE(0);
282       GRAD_PROCESS_LANE(1);
283       GRAD_PROCESS_LANE(2);
284       GRAD_PROCESS_LANE(3);
285       GRAD_PROCESS_LANE(4);
286       GRAD_PROCESS_LANE(5);
287       GRAD_PROCESS_LANE(6);
288       GRAD_PROCESS_LANE(7);
289       vst1_u8(&row[i], out);
290     }
291     for (; i < length; ++i) {
292       row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
293     }
294   }
295 }
296 #undef GRAD_PROCESS_LANE
297 
// Inverse of the gradient filter for one row. With a previous row, the first
// sample is predicted from above and the rest goes through the gradient
// predictor; without one (prev == NULL) the horizontal unfilter is used.
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
                                  uint8_t* out, int width) {
  if (prev != NULL) {
    out[0] = in[0] + prev[0];  // predict from above
    GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
    return;
  }
  HorizontalUnfilter_NEON(NULL, in, out, width);
}
307 
308 #endif   // USE_GRADIENT_UNFILTER
309 
310 //------------------------------------------------------------------------------
311 // Entry point
312 
313 extern void VP8FiltersInitNEON(void);
314 
VP8FiltersInitNEON(void)315 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
316   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
317   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
318 #if (USE_GRADIENT_UNFILTER == 1)
319   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
320 #endif
321 
322   WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
323   WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
324   WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
325 }
326 
327 #else  // !WEBP_USE_NEON
328 
// Built without WEBP_USE_NEON: presumably expands to an empty
// VP8FiltersInitNEON() so the symbol still exists -- TODO confirm against the
// WEBP_DSP_INIT_STUB definition, which is not visible in this file.
WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
330 
331 #endif  // WEBP_USE_NEON
332