1 // Copyright 2017 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // NEON variant of alpha filters
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13
14 #include "src/dsp/dsp.h"
15
16 #if defined(WEBP_USE_NEON)
17
18 #include <assert.h>
19 #include "src/dsp/neon.h"
20
21 //------------------------------------------------------------------------------
22 // Helpful macros.
23
// Debug-only sanity checks shared by the Do*Filter_NEON() functions.
// 'in' and 'out' are the pointers to validate. The macro additionally reads
// 'width', 'height', 'stride', 'row' and 'num_rows' from the enclosing scope,
// so it can only be expanded where all of those names are visible.
// The macro arguments are parenthesized so that any pointer-valued expression
// (e.g. a conditional expression) can be passed safely.
#define DCHECK(in, out)                                                        \
  do {                                                                         \
    assert((in) != NULL);                                                      \
    assert((out) != NULL);                                                     \
    assert(width > 0);                                                         \
    assert(height > 0);                                                        \
    assert(stride >= width);                                                   \
    assert(row >= 0 && num_rows > 0 && row + num_rows <= height);              \
    (void)height;  /* Silence unused warning. */                               \
  } while (0)
34
// Widens the eight u8 lanes of A to 16 bits (vmovl_u8 is an unsigned, i.e.
// zero-extending, widen) and reinterprets the result as eight s16 lanes.
#define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
// Loads eight consecutive u8 values from pointer A and widens them to s16.
#define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A))
38
// Shift a 16-byte vector left or right by N bytes, inserting zeros.
// Both macros reference a variable named 'zero' that must be in scope at the
// point of expansion; it supplies the bytes that are shifted in.
// "Right" moves lane N of A down to lane 0 (higher lanes are filled from
// 'zero'); "left" moves lane 0 of A up to lane N.
#define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N))
// NOTE: N must be non-zero here. The '% 16' only keeps the extraction index
// in range; for N == 0 the index becomes 0 and the result is 'zero', not A.
#define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16)
42
// Rotate the 8-byte vector A left by N bytes: vext_u8 extracts eight bytes
// from the concatenation A:A starting at byte N, so lane N becomes lane 0.
#define ROTATE_LEFT_N(A, N) vext_u8((A), (A), (N))
// Rotate the 8-byte vector A right by N bytes (lane 0 becomes lane N).
// The '% 8' keeps the extraction index within [0, 7] when N is 0.
#define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8)
47
PredictLine_NEON(const uint8_t * src,const uint8_t * pred,uint8_t * dst,int length)48 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
49 uint8_t* dst, int length) {
50 int i;
51 assert(length >= 0);
52 for (i = 0; i + 16 <= length; i += 16) {
53 const uint8x16_t A = vld1q_u8(&src[i]);
54 const uint8x16_t B = vld1q_u8(&pred[i]);
55 const uint8x16_t C = vsubq_u8(A, B);
56 vst1q_u8(&dst[i], C);
57 }
58 for (; i < length; ++i) dst[i] = src[i] - pred[i];
59 }
60
// Left-based prediction: every byte of 'src' is predicted from the byte that
// immediately precedes it in memory, so src[-1] must be readable.
static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
  const uint8_t* const left = src - 1;
  PredictLine_NEON(src, left, dst, length);
}
65
66 //------------------------------------------------------------------------------
67 // Horizontal filter.
68
DoHorizontalFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)69 static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
70 int width, int height,
71 int stride,
72 int row, int num_rows,
73 uint8_t* out) {
74 const size_t start_offset = row * stride;
75 const int last_row = row + num_rows;
76 DCHECK(in, out);
77 in += start_offset;
78 out += start_offset;
79
80 if (row == 0) {
81 // Leftmost pixel is the same as input for topmost scanline.
82 out[0] = in[0];
83 PredictLineLeft_NEON(in + 1, out + 1, width - 1);
84 row = 1;
85 in += stride;
86 out += stride;
87 }
88
89 // Filter line-by-line.
90 while (row < last_row) {
91 // Leftmost pixel is predicted from above.
92 out[0] = in[0] - in[-stride];
93 PredictLineLeft_NEON(in + 1, out + 1, width - 1);
94 ++row;
95 in += stride;
96 out += stride;
97 }
98 }
99
// Horizontal-filters the whole plane: all 'height' rows, starting at row 0.
static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
                                  int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoHorizontalFilter_NEON(data, width, height, stride, first_row, row_count,
                          filtered_data);
}
105
106 //------------------------------------------------------------------------------
107 // Vertical filter.
108
DoVerticalFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)109 static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
110 int width, int height, int stride,
111 int row, int num_rows,
112 uint8_t* out) {
113 const size_t start_offset = row * stride;
114 const int last_row = row + num_rows;
115 DCHECK(in, out);
116 in += start_offset;
117 out += start_offset;
118
119 if (row == 0) {
120 // Very first top-left pixel is copied.
121 out[0] = in[0];
122 // Rest of top scan-line is left-predicted.
123 PredictLineLeft_NEON(in + 1, out + 1, width - 1);
124 row = 1;
125 in += stride;
126 out += stride;
127 }
128
129 // Filter line-by-line.
130 while (row < last_row) {
131 PredictLine_NEON(in, in - stride, out, width);
132 ++row;
133 in += stride;
134 out += stride;
135 }
136 }
137
// Vertical-filters the whole plane: all 'height' rows, starting at row 0.
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
                                int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoVerticalFilter_NEON(data, width, height, stride, first_row, row_count,
                        filtered_data);
}
143
144 //------------------------------------------------------------------------------
145 // Gradient filter.
146
// Scalar gradient predictor: returns a + b - c clamped to the range [0, 255].
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
  const int gradient = (int)a + (int)b - (int)c;
  if (gradient < 0) return 0;
  if (gradient > 255) return 255;
  return gradient;
}
151
GradientPredictDirect_NEON(const uint8_t * const row,const uint8_t * const top,uint8_t * const out,int length)152 static void GradientPredictDirect_NEON(const uint8_t* const row,
153 const uint8_t* const top,
154 uint8_t* const out, int length) {
155 int i;
156 for (i = 0; i + 8 <= length; i += 8) {
157 const uint8x8_t A = vld1_u8(&row[i - 1]);
158 const uint8x8_t B = vld1_u8(&top[i + 0]);
159 const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
160 const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);
161 const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));
162 const uint8x8_t F = vld1_u8(&row[i + 0]);
163 vst1_u8(&out[i], vsub_u8(F, E));
164 }
165 for (; i < length; ++i) {
166 out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
167 }
168 }
169
DoGradientFilter_NEON(const uint8_t * in,int width,int height,int stride,int row,int num_rows,uint8_t * out)170 static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
171 int width, int height,
172 int stride,
173 int row, int num_rows,
174 uint8_t* out) {
175 const size_t start_offset = row * stride;
176 const int last_row = row + num_rows;
177 DCHECK(in, out);
178 in += start_offset;
179 out += start_offset;
180
181 // left prediction for top scan-line
182 if (row == 0) {
183 out[0] = in[0];
184 PredictLineLeft_NEON(in + 1, out + 1, width - 1);
185 row = 1;
186 in += stride;
187 out += stride;
188 }
189
190 // Filter line-by-line.
191 while (row < last_row) {
192 out[0] = in[0] - in[-stride];
193 GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
194 ++row;
195 in += stride;
196 out += stride;
197 }
198 }
199
// Gradient-filters the whole plane: all 'height' rows, starting at row 0.
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
                                int stride, uint8_t* filtered_data) {
  const int first_row = 0;
  const int row_count = height;
  DoGradientFilter_NEON(data, width, height, stride, first_row, row_count,
                        filtered_data);
}
205
206 #undef DCHECK
207
208 //------------------------------------------------------------------------------
209 // Inverse transforms
210
HorizontalUnfilter_NEON(const uint8_t * prev,const uint8_t * in,uint8_t * out,int width)211 static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
212 uint8_t* out, int width) {
213 int i;
214 const uint8x16_t zero = vdupq_n_u8(0);
215 uint8x16_t last;
216 out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
217 if (width <= 1) return;
218 last = vsetq_lane_u8(out[0], zero, 0);
219 for (i = 1; i + 16 <= width; i += 16) {
220 const uint8x16_t A0 = vld1q_u8(&in[i]);
221 const uint8x16_t A1 = vaddq_u8(A0, last);
222 const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);
223 const uint8x16_t A3 = vaddq_u8(A1, A2);
224 const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
225 const uint8x16_t A5 = vaddq_u8(A3, A4);
226 const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
227 const uint8x16_t A7 = vaddq_u8(A5, A6);
228 const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
229 const uint8x16_t A9 = vaddq_u8(A7, A8);
230 vst1q_u8(&out[i], A9);
231 last = SHIFT_RIGHT_N_Q(A9, 15);
232 }
233 for (; i < width; ++i) out[i] = in[i] + out[i - 1];
234 }
235
VerticalUnfilter_NEON(const uint8_t * prev,const uint8_t * in,uint8_t * out,int width)236 static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
237 uint8_t* out, int width) {
238 if (prev == NULL) {
239 HorizontalUnfilter_NEON(NULL, in, out, width);
240 } else {
241 int i;
242 assert(width >= 0);
243 for (i = 0; i + 16 <= width; i += 16) {
244 const uint8x16_t A = vld1q_u8(&in[i]);
245 const uint8x16_t B = vld1q_u8(&prev[i]);
246 const uint8x16_t C = vaddq_u8(A, B);
247 vst1q_u8(&out[i], C);
248 }
249 for (; i < width; ++i) out[i] = in[i] + prev[i];
250 }
251 }
252
// GradientUnfilter_NEON is correct but slower than the C-version,
// at least on ARM64. For armv7, it's a wash.
// So best is to disable it for now, but keep the idea around...
// The value below is only a default: if USE_GRADIENT_UNFILTER is already
// defined when this point is reached, that definition is kept. The NEON
// gradient unfilter is compiled and registered only when the value is 1.
#if !defined(USE_GRADIENT_UNFILTER)
#define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
#endif
259
260 #if (USE_GRADIENT_UNFILTER == 1)
// Performs one step of the inverse gradient prediction, for lane L.
// The macro reads and updates variables of the enclosing scope by name:
//   pred : uint8x8_t, read and overwritten with D + clipped predictor
//   BC   : int16x8_t, added to the rotated predictor before clipping
//   D    : uint8x8_t, the input deltas
//   out  : uint8x8_t, shifted by one lane with lane L of 'pred' appended
// It must therefore only be expanded where all four names are visible.
#define GRAD_PROCESS_LANE(L)  do {                                             \
  const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
  const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
  const uint8x8_t delta = vqmovun_s16(tmp2);  /* saturate to [0, 255] */       \
  pred = vadd_u8(D, delta);                                                    \
  out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1);                             \
} while (0)
268
GradientPredictInverse_NEON(const uint8_t * const in,const uint8_t * const top,uint8_t * const row,int length)269 static void GradientPredictInverse_NEON(const uint8_t* const in,
270 const uint8_t* const top,
271 uint8_t* const row, int length) {
272 if (length > 0) {
273 int i;
274 uint8x8_t pred = vdup_n_u8(row[-1]); // left sample
275 uint8x8_t out = vdup_n_u8(0);
276 for (i = 0; i + 8 <= length; i += 8) {
277 const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
278 const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
279 const int16x8_t BC = vsubq_s16(B, C); // unclipped gradient basis B - C
280 const uint8x8_t D = vld1_u8(&in[i]); // base input
281 GRAD_PROCESS_LANE(0);
282 GRAD_PROCESS_LANE(1);
283 GRAD_PROCESS_LANE(2);
284 GRAD_PROCESS_LANE(3);
285 GRAD_PROCESS_LANE(4);
286 GRAD_PROCESS_LANE(5);
287 GRAD_PROCESS_LANE(6);
288 GRAD_PROCESS_LANE(7);
289 vst1_u8(&row[i], out);
290 }
291 for (; i < length; ++i) {
292 row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
293 }
294 }
295 }
296 #undef GRAD_PROCESS_LANE
297
// Inverse of the gradient filter for one scan-line. Without a previous row
// ('prev' is NULL) the line is reconstructed with the horizontal unfilter.
// Otherwise the first pixel is predicted from above and the remaining
// width - 1 pixels use the gradient predictor.
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
                                  uint8_t* out, int width) {
  if (prev == NULL) {
    HorizontalUnfilter_NEON(NULL, in, out, width);
    return;
  }
  out[0] = in[0] + prev[0];   // predict from above
  GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
}
307
308 #endif // USE_GRADIENT_UNFILTER
309
310 //------------------------------------------------------------------------------
311 // Entry point
312
313 extern void VP8FiltersInitNEON(void);
314
VP8FiltersInitNEON(void)315 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
316 WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
317 WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
318 #if (USE_GRADIENT_UNFILTER == 1)
319 WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
320 #endif
321
322 WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
323 WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
324 WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
325 }
326
327 #else // !WEBP_USE_NEON
328
// WEBP_USE_NEON is not defined: VP8FiltersInitNEON is provided through
// WEBP_DSP_INIT_STUB instead of the implementation above.
// NOTE(review): presumably this expands to an empty definition of the
// function so that references to it still link -- confirm against the macro's
// definition (expected in src/dsp/dsp.h).
WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
330
331 #endif // WEBP_USE_NEON
332