1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of dsp functions and loop filtering.
11 //
12 // Authors: Somnath Banerjee (somnath@google.com)
13 // Johann Koenig (johannkoenig@google.com)
14
15 #include "src/dsp/dsp.h"
16
17 #if defined(WEBP_USE_NEON)
18
19 #include "src/dsp/neon.h"
20 #include "src/dec/vp8i_dec.h"
21
22 //------------------------------------------------------------------------------
23 // NxM Loading functions
24
25 #if !defined(WORK_AROUND_GCC)
26
27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28 // (register alloc, probably). The variants somewhat mitigate the problem, but
29 // not quite. HFilter16i() remains problematic.
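// Loads 4 consecutive bytes from each of 8 rows; vld4_lane_u8() de-interleaves
// them so that out.val[k] gathers column 'k' of the 4x8 block (i.e. the load
// doubles as a transpose).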
static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31 int stride) {
32 const uint8x8_t zero = vdup_n_u8(0);
33 uint8x8x4_t out;
34 INIT_VECTOR4(out, zero, zero, zero, zero);
35 out = vld4_lane_u8(src + 0 * stride, out, 0);
36 out = vld4_lane_u8(src + 1 * stride, out, 1);
37 out = vld4_lane_u8(src + 2 * stride, out, 2);
38 out = vld4_lane_u8(src + 3 * stride, out, 3);
39 out = vld4_lane_u8(src + 4 * stride, out, 4);
40 out = vld4_lane_u8(src + 5 * stride, out, 5);
41 out = vld4_lane_u8(src + 6 * stride, out, 6);
42 out = vld4_lane_u8(src + 7 * stride, out, 7);
43 return out;
44 }
45
static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47 uint8x16_t* const p1,
48 uint8x16_t* const p0,
49 uint8x16_t* const q0,
50 uint8x16_t* const q1) {
51 // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52 // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53 const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54 const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55 *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56 *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57 *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58 *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59 }
60
61 #else // WORK_AROUND_GCC
62
63 #define LOADQ_LANE_32b(VALUE, LANE) do { \
64 (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
65 src += stride; \
66 } while (0)
67
static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69 uint8x16_t* const p1,
70 uint8x16_t* const p0,
71 uint8x16_t* const q0,
72 uint8x16_t* const q1) {
73 const uint32x4_t zero = vdupq_n_u32(0);
74 uint32x4x4_t in;
75 INIT_VECTOR4(in, zero, zero, zero, zero);
76 src -= 2;
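  // Each vld1q_lane_u32() below pulls 4 consecutive bytes (p1,p0,q0,q1) of one
  // row into a 32-bit lane: row 'r' goes to lane r/4 of in.val[r%4].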
77 LOADQ_LANE_32b(in.val[0], 0);
78 LOADQ_LANE_32b(in.val[1], 0);
79 LOADQ_LANE_32b(in.val[2], 0);
80 LOADQ_LANE_32b(in.val[3], 0);
81 LOADQ_LANE_32b(in.val[0], 1);
82 LOADQ_LANE_32b(in.val[1], 1);
83 LOADQ_LANE_32b(in.val[2], 1);
84 LOADQ_LANE_32b(in.val[3], 1);
85 LOADQ_LANE_32b(in.val[0], 2);
86 LOADQ_LANE_32b(in.val[1], 2);
87 LOADQ_LANE_32b(in.val[2], 2);
88 LOADQ_LANE_32b(in.val[3], 2);
89 LOADQ_LANE_32b(in.val[0], 3);
90 LOADQ_LANE_32b(in.val[1], 3);
91 LOADQ_LANE_32b(in.val[2], 3);
92 LOADQ_LANE_32b(in.val[3], 3);
93 // Transpose four 4x4 parts:
94 {
95 const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96 vreinterpretq_u8_u32(in.val[1]));
97 const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98 vreinterpretq_u8_u32(in.val[3]));
99 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100 vreinterpretq_u16_u8(row23.val[0]));
101 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102 vreinterpretq_u16_u8(row23.val[1]));
103 *p1 = vreinterpretq_u8_u16(row02.val[0]);
104 *p0 = vreinterpretq_u8_u16(row13.val[0]);
105 *q0 = vreinterpretq_u8_u16(row02.val[1]);
106 *q1 = vreinterpretq_u8_u16(row13.val[1]);
107 }
108 }
109 #undef LOADQ_LANE_32b
110
111 #endif // !WORK_AROUND_GCC
112
static WEBP_INLINE void Load8x16_NEON(
114 const uint8_t* const src, int stride,
115 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117 uint8x16_t* const q2, uint8x16_t* const q3) {
118 Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119 Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120 }
121
static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123 uint8x16_t* const p1,
124 uint8x16_t* const p0,
125 uint8x16_t* const q0,
126 uint8x16_t* const q1) {
127 *p1 = vld1q_u8(src - 2 * stride);
128 *p0 = vld1q_u8(src - 1 * stride);
129 *q0 = vld1q_u8(src + 0 * stride);
130 *q1 = vld1q_u8(src + 1 * stride);
131 }
132
static WEBP_INLINE void Load16x8_NEON(
134 const uint8_t* const src, int stride,
135 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137 uint8x16_t* const q2, uint8x16_t* const q3) {
138 Load16x4_NEON(src - 2 * stride, stride, p3, p2, p1, p0);
139 Load16x4_NEON(src + 2 * stride, stride, q0, q1, q2, q3);
140 }
141
static WEBP_INLINE void Load8x8x2_NEON(
143 const uint8_t* const u, const uint8_t* const v, int stride,
144 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146 uint8x16_t* const q2, uint8x16_t* const q3) {
147 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148 // and the v-samples in the upper half.
149 *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150 *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151 *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152 *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153 *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154 *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155 *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156 *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157 }
158
159 #if !defined(WORK_AROUND_GCC)
160
161 #define LOAD_UV_8(ROW) \
162 vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163
static WEBP_INLINE void Load8x8x2T_NEON(
165 const uint8_t* const u, const uint8_t* const v, int stride,
166 uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167 uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168 uint8x16_t* const q2, uint8x16_t* const q3) {
169 // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170 // and the v-samples in the upper half.
171 const uint8x16_t row0 = LOAD_UV_8(0);
172 const uint8x16_t row1 = LOAD_UV_8(1);
173 const uint8x16_t row2 = LOAD_UV_8(2);
174 const uint8x16_t row3 = LOAD_UV_8(3);
175 const uint8x16_t row4 = LOAD_UV_8(4);
176 const uint8x16_t row5 = LOAD_UV_8(5);
177 const uint8x16_t row6 = LOAD_UV_8(6);
178 const uint8x16_t row7 = LOAD_UV_8(7);
179 // Perform two side-by-side 8x8 transposes
180 // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181 // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182 // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183 // u30 u31 u32 u33 u34 u35 u36 u37 | ...
184 // u40 u41 u42 u43 u44 u45 u46 u47 | ...
185 // u50 u51 u52 u53 u54 u55 u56 u57 | ...
186 // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187 // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188 const uint8x16x2_t row01 = vtrnq_u8(row0, row1); // u00 u10 u02 u12 ...
189 // u01 u11 u03 u13 ...
190 const uint8x16x2_t row23 = vtrnq_u8(row2, row3); // u20 u30 u22 u32 ...
191 // u21 u31 u23 u33 ...
192 const uint8x16x2_t row45 = vtrnq_u8(row4, row5); // ...
193 const uint8x16x2_t row67 = vtrnq_u8(row6, row7); // ...
194 const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195 vreinterpretq_u16_u8(row23.val[0]));
196 const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197 vreinterpretq_u16_u8(row23.val[1]));
198 const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199 vreinterpretq_u16_u8(row67.val[0]));
200 const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201 vreinterpretq_u16_u8(row67.val[1]));
202 const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203 vreinterpretq_u32_u16(row46.val[0]));
204 const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205 vreinterpretq_u32_u16(row46.val[1]));
206 const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207 vreinterpretq_u32_u16(row57.val[0]));
208 const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209 vreinterpretq_u32_u16(row57.val[1]));
210 *p3 = vreinterpretq_u8_u32(row04.val[0]);
211 *p2 = vreinterpretq_u8_u32(row15.val[0]);
212 *p1 = vreinterpretq_u8_u32(row26.val[0]);
213 *p0 = vreinterpretq_u8_u32(row37.val[0]);
214 *q0 = vreinterpretq_u8_u32(row04.val[1]);
215 *q1 = vreinterpretq_u8_u32(row15.val[1]);
216 *q2 = vreinterpretq_u8_u32(row26.val[1]);
217 *q3 = vreinterpretq_u8_u32(row37.val[1]);
218 }
219 #undef LOAD_UV_8
220
221 #endif // !WORK_AROUND_GCC
222
static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224 uint8_t* const dst, int stride) {
225 vst2_lane_u8(dst + 0 * stride, v, 0);
226 vst2_lane_u8(dst + 1 * stride, v, 1);
227 vst2_lane_u8(dst + 2 * stride, v, 2);
228 vst2_lane_u8(dst + 3 * stride, v, 3);
229 vst2_lane_u8(dst + 4 * stride, v, 4);
230 vst2_lane_u8(dst + 5 * stride, v, 5);
231 vst2_lane_u8(dst + 6 * stride, v, 6);
232 vst2_lane_u8(dst + 7 * stride, v, 7);
233 }
234
static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236 uint8_t* const dst, int stride) {
237 uint8x8x2_t lo, hi;
238 lo.val[0] = vget_low_u8(p0);
239 lo.val[1] = vget_low_u8(q0);
240 hi.val[0] = vget_high_u8(p0);
241 hi.val[1] = vget_high_u8(q0);
242 Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243 Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244 }
245
246 #if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248 uint8_t* const dst, int stride) {
249 vst4_lane_u8(dst + 0 * stride, v, 0);
250 vst4_lane_u8(dst + 1 * stride, v, 1);
251 vst4_lane_u8(dst + 2 * stride, v, 2);
252 vst4_lane_u8(dst + 3 * stride, v, 3);
253 vst4_lane_u8(dst + 4 * stride, v, 4);
254 vst4_lane_u8(dst + 5 * stride, v, 5);
255 vst4_lane_u8(dst + 6 * stride, v, 6);
256 vst4_lane_u8(dst + 7 * stride, v, 7);
257 }
258
static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260 const uint8x16_t q0, const uint8x16_t q1,
261 uint8_t* const dst, int stride) {
262 uint8x8x4_t lo, hi;
263 INIT_VECTOR4(lo,
264 vget_low_u8(p1), vget_low_u8(p0),
265 vget_low_u8(q0), vget_low_u8(q1));
266 INIT_VECTOR4(hi,
267 vget_high_u8(p1), vget_high_u8(p0),
268 vget_high_u8(q0), vget_high_u8(q1));
269 Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270 Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271 }
272 #endif // !WORK_AROUND_GCC
273
static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275 uint8_t* const dst, int stride) {
276 vst1q_u8(dst - stride, p0);
277 vst1q_u8(dst, q0);
278 }
279
static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281 const uint8x16_t q0, const uint8x16_t q1,
282 uint8_t* const dst, int stride) {
283 Store16x2_NEON(p1, p0, dst - stride, stride);
284 Store16x2_NEON(q0, q1, dst + stride, stride);
285 }
286
static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288 const uint8x16_t q0,
289 uint8_t* const u, uint8_t* const v,
290 int stride) {
291 // p0 and q0 contain the u+v samples packed in low/high halves.
292 vst1_u8(u - stride, vget_low_u8(p0));
293 vst1_u8(u, vget_low_u8(q0));
294 vst1_u8(v - stride, vget_high_u8(p0));
295 vst1_u8(v, vget_high_u8(q0));
296 }
297
static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299 const uint8x16_t p0,
300 const uint8x16_t q0,
301 const uint8x16_t q1,
302 uint8_t* const u, uint8_t* const v,
303 int stride) {
304 // The p1...q1 registers contain the u+v samples packed in low/high halves.
305 Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306 Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307 }
308
309 #if !defined(WORK_AROUND_GCC)
310
311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
312 vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
313 vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
314 (DST) += stride; \
315 } while (0)
316
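// Stores the three pixels on each side of a vertical edge (p2..p0 at dst - 3,
// q0..q2 at dst + 0) for each of the 8 rows of both the u and v planes.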
static WEBP_INLINE void Store6x8x2_NEON(
318 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320 uint8_t* u, uint8_t* v, int stride) {
321 uint8x8x3_t u0, u1, v0, v1;
322 INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323 INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324 INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325 INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326 STORE6_LANE(u, u0, u1, 0);
327 STORE6_LANE(u, u0, u1, 1);
328 STORE6_LANE(u, u0, u1, 2);
329 STORE6_LANE(u, u0, u1, 3);
330 STORE6_LANE(u, u0, u1, 4);
331 STORE6_LANE(u, u0, u1, 5);
332 STORE6_LANE(u, u0, u1, 6);
333 STORE6_LANE(u, u0, u1, 7);
334 STORE6_LANE(v, v0, v1, 0);
335 STORE6_LANE(v, v0, v1, 1);
336 STORE6_LANE(v, v0, v1, 2);
337 STORE6_LANE(v, v0, v1, 3);
338 STORE6_LANE(v, v0, v1, 4);
339 STORE6_LANE(v, v0, v1, 5);
340 STORE6_LANE(v, v0, v1, 6);
341 STORE6_LANE(v, v0, v1, 7);
342 }
343 #undef STORE6_LANE
344
static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346 const uint8x16_t p0,
347 const uint8x16_t q0,
348 const uint8x16_t q1,
349 uint8_t* const u, uint8_t* const v,
350 int stride) {
351 uint8x8x4_t u0, v0;
352 INIT_VECTOR4(u0,
353 vget_low_u8(p1), vget_low_u8(p0),
354 vget_low_u8(q0), vget_low_u8(q1));
355 INIT_VECTOR4(v0,
356 vget_high_u8(p1), vget_high_u8(p0),
357 vget_high_u8(q0), vget_high_u8(q1));
358 vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359 vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360 vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361 vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362 vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363 vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364 vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365 vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366 vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367 vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368 vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369 vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370 vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371 vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372 vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373 vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374 }
375
376 #endif // !WORK_AROUND_GCC
377
378 // Zero extend 'v' to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380 return vreinterpretq_s16_u16(vmovl_u8(v));
381 }
382
383 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384 // to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386 const int16x8_t dst01,
387 const int16x8_t dst23) {
388 // Unsigned saturate to 8b.
389 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391
392 // Store the results.
393 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397 }
398
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400 const int16x8_t row23,
401 uint8_t* const dst) {
402 uint32x2_t dst01 = vdup_n_u32(0);
403 uint32x2_t dst23 = vdup_n_u32(0);
404
405 // Load the source pixels.
406 dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407 dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408 dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409 dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410
411 {
412 // Convert to 16b.
413 const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414 const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415
416 // Descale with rounding ((v + 4) >> 3) and accumulate into the pixels.
417 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419 // Saturate to 8 bits and store.
420 SaturateAndStore4x4_NEON(dst, out01, out23);
421 }
422 }
423
424 //-----------------------------------------------------------------------------
425 // Simple In-loop filtering (Paragraph 15.2)
426
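// Computes the simple-filter mask: an edge position is filtered when
// 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh.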
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
428 const uint8x16_t q0, const uint8x16_t q1,
429 int thresh) {
430 const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
431 const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0); // abs(p0-q0)
432 const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1); // abs(p1-q1)
433 const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0); // 2 * abs(p0-q0)
434 const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1); // abs(p1-q1) / 2
435 const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
436 const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
437 return mask;
438 }
439
static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441 const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442 return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443 }
444
static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446 const int8x16_t sign_bit = vdupq_n_s8(0x80);
447 return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448 }
449
static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451 const int8x16_t q0, const int8x16_t q1) {
452 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
453 const int8x16_t p1_q1 = vqsubq_s8(p1, q1); // (p1-q1)
454 const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0); // (p1-q1) + 1 * (q0 - p0)
455 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // (p1-q1) + 2 * (q0 - p0)
456 const int8x16_t s3 = vqaddq_s8(q0_p0, s2); // (p1-q1) + 3 * (q0 - p0)
457 return s3;
458 }
459
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
461 const int8x16_t q0_p0 = vqsubq_s8(q0, p0); // (q0-p0)
462 const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0); // 2 * (q0 - p0)
463 const int8x16_t s2 = vqaddq_s8(q0_p0, s1); // 3 * (q0 - p0)
464 return s2;
465 }
466
467 //------------------------------------------------------------------------------
468
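// Simple-filter update on the sign-flipped values, without converting back:
// p0 += (delta + 3) >> 3 and q0 -= (delta + 4) >> 3, with signed saturation.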
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
470 const int8x16_t delta,
471 int8x16_t* const op0,
472 int8x16_t* const oq0) {
473 const int8x16_t kCst3 = vdupq_n_s8(0x03);
474 const int8x16_t kCst4 = vdupq_n_s8(0x04);
475 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
476 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
477 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
478 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
479 *op0 = vqaddq_s8(p0s, delta3);
480 *oq0 = vqsubq_s8(q0s, delta4);
481 }
482
483 #if defined(WEBP_USE_INTRINSICS)
484
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
486 const int8x16_t delta,
487 uint8x16_t* const op0, uint8x16_t* const oq0) {
488 const int8x16_t kCst3 = vdupq_n_s8(0x03);
489 const int8x16_t kCst4 = vdupq_n_s8(0x04);
490 const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
491 const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
492 const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
493 const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
494 const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
495 const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
496 *op0 = FlipSignBack_NEON(sp0);
497 *oq0 = FlipSignBack_NEON(sq0);
498 }
499
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
501 const uint8x16_t q0, const uint8x16_t q1,
502 const uint8x16_t mask,
503 uint8x16_t* const op0, uint8x16_t* const oq0) {
504 const int8x16_t p1s = FlipSign_NEON(p1);
505 const int8x16_t p0s = FlipSign_NEON(p0);
506 const int8x16_t q0s = FlipSign_NEON(q0);
507 const int8x16_t q1s = FlipSign_NEON(q1);
508 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
509 const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
510 ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
511 }
512
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514 uint8x16_t p1, p0, q0, q1, op0, oq0;
515 Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516 {
517 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519 }
520 Store16x2_NEON(op0, oq0, p, stride);
521 }
522
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524 uint8x16_t p1, p0, q0, q1, oq0, op0;
525 Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526 {
527 const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528 DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529 }
530 Store2x16_NEON(op0, oq0, p, stride);
531 }
532
533 #else
534
535 // Load/Store vertical edge
536 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
537 "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
538 "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
539 "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
540 "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
541 "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
542 "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
543 "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
544 "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545
546 #define STORE8x2(c1, c2, p, stride) \
547 "vst2.8 {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n" \
548 "vst2.8 {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n" \
549 "vst2.8 {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n" \
550 "vst2.8 {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n" \
551 "vst2.8 {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n" \
552 "vst2.8 {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n" \
553 "vst2.8 {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n" \
554 "vst2.8 {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555
556 #define QRegs "q0", "q1", "q2", "q3", \
557 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558
559 #define FLIP_SIGN_BIT2(a, b, s) \
560 "veor " #a "," #a "," #s " \n" \
561 "veor " #b "," #b "," #s " \n" \
562
563 #define FLIP_SIGN_BIT4(a, b, c, d, s) \
564 FLIP_SIGN_BIT2(a, b, s) \
565 FLIP_SIGN_BIT2(c, d, s) \
566
567 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
568 "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
569 "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
570 "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
571 "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
572 "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
573 "vdup.8 q14, " #thresh " \n" \
574 "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */
575
576 #define GET_BASE_DELTA(p1, p0, q0, q1, o) \
577 "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
578 "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
579 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \
580 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \
581 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */
582
583 #define DO_SIMPLE_FILTER(p0, q0, fl) \
584 "vmov.i8 q15, #0x03 \n" \
585 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
586 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
587 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
588 \
589 "vmov.i8 q15, #0x04 \n" \
590 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \
591 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
592 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
593
594 // Applies filter on 2 pixels (p0 and q0)
595 #define DO_FILTER2(p1, p0, q0, q1, thresh) \
596 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
597 "vmov.i8 q10, #0x80 \n" /* sign bit */ \
598 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
599 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
600 "vand q9, q9, q11 \n" /* apply filter mask */ \
601 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
602 FLIP_SIGN_BIT2(p0, q0, q10)
603
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
605 __asm__ volatile (
606 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
607
608 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
609 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
610 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
611 "vld1.u8 {q12}, [%[p]] \n" // q1
612
613 DO_FILTER2(q1, q2, q3, q12, %[thresh])
614
615 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
616
617 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
618 "vst1.u8 {q3}, [%[p]] \n" // store oq0
619 : [p] "+r"(p)
620 : [stride] "r"(stride), [thresh] "r"(thresh)
621 : "memory", QRegs
622 );
623 }
624
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
626 __asm__ volatile (
627 "sub r4, %[p], #2 \n" // base1 = p - 2
628 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
629 "add r5, r4, %[stride] \n" // base2 = base1 + stride
630
631 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
632 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
633 "vswp d3, d24 \n" // p1:q1 p0:q3
634 "vswp d5, d26 \n" // q0:q2 q1:q4
635 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
636
637 DO_FILTER2(q1, q2, q12, q13, %[thresh])
638
639 "sub %[p], %[p], #1 \n" // p - 1
640
641 "vswp d5, d24 \n"
642 STORE8x2(d4, d5, [%[p]], %[stride])
643 STORE8x2(d24, d25, [%[p]], %[stride])
644
645 : [p] "+r"(p)
646 : [stride] "r"(stride), [thresh] "r"(thresh)
647 : "memory", "r4", "r5", "r6", QRegs
648 );
649 }
650
651 #undef LOAD8x4
652 #undef STORE8x2
653
654 #endif // WEBP_USE_INTRINSICS
655
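// Simple filter applied to the three inner edges (at offsets 4, 8 and 12 from
// the macroblock start).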
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
657 uint32_t k;
658 for (k = 3; k != 0; --k) {
659 p += 4 * stride;
660 SimpleVFilter16_NEON(p, stride, thresh);
661 }
662 }
663
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
665 uint32_t k;
666 for (k = 3; k != 0; --k) {
667 p += 4;
668 SimpleHFilter16_NEON(p, stride, thresh);
669 }
670 }
671
672 //------------------------------------------------------------------------------
673 // Complex In-loop filtering (Paragraph 15.3)
674
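// High edge-variance mask: true where abs(p1 - p0) or abs(q1 - q0) exceeds
// 'hev_thresh'.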
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
676 const uint8x16_t q0, const uint8x16_t q1,
677 int hev_thresh) {
678 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
679 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
680 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
681 const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
682 const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
683 return mask;
684 }
685
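// Complex-filter mask: every neighbouring difference on each side must be
// <= ithresh and the simple-filter condition (NeedsFilter_NEON) must hold.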
static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687 const uint8x16_t p1, const uint8x16_t p0,
688 const uint8x16_t q0, const uint8x16_t q1,
689 const uint8x16_t q2, const uint8x16_t q3,
690 int ithresh, int thresh) {
691 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
693 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
694 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
695 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
696 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
697 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
698 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701 const uint8x16_t max12 = vmaxq_u8(max1, max2);
702 const uint8x16_t max123 = vmaxq_u8(max12, max3);
703 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704 const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705 const uint8x16_t mask = vandq_u8(mask1, mask2);
706 return mask;
707 }
708
709 // 4-points filter
710
static void ApplyFilter4_NEON(
712 const int8x16_t p1, const int8x16_t p0,
713 const int8x16_t q0, const int8x16_t q1,
714 const int8x16_t delta0,
715 uint8x16_t* const op1, uint8x16_t* const op0,
716 uint8x16_t* const oq0, uint8x16_t* const oq1) {
717 const int8x16_t kCst3 = vdupq_n_s8(0x03);
718 const int8x16_t kCst4 = vdupq_n_s8(0x04);
719 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
720 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
721 const int8x16_t a1 = vshrq_n_s8(delta1, 3);
722 const int8x16_t a2 = vshrq_n_s8(delta2, 3);
723 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
724 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2)); // clip(p0 + a2)
725 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
726 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3)); // clip(p1 + a3)
727 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3)); // clip(q1 - a3)
728 }
729
static void DoFilter4_NEON(
731 const uint8x16_t p1, const uint8x16_t p0,
732 const uint8x16_t q0, const uint8x16_t q1,
733 const uint8x16_t mask, const uint8x16_t hev_mask,
734 uint8x16_t* const op1, uint8x16_t* const op0,
735 uint8x16_t* const oq0, uint8x16_t* const oq1) {
736 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
737 const int8x16_t p1s = FlipSign_NEON(p1);
738 int8x16_t p0s = FlipSign_NEON(p0);
739 int8x16_t q0s = FlipSign_NEON(q0);
740 const int8x16_t q1s = FlipSign_NEON(q1);
741 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
742
743 // do_filter2 part (simple loopfilter on pixels with hev)
744 {
745 const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
746 const int8x16_t simple_lf_delta =
747 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
748 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
749 }
750
751 // do_filter4 part (complex loopfilter on pixels without hev)
752 {
753 const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
754 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
755 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
756 const int8x16_t complex_lf_delta =
757 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
758 ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
759 }
760 }
761
762 // 6-points filter
763
static void ApplyFilter6_NEON(
765 const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
766 const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
767 const int8x16_t delta,
768 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
769 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
770 // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
771 // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
772 // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
773 // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
774 const int8x8_t delta_lo = vget_low_s8(delta);
775 const int8x8_t delta_hi = vget_high_s8(delta);
776 const int8x8_t kCst9 = vdup_n_s8(9);
777 const int16x8_t kCstm1 = vdupq_n_s16(-1);
778 const int8x8_t kCst18 = vdup_n_s8(18);
779 const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
780 const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
781 const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
782 const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
783 const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
784 const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
785 const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
786 const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
787 const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
788 const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
789 const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
790 const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
791 const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
792
793 *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1)); // clip(p0 + a1)
794 *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1)); // clip(q0 - a1)
795 *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2)); // clip(q1 - a2)
796 *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2)); // clip(p1 + a2)
797 *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3)); // clip(q2 - a3)
798 *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3)); // clip(p2 + a3)
799 }
800
static void DoFilter6_NEON(
802 const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
803 const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
804 const uint8x16_t mask, const uint8x16_t hev_mask,
805 uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
806 uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
807 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
808 const int8x16_t p2s = FlipSign_NEON(p2);
809 const int8x16_t p1s = FlipSign_NEON(p1);
810 int8x16_t p0s = FlipSign_NEON(p0);
811 int8x16_t q0s = FlipSign_NEON(q0);
812 const int8x16_t q1s = FlipSign_NEON(q1);
813 const int8x16_t q2s = FlipSign_NEON(q2);
814 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
815 const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
816
817 // do_filter2 part (simple loopfilter on pixels with hev)
818 {
819 const int8x16_t simple_lf_delta =
820 vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
821 ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
822 }
823
824 // do_filter6 part (complex loopfilter on pixels without hev)
825 {
826 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
827 const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
828 const int8x16_t complex_lf_delta =
829 vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
830 ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
831 op2, op1, op0, oq0, oq1, oq2);
832 }
833 }
834
835 // on macroblock edges
836
static void VFilter16_NEON(uint8_t* p, int stride,
838 int thresh, int ithresh, int hev_thresh) {
839 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840 Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841 {
842 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843 ithresh, thresh);
844 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847 &op2, &op1, &op0, &oq0, &oq1, &oq2);
848 Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849 Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850 Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851 }
852 }
853
static void HFilter16_NEON(uint8_t* p, int stride,
855 int thresh, int ithresh, int hev_thresh) {
856 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857 Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858 {
859 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860 ithresh, thresh);
861 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864 &op2, &op1, &op0, &oq0, &oq1, &oq2);
865 Store2x16_NEON(op2, op1, p - 2, stride);
866 Store2x16_NEON(op0, oq0, p + 0, stride);
867 Store2x16_NEON(oq1, oq2, p + 2, stride);
868 }
869 }
870
871 // on three inner edges
static void VFilter16i_NEON(uint8_t* p, int stride,
873 int thresh, int ithresh, int hev_thresh) {
874 uint32_t k;
875 uint8x16_t p3, p2, p1, p0;
876 Load16x4_NEON(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
877 for (k = 3; k != 0; --k) {
878 uint8x16_t q0, q1, q2, q3;
879 p += 4 * stride;
880 Load16x4_NEON(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
881 {
882 const uint8x16_t mask =
883 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885 // p3 and p2 are not just temporary variables here: they will be
886 // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
887 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888 Store16x4_NEON(p1, p0, p3, p2, p, stride);
889 p1 = q2;
890 p0 = q3;
891 }
892 }
893 }
894
895 #if !defined(WORK_AROUND_GCC)
static void HFilter16i_NEON(uint8_t* p, int stride,
897 int thresh, int ithresh, int hev_thresh) {
898 uint32_t k;
899 uint8x16_t p3, p2, p1, p0;
900 Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901 for (k = 3; k != 0; --k) {
902 uint8x16_t q0, q1, q2, q3;
903 p += 4;
904 Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905 {
906 const uint8x16_t mask =
907 NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910 Store4x16_NEON(p1, p0, p3, p2, p, stride);
911 p1 = q2;
912 p0 = q3;
913 }
914 }
915 }
916 #endif // !WORK_AROUND_GCC
917
918 // 8-pixels wide variant, for chroma filtering
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
920 int stride, int thresh, int ithresh, int hev_thresh) {
921 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
922 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
923 {
924 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
925 ithresh, thresh);
926 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
927 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
928 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
929 &op2, &op1, &op0, &oq0, &oq1, &oq2);
930 Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
931 Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
932 Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
933 }
934 }
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
936 int stride,
937 int thresh, int ithresh, int hev_thresh) {
938 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
939 u += 4 * stride;
940 v += 4 * stride;
941 Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
942 {
943 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
944 ithresh, thresh);
945 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
946 uint8x16_t op1, op0, oq0, oq1;
947 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
948 Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
949 }
950 }
951
952 #if !defined(WORK_AROUND_GCC)
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
954 int stride, int thresh, int ithresh, int hev_thresh) {
955 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
956 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
957 {
958 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
959 ithresh, thresh);
960 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
961 uint8x16_t op2, op1, op0, oq0, oq1, oq2;
962 DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
963 &op2, &op1, &op0, &oq0, &oq1, &oq2);
964 Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
965 }
966 }
967
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
969 int stride,
970 int thresh, int ithresh, int hev_thresh) {
971 uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
972 u += 4;
973 v += 4;
974 Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
975 {
976 const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
977 ithresh, thresh);
978 const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
979 uint8x16_t op1, op0, oq0, oq1;
980 DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
981 Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
982 }
983 }
984 #endif // !WORK_AROUND_GCC
985
986 //-----------------------------------------------------------------------------
987 // Inverse transforms (Paragraph 14.4)
988
989 // Technically these are unsigned but vqdmulh is only available in signed.
990 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
991 // changing the >> 16 to >> 15 and requiring an additional >> 1.
992 // We use this to our advantage with kC2. The canonical value is 35468.
993 // However, the high bit is set so treating it as signed will give incorrect
994 // results. We avoid this by down shifting by 1 here to clear the highest bit.
995 // Combined with the doubling effect of vqdmulh we get >> 16.
996 // This cannot be applied to kC1 because the lowest bit is set. Down shifting
997 // the constant would reduce precision.
998
999 // libwebp uses a trick to avoid some extra addition that libvpx does.
1000 // Instead of:
1001 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
1002 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
1003 // same issue with kC1 and vqdmulh that we work around for kC2 by down shifting.
1004
1005 static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
1006 static const int16_t kC2 =
1007 WEBP_TRANSFORM_AC3_C2 / 2; // half of kC2, actually. See comment above.
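// With vqdmulh(x, k) == (2 * x * k) >> 16, the NEON code below computes:
//   vqdmulh(x, kC2)            == (x * WEBP_TRANSFORM_AC3_C2) >> 16
//   x + (vqdmulh(x, kC1) >> 1) == x + ((x * WEBP_TRANSFORM_AC3_C1) >> 16)
// (see TransformPass_NEON() and the asm version of TransformOne_NEON()).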
1008
1009 #if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1011 const int16x8_t in1,
1012 int16x8x2_t* const out) {
1013 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
1014 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
1015 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
1016 // b0 d0 b1 d1 b2 d2 ...
1017 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1018 }
1019
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1021 // {rows} = in0 | in4
1022 // in8 | in12
1023 // B1 = in4 | in12
1024 const int16x8_t B1 =
1025 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1026 // C0 = kC1 * in4 | kC1 * in12
1027 // C1 = kC2 * in4 | kC2 * in12
1028 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1029 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1030 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1031 vget_low_s16(rows->val[1])); // in0 + in8
1032 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1033 vget_low_s16(rows->val[1])); // in0 - in8
1034 // c = kC2 * in4 - kC1 * in12
1035 // d = kC1 * in4 + kC2 * in12
1036 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1037 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1038 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
1039 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
1040 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
1041 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
1042 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1043 Transpose8x2_NEON(E0, E1, rows);
1044 }
1045
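// TransformOne_NEON() applies TransformPass_NEON() twice; the transpose at the
// end of each pass makes the second call operate along the other dimension.
// Add4x4_NEON() then performs the final (v + 4) >> 3 descale and adds the
// result to the destination pixels.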
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1047 uint8_t* WEBP_RESTRICT dst) {
1048 int16x8x2_t rows;
1049 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
1050 TransformPass_NEON(&rows);
1051 TransformPass_NEON(&rows);
1052 Add4x4_NEON(rows.val[0], rows.val[1], dst);
1053 }
1054
1055 #else
1056
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
1058 uint8_t* WEBP_RESTRICT dst) {
1059 const int kBPS = BPS;
1060 // kC1, kC2. Padded because vld1.16 loads 8 bytes
1061 const int16_t constants[4] = { kC1, kC2, 0, 0 };
1062 /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1063 __asm__ volatile (
1064 "vld1.16 {q1, q2}, [%[in]] \n"
1065 "vld1.16 {d0}, [%[constants]] \n"
1066
1067 /* d2: in[0]
1068 * d3: in[8]
1069 * d4: in[4]
1070 * d5: in[12]
1071 */
1072 "vswp d3, d4 \n"
1073
1074 /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1075 * q9 = {in[4], in[12]} * kC2 >> 16
1076 */
1077 "vqdmulh.s16 q8, q2, d0[0] \n"
1078 "vqdmulh.s16 q9, q2, d0[1] \n"
1079
1080 /* d22 = a = in[0] + in[8]
1081 * d23 = b = in[0] - in[8]
1082 */
1083 "vqadd.s16 d22, d2, d3 \n"
1084 "vqsub.s16 d23, d2, d3 \n"
1085
1086 /* The multiplication should be x * kC1 >> 16
1087 * However, with vqdmulh we get x * kC1 * 2 >> 16
1088 * (multiply, double, return high half)
1089 * We avoided this in kC2 by pre-shifting the constant.
1090 * q8 = in[4]/[12] * kC1 >> 16
1091 */
1092 "vshr.s16 q8, q8, #1 \n"
1093
1094 /* Add {in[4], in[12]} back after the multiplication. This is handled by
1095 * adding 1 << 16 to kC1 in the libwebp C code.
1096 */
1097 "vqadd.s16 q8, q2, q8 \n"
1098
1099 /* d20 = c = in[4]*kC2 - in[12]*kC1
1100 * d21 = d = in[4]*kC1 + in[12]*kC2
1101 */
1102 "vqsub.s16 d20, d18, d17 \n"
1103 "vqadd.s16 d21, d19, d16 \n"
1104
1105 /* d2 = tmp[0] = a + d
1106 * d3 = tmp[1] = b + c
1107 * d4 = tmp[2] = b - c
1108 * d5 = tmp[3] = a - d
1109 */
1110 "vqadd.s16 d2, d22, d21 \n"
1111 "vqadd.s16 d3, d23, d20 \n"
1112 "vqsub.s16 d4, d23, d20 \n"
1113 "vqsub.s16 d5, d22, d21 \n"
1114
1115 "vzip.16 q1, q2 \n"
1116 "vzip.16 q1, q2 \n"
1117
1118 "vswp d3, d4 \n"
1119
1120 /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1121 * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1122 */
1123 "vqdmulh.s16 q8, q2, d0[0] \n"
1124 "vqdmulh.s16 q9, q2, d0[1] \n"
1125
1126 /* d22 = a = tmp[0] + tmp[8]
1127 * d23 = b = tmp[0] - tmp[8]
1128 */
1129 "vqadd.s16 d22, d2, d3 \n"
1130 "vqsub.s16 d23, d2, d3 \n"
1131
1132 /* See long winded explanations prior */
1133 "vshr.s16 q8, q8, #1 \n"
1134 "vqadd.s16 q8, q2, q8 \n"
1135
1136 /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
1137 * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
1138 */
1139 "vqsub.s16 d20, d18, d17 \n"
1140 "vqadd.s16 d21, d19, d16 \n"
1141
1142 /* d2 = tmp[0] = a + d
1143 * d3 = tmp[1] = b + c
1144 * d4 = tmp[2] = b - c
1145 * d5 = tmp[3] = a - d
1146 */
1147 "vqadd.s16 d2, d22, d21 \n"
1148 "vqadd.s16 d3, d23, d20 \n"
1149 "vqsub.s16 d4, d23, d20 \n"
1150 "vqsub.s16 d5, d22, d21 \n"
1151
1152 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1153 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1154 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1155 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1156
1157 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1158
1159 /* (val) + 4 >> 3 */
1160 "vrshr.s16 d2, d2, #3 \n"
1161 "vrshr.s16 d3, d3, #3 \n"
1162 "vrshr.s16 d4, d4, #3 \n"
1163 "vrshr.s16 d5, d5, #3 \n"
1164
1165 "vzip.16 q1, q2 \n"
1166 "vzip.16 q1, q2 \n"
1167
1168 /* Must accumulate before saturating */
1169 "vmovl.u8 q8, d6 \n"
1170 "vmovl.u8 q9, d7 \n"
1171
1172 "vqadd.s16 q1, q1, q8 \n"
1173 "vqadd.s16 q2, q2, q9 \n"
1174
1175 "vqmovun.s16 d0, q1 \n"
1176 "vqmovun.s16 d1, q2 \n"
1177
1178 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1179 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1180 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1181 "vst1.32 d1[1], [%[dst]] \n"
1182
1183 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1184 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1185 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1186 );
1187 }
1188
1189 #endif // WEBP_USE_INTRINSICS
1190
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
1192 uint8_t* WEBP_RESTRICT dst, int do_two) {
1193 TransformOne_NEON(in, dst);
1194 if (do_two) {
1195 TransformOne_NEON(in + 16, dst + 4);
1196 }
1197 }
1198
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
1200 uint8_t* WEBP_RESTRICT dst) {
1201 const int16x8_t DC = vdupq_n_s16(in[0]);
1202 Add4x4_NEON(DC, DC, dst);
1203 }
1204
1205 //------------------------------------------------------------------------------
1206
1207 #define STORE_WHT(dst, col, rows) do { \
1208 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1209 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1210 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1211 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1212 } while (0)
1213
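// Inverse Walsh-Hadamard transform of the 16 DC coefficients. STORE_WHT writes
// each output with a stride of 16 coefficients, i.e. into the DC slot of each
// of the 16 per-block coefficient arrays.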
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
1215 int16_t* WEBP_RESTRICT out) {
1216 int32x4x4_t tmp;
1217
1218 {
1219 // Load the source.
1220 const int16x4_t in00_03 = vld1_s16(in + 0);
1221 const int16x4_t in04_07 = vld1_s16(in + 4);
1222 const int16x4_t in08_11 = vld1_s16(in + 8);
1223 const int16x4_t in12_15 = vld1_s16(in + 12);
1224 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1225 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1226 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1227 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1228 tmp.val[0] = vaddq_s32(a0, a1);
1229 tmp.val[1] = vaddq_s32(a3, a2);
1230 tmp.val[2] = vsubq_s32(a0, a1);
1231 tmp.val[3] = vsubq_s32(a3, a2);
1232 // Arrange the temporary results column-wise.
1233 tmp = Transpose4x4_NEON(tmp);
1234 }
1235
1236 {
1237 const int32x4_t kCst3 = vdupq_n_s32(3);
1238 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
1239 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1240 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1241 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1242 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1243
1244 tmp.val[0] = vaddq_s32(a0, a1);
1245 tmp.val[1] = vaddq_s32(a3, a2);
1246 tmp.val[2] = vsubq_s32(a0, a1);
1247 tmp.val[3] = vsubq_s32(a3, a2);
1248
1249 // right shift the results by 3.
1250 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1251 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1252 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1253 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1254
1255 STORE_WHT(out, 0, tmp);
1256 STORE_WHT(out, 1, tmp);
1257 STORE_WHT(out, 2, tmp);
1258 STORE_WHT(out, 3, tmp);
1259 }
1260 }
1261
1262 #undef STORE_WHT
1263
1264 //------------------------------------------------------------------------------
1265
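// Transform with only DC, in[1] and in[4] non-zero. 'CD' holds the horizontal
// contribution of in[1] as {MUL1(in[1]), MUL2(in[1]), -MUL2(in[1]), -MUL1(in[1])},
// while 'c4'/'d4' carry the vertical contribution of in[4].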
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
1267 uint8_t* WEBP_RESTRICT dst) {
1268 const int16x4_t A = vld1_dup_s16(in);
1269 const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
1270 const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
1271 const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
1272 const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
1273 const uint64_t cd = (uint64_t)( d1 & 0xffff) << 0 |
1274 (uint64_t)( c1 & 0xffff) << 16 |
1275 (uint64_t)(-c1 & 0xffff) << 32 |
1276 (uint64_t)(-d1 & 0xffff) << 48;
1277 const int16x4_t CD = vcreate_s16(cd);
1278 const int16x4_t B = vqadd_s16(A, CD);
1279 const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
1280 const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
1281 Add4x4_NEON(m0_m1, m2_m3, dst);
1282 }
1283
1284 //------------------------------------------------------------------------------
1285 // 4x4
1286
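// DC prediction for 4x4 blocks: average of the 4 top and 4 left neighbours
// with rounding, i.e. (sum + 4) >> 3 (the vrshrn_n_u16 below), replicated
// over the whole block.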
static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
  const uint16x8_t s0 = vaddl_u8(L0, L1);
  const uint16x8_t s1 = vaddl_u8(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
  const uint8x8_t dc = vdup_lane_u8(dc0, 0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
  }
}

// TrueMotion (4x4 + 8x8)
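// pred(x, y) = clip(L[y] + T[x] - TL), with T the row above, L the column to
// the left and TL the corner pixel.  The widened difference d = T[x] - TL is
// computed once; each row then needs just one widening add and a saturating
// narrow.  Scalar sketch (illustration only):
//   for (y = 0; y < size; ++y)
//     for (x = 0; x < size; ++x)
//       dst[x + y * BPS] =
//           clamp_to_u8(dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS]);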
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x8_t T = vld1_u8(dst - BPS);           // top row 'A[0..3]'
  const uint16x8_t d = vsubl_u8(T, TL);             // A[c] - A[-1]
  int y;
  for (y = 0; y < size; y += 4) {
    // left edge
    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
    // L[r] + A[c] - A[-1]
    const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
    const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
    const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
    const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
    // Saturate and store the result.
    const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
    const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
    const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
    const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
    if (size == 4) {
      vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
      vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
    } else {
      vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
      vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
      vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
      vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
    }
    dst += 4 * BPS;
  }
}

static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }

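// Vertical 4x4 prediction with the usual 3-tap smoothing of the row above:
// out[i] = (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2.  vhadd_u8 yields
// (top[i - 1] + top[i + 1]) >> 1 and vrhadd_u8 then takes the rounded average
// with top[i]; the pair is equivalent to the 3-tap filter above.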
static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
  int i;
  for (i = 0; i < 4; ++i) {
    vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
  }
}

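// Down-right 4x4 prediction: the left column (I..L), the corner X and the top
// row (A..D) are packed into a single 8-byte vector LKJIXABC, the same
// (a + 2 * b + c + 2) >> 2 filter as in VE4 is applied to shifted copies, and
// the four output rows are successive 4-byte windows of the filtered vector.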
static void RD4_NEON(uint8_t* dst) {   // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const uint64x1_t LKJI____ =
      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

static void LD4_NEON(uint8_t* dst) {   // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
  const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
  const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
  const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top = vld1_u8(dst - BPS);
  int j;
  for (j = 0; j < 8; ++j) {
    vst1_u8(dst + j * BPS, top);
  }
}

static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const uint8x8_t left = vld1_dup_u8(dst - 1);
    vst1_u8(dst, left);
    dst += BPS;
  }
}

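// DC prediction for the 8x8 chroma block.  Depending on which neighbours are
// available, the sum of 16 (top + left) or 8 (single edge) samples is
// averaged with rounding (vrshrn_n_u16 by 4 or 3); with no neighbours the
// predictor falls back to the flat value 0x80.  On AArch64 the across-vector
// vaddlv_u8 replaces the pairwise-add cascade.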
static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
#if WEBP_AARCH64
    const uint16_t p2 = vaddlv_u8(A);
    sum_top = vdupq_n_u16(p2);
#else
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
#endif
  }

  if (do_left) {
    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
    const uint16x8_t s0 = vaddl_u8(L0, L1);
    const uint16x8_t s1 = vaddl_u8(L2, L3);
    const uint16x8_t s2 = vaddl_u8(L4, L5);
    const uint16x8_t s3 = vaddl_u8(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
    }
  }
}

static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }

static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }

//------------------------------------------------------------------------------
// 16x16

static void VE16_NEON(uint8_t* dst) {    // vertical
  const uint8x16_t top = vld1q_u8(dst - BPS);
  int j;
  for (j = 0; j < 16; ++j) {
    vst1q_u8(dst + j * BPS, top);
  }
}

static void HE16_NEON(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 16; ++j) {
    const uint8x16_t left = vld1q_dup_u8(dst - 1);
    vst1q_u8(dst, left);
    dst += BPS;
  }
}

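// Same scheme for the 16x16 luma DC predictor: up to 32 neighbouring samples
// are summed, then averaged with rounding (shift by 5 with both edges, by 4
// with a single edge), with 0x80 as the fallback.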
static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
#if WEBP_AARCH64
    const uint16_t p3 = vaddlvq_u8(A);
    sum_top = vdupq_n_u16(p3);
#else
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
#endif
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
      const uint16x8_t s0 = vaddl_u8(L0, L1);
      const uint16x8_t s1 = vaddl_u8(L2, L3);
      const uint16x8_t s2 = vaddl_u8(L4, L5);
      const uint16x8_t s3 = vaddl_u8(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);
      sum_left = vaddq_u16(sum_left, sum);
    }
  }

  if (do_top && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_top) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * BPS, dc);
    }
  }
}

static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }

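// 16x16 TrueMotion: same formula as TrueMotion_NEON above, but the 16-pixel
// rows no longer fit one widened register, so the T[x] - TL differences are
// kept as two uint16x8_t halves (d_lo / d_hi) and recombined after the
// saturating narrow.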
static void TM16_NEON(uint8_t* dst) {
  const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
  const uint8x16_t T = vld1q_u8(dst - BPS);         // top row 'A[0..15]'
  // A[c] - A[-1]
  const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
  const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
  int y;
  for (y = 0; y < 16; y += 4) {
    // left edge
    const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
    const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
    const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
    const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
    // L[r] + A[c] - A[-1]
    const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
    const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
    const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
    const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
    const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
    const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
    const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
    const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
    // Saturate and store the result.
    const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
    const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
    const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
    const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
    vst1q_u8(dst + 0 * BPS, row0);
    vst1q_u8(dst + 1 * BPS, row1);
    vst1q_u8(dst + 2 * BPS, row2);
    vst1q_u8(dst + 3 * BPS, row3);
    dst += 4 * BPS;
  }
}

//------------------------------------------------------------------------------
// Entry point

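// Plugs the NEON implementations into the function pointers from dsp.h.
// Only a subset of the 4x4 predictors is provided here (DC4/TM4/VE4/RD4/LD4);
// the remaining VP8PredLuma4 slots are presumably left pointing at whatever
// the generic initialization installed (typically the C versions).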
extern void VP8DspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
  VP8Transform = TransformTwo_NEON;
  VP8TransformAC3 = TransformAC3_NEON;
  VP8TransformDC = TransformDC_NEON;
  VP8TransformWHT = TransformWHT_NEON;

  VP8VFilter16 = VFilter16_NEON;
  VP8VFilter16i = VFilter16i_NEON;
  VP8HFilter16 = HFilter16_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i_NEON;
#endif
  VP8VFilter8 = VFilter8_NEON;
  VP8VFilter8i = VFilter8i_NEON;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8_NEON;
  VP8HFilter8i = HFilter8i_NEON;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16_NEON;
  VP8SimpleHFilter16 = SimpleHFilter16_NEON;
  VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
  VP8SimpleHFilter16i = SimpleHFilter16i_NEON;

  VP8PredLuma4[0] = DC4_NEON;
  VP8PredLuma4[1] = TM4_NEON;
  VP8PredLuma4[2] = VE4_NEON;
  VP8PredLuma4[4] = RD4_NEON;
  VP8PredLuma4[6] = LD4_NEON;

  VP8PredLuma16[0] = DC16TopLeft_NEON;
  VP8PredLuma16[1] = TM16_NEON;
  VP8PredLuma16[2] = VE16_NEON;
  VP8PredLuma16[3] = HE16_NEON;
  VP8PredLuma16[4] = DC16NoTop_NEON;
  VP8PredLuma16[5] = DC16NoLeft_NEON;
  VP8PredLuma16[6] = DC16NoTopLeft_NEON;

  VP8PredChroma8[0] = DC8uv_NEON;
  VP8PredChroma8[1] = TM8uv_NEON;
  VP8PredChroma8[2] = VE8uv_NEON;
  VP8PredChroma8[3] = HE8uv_NEON;
  VP8PredChroma8[4] = DC8uvNoTop_NEON;
  VP8PredChroma8[5] = DC8uvNoLeft_NEON;
  VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8DspInitNEON)

#endif  // WEBP_USE_NEON