1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of dsp functions and loop filtering.
11 //
12 // Authors: Somnath Banerjee (somnath@google.com)
13 //          Johann Koenig (johannkoenig@google.com)
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_NEON)
18 
19 #include "src/dsp/neon.h"
20 #include "src/dec/vp8i_dec.h"
21 
22 //------------------------------------------------------------------------------
23 // NxM Loading functions
24 
25 #if !defined(WORK_AROUND_GCC)
26 
27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
28 // (register alloc, probably). The variants somewhat mitigate the problem, but
29 // not quite. HFilter16i() remains problematic.
Load4x8_NEON(const uint8_t * const src,int stride)30 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
31                                             int stride) {
32   const uint8x8_t zero = vdup_n_u8(0);
33   uint8x8x4_t out;
34   INIT_VECTOR4(out, zero, zero, zero, zero);
35   out = vld4_lane_u8(src + 0 * stride, out, 0);
36   out = vld4_lane_u8(src + 1 * stride, out, 1);
37   out = vld4_lane_u8(src + 2 * stride, out, 2);
38   out = vld4_lane_u8(src + 3 * stride, out, 3);
39   out = vld4_lane_u8(src + 4 * stride, out, 4);
40   out = vld4_lane_u8(src + 5 * stride, out, 5);
41   out = vld4_lane_u8(src + 6 * stride, out, 6);
42   out = vld4_lane_u8(src + 7 * stride, out, 7);
43   return out;
44 }
45 
Load4x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)46 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
47                                       uint8x16_t* const p1,
48                                       uint8x16_t* const p0,
49                                       uint8x16_t* const q0,
50                                       uint8x16_t* const q1) {
51   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
52   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
53   const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
54   const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
55   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
56   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
57   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
58   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
59 }
60 
61 #else  // WORK_AROUND_GCC
62 
63 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
64   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
65   src += stride;                                                     \
66 } while (0)
67 
Load4x16_NEON(const uint8_t * src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)68 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
69                                       uint8x16_t* const p1,
70                                       uint8x16_t* const p0,
71                                       uint8x16_t* const q0,
72                                       uint8x16_t* const q1) {
73   const uint32x4_t zero = vdupq_n_u32(0);
74   uint32x4x4_t in;
75   INIT_VECTOR4(in, zero, zero, zero, zero);
76   src -= 2;
77   LOADQ_LANE_32b(in.val[0], 0);
78   LOADQ_LANE_32b(in.val[1], 0);
79   LOADQ_LANE_32b(in.val[2], 0);
80   LOADQ_LANE_32b(in.val[3], 0);
81   LOADQ_LANE_32b(in.val[0], 1);
82   LOADQ_LANE_32b(in.val[1], 1);
83   LOADQ_LANE_32b(in.val[2], 1);
84   LOADQ_LANE_32b(in.val[3], 1);
85   LOADQ_LANE_32b(in.val[0], 2);
86   LOADQ_LANE_32b(in.val[1], 2);
87   LOADQ_LANE_32b(in.val[2], 2);
88   LOADQ_LANE_32b(in.val[3], 2);
89   LOADQ_LANE_32b(in.val[0], 3);
90   LOADQ_LANE_32b(in.val[1], 3);
91   LOADQ_LANE_32b(in.val[2], 3);
92   LOADQ_LANE_32b(in.val[3], 3);
93   // Transpose four 4x4 parts:
94   {
95     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
96                                         vreinterpretq_u8_u32(in.val[1]));
97     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
98                                         vreinterpretq_u8_u32(in.val[3]));
99     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
100                                          vreinterpretq_u16_u8(row23.val[0]));
101     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
102                                          vreinterpretq_u16_u8(row23.val[1]));
103     *p1 = vreinterpretq_u8_u16(row02.val[0]);
104     *p0 = vreinterpretq_u8_u16(row13.val[0]);
105     *q0 = vreinterpretq_u8_u16(row02.val[1]);
106     *q1 = vreinterpretq_u8_u16(row13.val[1]);
107   }
108 }
109 #undef LOADQ_LANE_32b
110 
111 #endif  // !WORK_AROUND_GCC
112 
Load8x16_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)113 static WEBP_INLINE void Load8x16_NEON(
114     const uint8_t* const src, int stride,
115     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
116     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
117     uint8x16_t* const q2, uint8x16_t* const q3) {
118   Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
119   Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
120 }
121 
Load16x4_NEON(const uint8_t * const src,int stride,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1)122 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
123                                       uint8x16_t* const p1,
124                                       uint8x16_t* const p0,
125                                       uint8x16_t* const q0,
126                                       uint8x16_t* const q1) {
127   *p1 = vld1q_u8(src - 2 * stride);
128   *p0 = vld1q_u8(src - 1 * stride);
129   *q0 = vld1q_u8(src + 0 * stride);
130   *q1 = vld1q_u8(src + 1 * stride);
131 }
132 
Load16x8_NEON(const uint8_t * const src,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)133 static WEBP_INLINE void Load16x8_NEON(
134     const uint8_t* const src, int stride,
135     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
136     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
137     uint8x16_t* const q2, uint8x16_t* const q3) {
138   Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
139   Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
140 }
141 
Load8x8x2_NEON(const uint8_t * const u,const uint8_t * const v,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)142 static WEBP_INLINE void Load8x8x2_NEON(
143     const uint8_t* const u, const uint8_t* const v, int stride,
144     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
145     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
146     uint8x16_t* const q2, uint8x16_t* const q3) {
147   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
148   // and the v-samples on the higher half.
149   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
150   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
151   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
152   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
153   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
154   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
155   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
156   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
157 }
158 
159 #if !defined(WORK_AROUND_GCC)
160 
161 #define LOAD_UV_8(ROW) \
162   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
163 
Load8x8x2T_NEON(const uint8_t * const u,const uint8_t * const v,int stride,uint8x16_t * const p3,uint8x16_t * const p2,uint8x16_t * const p1,uint8x16_t * const p0,uint8x16_t * const q0,uint8x16_t * const q1,uint8x16_t * const q2,uint8x16_t * const q3)164 static WEBP_INLINE void Load8x8x2T_NEON(
165     const uint8_t* const u, const uint8_t* const v, int stride,
166     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
167     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
168     uint8x16_t* const q2, uint8x16_t* const q3) {
169   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
170   // and the v-samples on the higher half.
171   const uint8x16_t row0 = LOAD_UV_8(0);
172   const uint8x16_t row1 = LOAD_UV_8(1);
173   const uint8x16_t row2 = LOAD_UV_8(2);
174   const uint8x16_t row3 = LOAD_UV_8(3);
175   const uint8x16_t row4 = LOAD_UV_8(4);
176   const uint8x16_t row5 = LOAD_UV_8(5);
177   const uint8x16_t row6 = LOAD_UV_8(6);
178   const uint8x16_t row7 = LOAD_UV_8(7);
179   // Perform two side-by-side 8x8 transposes
180   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
181   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
182   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
183   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
184   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
185   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
186   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
187   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
188   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
189                                                     // u01 u11 u03 u13 ...
190   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
191                                                     // u21 u31 u23 u33 ...
192   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
193   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
194   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
195                                        vreinterpretq_u16_u8(row23.val[0]));
196   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
197                                        vreinterpretq_u16_u8(row23.val[1]));
198   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
199                                        vreinterpretq_u16_u8(row67.val[0]));
200   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
201                                        vreinterpretq_u16_u8(row67.val[1]));
202   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
203                                        vreinterpretq_u32_u16(row46.val[0]));
204   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
205                                        vreinterpretq_u32_u16(row46.val[1]));
206   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
207                                        vreinterpretq_u32_u16(row57.val[0]));
208   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
209                                        vreinterpretq_u32_u16(row57.val[1]));
210   *p3 = vreinterpretq_u8_u32(row04.val[0]);
211   *p2 = vreinterpretq_u8_u32(row15.val[0]);
212   *p1 = vreinterpretq_u8_u32(row26.val[0]);
213   *p0 = vreinterpretq_u8_u32(row37.val[0]);
214   *q0 = vreinterpretq_u8_u32(row04.val[1]);
215   *q1 = vreinterpretq_u8_u32(row15.val[1]);
216   *q2 = vreinterpretq_u8_u32(row26.val[1]);
217   *q3 = vreinterpretq_u8_u32(row37.val[1]);
218 }
219 #undef LOAD_UV_8
220 
221 #endif  // !WORK_AROUND_GCC
222 
Store2x8_NEON(const uint8x8x2_t v,uint8_t * const dst,int stride)223 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
224                                       uint8_t* const dst, int stride) {
225   vst2_lane_u8(dst + 0 * stride, v, 0);
226   vst2_lane_u8(dst + 1 * stride, v, 1);
227   vst2_lane_u8(dst + 2 * stride, v, 2);
228   vst2_lane_u8(dst + 3 * stride, v, 3);
229   vst2_lane_u8(dst + 4 * stride, v, 4);
230   vst2_lane_u8(dst + 5 * stride, v, 5);
231   vst2_lane_u8(dst + 6 * stride, v, 6);
232   vst2_lane_u8(dst + 7 * stride, v, 7);
233 }
234 
Store2x16_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)235 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
236                                        uint8_t* const dst, int stride) {
237   uint8x8x2_t lo, hi;
238   lo.val[0] = vget_low_u8(p0);
239   lo.val[1] = vget_low_u8(q0);
240   hi.val[0] = vget_high_u8(p0);
241   hi.val[1] = vget_high_u8(q0);
242   Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
243   Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
244 }
245 
246 #if !defined(WORK_AROUND_GCC)
Store4x8_NEON(const uint8x8x4_t v,uint8_t * const dst,int stride)247 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
248                                       uint8_t* const dst, int stride) {
249   vst4_lane_u8(dst + 0 * stride, v, 0);
250   vst4_lane_u8(dst + 1 * stride, v, 1);
251   vst4_lane_u8(dst + 2 * stride, v, 2);
252   vst4_lane_u8(dst + 3 * stride, v, 3);
253   vst4_lane_u8(dst + 4 * stride, v, 4);
254   vst4_lane_u8(dst + 5 * stride, v, 5);
255   vst4_lane_u8(dst + 6 * stride, v, 6);
256   vst4_lane_u8(dst + 7 * stride, v, 7);
257 }
258 
Store4x16_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)259 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
260                                        const uint8x16_t q0, const uint8x16_t q1,
261                                        uint8_t* const dst, int stride) {
262   uint8x8x4_t lo, hi;
263   INIT_VECTOR4(lo,
264                vget_low_u8(p1), vget_low_u8(p0),
265                vget_low_u8(q0), vget_low_u8(q1));
266   INIT_VECTOR4(hi,
267                vget_high_u8(p1), vget_high_u8(p0),
268                vget_high_u8(q0), vget_high_u8(q1));
269   Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
270   Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
271 }
272 #endif  // !WORK_AROUND_GCC
273 
Store16x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const dst,int stride)274 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
275                                        uint8_t* const dst, int stride) {
276   vst1q_u8(dst - stride, p0);
277   vst1q_u8(dst, q0);
278 }
279 
Store16x4_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const dst,int stride)280 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
281                                        const uint8x16_t q0, const uint8x16_t q1,
282                                        uint8_t* const dst, int stride) {
283   Store16x2_NEON(p1, p0, dst - stride, stride);
284   Store16x2_NEON(q0, q1, dst + stride, stride);
285 }
286 
Store8x2x2_NEON(const uint8x16_t p0,const uint8x16_t q0,uint8_t * const u,uint8_t * const v,int stride)287 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
288                                         const uint8x16_t q0,
289                                         uint8_t* const u, uint8_t* const v,
290                                         int stride) {
291   // p0 and q0 contain the u+v samples packed in low/high halves.
292   vst1_u8(u - stride, vget_low_u8(p0));
293   vst1_u8(u,          vget_low_u8(q0));
294   vst1_u8(v - stride, vget_high_u8(p0));
295   vst1_u8(v,          vget_high_u8(q0));
296 }
297 
Store8x4x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)298 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
299                                         const uint8x16_t p0,
300                                         const uint8x16_t q0,
301                                         const uint8x16_t q1,
302                                         uint8_t* const u, uint8_t* const v,
303                                         int stride) {
304   // The p1...q1 registers contain the u+v samples packed in low/high halves.
305   Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
306   Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
307 }
308 
309 #if !defined(WORK_AROUND_GCC)
310 
311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
312   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
313   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
314   (DST) += stride;                                \
315 } while (0)
316 
Store6x8x2_NEON(const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,uint8_t * u,uint8_t * v,int stride)317 static WEBP_INLINE void Store6x8x2_NEON(
318     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
319     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
320     uint8_t* u, uint8_t* v, int stride) {
321   uint8x8x3_t u0, u1, v0, v1;
322   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
323   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
324   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
325   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
326   STORE6_LANE(u, u0, u1, 0);
327   STORE6_LANE(u, u0, u1, 1);
328   STORE6_LANE(u, u0, u1, 2);
329   STORE6_LANE(u, u0, u1, 3);
330   STORE6_LANE(u, u0, u1, 4);
331   STORE6_LANE(u, u0, u1, 5);
332   STORE6_LANE(u, u0, u1, 6);
333   STORE6_LANE(u, u0, u1, 7);
334   STORE6_LANE(v, v0, v1, 0);
335   STORE6_LANE(v, v0, v1, 1);
336   STORE6_LANE(v, v0, v1, 2);
337   STORE6_LANE(v, v0, v1, 3);
338   STORE6_LANE(v, v0, v1, 4);
339   STORE6_LANE(v, v0, v1, 5);
340   STORE6_LANE(v, v0, v1, 6);
341   STORE6_LANE(v, v0, v1, 7);
342 }
343 #undef STORE6_LANE
344 
Store4x8x2_NEON(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,uint8_t * const u,uint8_t * const v,int stride)345 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
346                                         const uint8x16_t p0,
347                                         const uint8x16_t q0,
348                                         const uint8x16_t q1,
349                                         uint8_t* const u, uint8_t* const v,
350                                         int stride) {
351   uint8x8x4_t u0, v0;
352   INIT_VECTOR4(u0,
353                vget_low_u8(p1), vget_low_u8(p0),
354                vget_low_u8(q0), vget_low_u8(q1));
355   INIT_VECTOR4(v0,
356                vget_high_u8(p1), vget_high_u8(p0),
357                vget_high_u8(q0), vget_high_u8(q1));
358   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
359   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
360   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
361   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
362   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
363   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
364   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
365   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
366   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
367   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
368   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
369   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
370   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
371   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
372   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
373   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
374 }
375 
376 #endif  // !WORK_AROUND_GCC
377 
378 // Zero extend 'v' to an int16x8_t.
ConvertU8ToS16_NEON(uint8x8_t v)379 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
380   return vreinterpretq_s16_u16(vmovl_u8(v));
381 }
382 
383 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
384 // to the corresponding rows of 'dst'.
SaturateAndStore4x4_NEON(uint8_t * const dst,const int16x8_t dst01,const int16x8_t dst23)385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
386                                                  const int16x8_t dst01,
387                                                  const int16x8_t dst23) {
388   // Unsigned saturate to 8b.
389   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
390   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
391 
392   // Store the results.
393   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
394   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
395   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
396   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
397 }
398 
Add4x4_NEON(const int16x8_t row01,const int16x8_t row23,uint8_t * const dst)399 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
400                                     const int16x8_t row23,
401                                     uint8_t* const dst) {
402   uint32x2_t dst01 = vdup_n_u32(0);
403   uint32x2_t dst23 = vdup_n_u32(0);
404 
405   // Load the source pixels.
406   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
407   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
408   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
409   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
410 
411   {
412     // Convert to 16b.
413     const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
414     const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
415 
416     // Descale with rounding.
417     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
418     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
419     // Add the inverse transform.
420     SaturateAndStore4x4_NEON(dst, out01, out23);
421   }
422 }
423 
424 //-----------------------------------------------------------------------------
425 // Simple In-loop filtering (Paragraph 15.2)
426 
// Returns a per-lane mask, all ones where
//   2 * |p0 - q0| + |p1 - q1| / 2 <= thresh
// and zero elsewhere. The sum is computed with saturating 8-bit adds, and
// 'thresh' is truncated to 8 bits before the comparison.
static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   int thresh) {
  const uint8x16_t limit = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t inner_diff = vabdq_u8(p0, q0);                 // |p0 - q0|
  const uint8x16_t outer_diff = vabdq_u8(p1, q1);                 // |p1 - q1|
  const uint8x16_t inner_x2 = vqaddq_u8(inner_diff, inner_diff);  // saturating
  const uint8x16_t outer_half = vshrq_n_u8(outer_diff, 1);
  const uint8x16_t total = vqaddq_u8(inner_x2, outer_half);
  return vcgeq_u8(limit, total);
}
439 
FlipSign_NEON(const uint8x16_t v)440 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
441   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
442   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
443 }
444 
FlipSignBack_NEON(const int8x16_t v)445 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
446   const int8x16_t sign_bit = vdupq_n_s8(0x80);
447   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
448 }
449 
GetBaseDelta_NEON(const int8x16_t p1,const int8x16_t p0,const int8x16_t q0,const int8x16_t q1)450 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
451                                    const int8x16_t q0, const int8x16_t q1) {
452   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
453   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
454   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
455   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
456   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
457   return s3;
458 }
459 
// Returns 3 * (q0 - p0) per lane, built from saturating signed 8-bit
// operations.
static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t diff = vqsubq_s8(q0, p0);       // q0 - p0
  const int8x16_t twice = vqaddq_s8(diff, diff);  // 2 * (q0 - p0)
  return vqaddq_s8(diff, twice);                  // 3 * (q0 - p0)
}
466 
467 //------------------------------------------------------------------------------
468 
// Applies the filter value 'delta' to the signed samples 'p0s' and 'q0s':
//   *op0 = p0s + ((delta + 3) >> 3)
//   *oq0 = q0s - ((delta + 4) >> 3)
// All additions and subtractions saturate; the shifts are arithmetic. The
// outputs stay in the signed domain (no sign-bit flip back).
static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
                                    const int8x16_t delta,
                                    int8x16_t* const op0,
                                    int8x16_t* const oq0) {
  const int8x16_t adj_p0 = vshrq_n_s8(vqaddq_s8(delta, vdupq_n_s8(0x03)), 3);
  const int8x16_t adj_q0 = vshrq_n_s8(vqaddq_s8(delta, vdupq_n_s8(0x04)), 3);
  *op0 = vqaddq_s8(p0s, adj_p0);
  *oq0 = vqsubq_s8(q0s, adj_q0);
}
482 
483 #if defined(WEBP_USE_INTRINSICS)
484 
// Applies the filter value 'delta' to the signed samples 'p0s' and 'q0s' and
// returns the results converted back to unsigned:
//   *op0 = FlipSignBack(p0s + ((delta + 3) >> 3))
//   *oq0 = FlipSignBack(q0s - ((delta + 4) >> 3))
static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
                              const int8x16_t delta,
                              uint8x16_t* const op0, uint8x16_t* const oq0) {
  int8x16_t sp0, sq0;
  // The signed arithmetic is exactly ApplyFilter2NoFlip_NEON(); reuse it
  // rather than keeping a second copy of the same sequence.
  ApplyFilter2NoFlip_NEON(p0s, q0s, delta, &sp0, &sq0);
  *op0 = FlipSignBack_NEON(sp0);
  *oq0 = FlipSignBack_NEON(sq0);
}
499 
// Filters the two samples next to the edge (p0 and q0) in the lanes selected
// by 'mask', writing the unsigned results to *op0 and *oq0. Lanes with a zero
// mask get a zero filter value.
static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
                           const uint8x16_t q0, const uint8x16_t q1,
                           const uint8x16_t mask,
                           uint8x16_t* const op0, uint8x16_t* const oq0) {
  // Work on sign-flipped samples so that saturating signed math applies.
  const int8x16_t sp1 = FlipSign_NEON(p1);
  const int8x16_t sp0 = FlipSign_NEON(p0);
  const int8x16_t sq0 = FlipSign_NEON(q0);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const int8x16_t base = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  // Zero the filter value wherever the mask is not set.
  const int8x16_t masked = vandq_s8(base, vreinterpretq_s8_u8(mask));
  ApplyFilter2_NEON(sp0, sq0, masked, op0, oq0);
}
512 
SimpleVFilter16_NEON(uint8_t * p,int stride,int thresh)513 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
514   uint8x16_t p1, p0, q0, q1, op0, oq0;
515   Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
516   {
517     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
518     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
519   }
520   Store16x2_NEON(op0, oq0, p, stride);
521 }
522 
SimpleHFilter16_NEON(uint8_t * p,int stride,int thresh)523 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
524   uint8x16_t p1, p0, q0, q1, oq0, op0;
525   Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
526   {
527     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
528     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
529   }
530   Store2x16_NEON(op0, oq0, p, stride);
531 }
532 
533 #else
534 
// Load/Store vertical edge
//
// LOAD8x4 expands to eight "vld4.8" single-lane loads, one per lane 0..7 of
// the registers c1..c4. Each load fills lane k of c1, c2, c3 and c4. Lanes
// 0, 2, 4, 6 use the address operand b1 and lanes 1, 3, 5, 7 use b2; 'stride'
// is appended as the last operand of every instruction.
// NOTE(review): b1/b2 are presumably "[reg]" operands addressing alternating
// rows, with 'stride' as the post-increment register -- confirm at the call
// site.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
545 
// STORE8x2 expands to eight "vst2.8" single-lane stores, one per lane 0..7.
// Each store writes lane k of c1 and c2 through the address operand 'p', with
// 'stride' appended as the last operand of every instruction.
// NOTE(review): 'p' is presumably a "[reg]" operand post-incremented by the
// 'stride' register after each store -- confirm at the call site.
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
555 
// Comma-separated list of quad-register names: q0-q3 and q8-q15 (q4-q7 are
// not listed).
// NOTE(review): presumably used as the clobber list of the inline-asm
// statements in this file -- confirm at the use sites.
#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
558 
// Emits two "veor" instructions that XOR registers 'a' and 'b' in place with
// register 's'. With 0x80 in every byte of 's' this toggles the sign bit of
// each byte.
// The definition ends on its last instruction line: a trailing line
// continuation there would silently pull the following source line into the
// macro whenever that line is not blank.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"

// Same as FLIP_SIGN_BIT2, applied to the four registers 'a', 'b', 'c', 'd'.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)
566 
// Computes the filter mask into register 'mask': each byte is all ones where
// 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh (saturating 8-bit sum) and
// zero elsewhere. 'thresh' is an operand broadcast with vdup.8.
// Uses q14 and q15 as scratch registers.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (thresh >= sum) */
575 
// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes, every step being a
// saturating operation. Uses q15 as scratch for (q0 - p0).
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */
582 
// Applies the filter value held in register 'fl' to the signed samples:
//   p0 += (fl + 3) >> 3   and   q0 -= (fl + 4) >> 3
// using saturating adds/subtracts and arithmetic shifts. Uses q15 as scratch.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
593 
// Applies filter on 2 pixels (p0 and q0)
// The arguments p1, p0, q0, q1 are NEON quad registers holding unsigned pixel
// values and 'thresh' is a core register operand. On exit the p0 and q0
// registers hold the filtered pixels, converted back to unsigned. The p1 and
// q1 registers are left with their sign bit flipped. Registers q9 (mask),
// q10 (sign bit constant) and q11 (delta) are used here, and q14/q15 by the
// nested macros; none of them may be passed as an argument.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
603 
// Simple loop filter (inline assembly) over 16 horizontally adjacent pixels.
// Four 16-byte rows are loaded starting at p - 2 * stride (named p1, p0, q0
// and q1), DO_FILTER2 is run on them with 'thresh' as the limit, and the two
// middle rows (p0 at p - stride, q0 at p) are written back.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    // The first three loads post-increment p by one row each.
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    // p is on the q1 row here: step back two rows to reach the p0 row.
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
624 
// Simple loop filter (inline assembly) over 16 vertically adjacent pixels.
// For each of 16 rows the four bytes starting at p - 2 are gathered with two
// LOAD8x4 invocations, DO_FILTER2 is run with 'thresh' as the limit, and the
// two filtered middle bytes are written back starting at p - 1 with STORE8x2.
// r4, r5 and r6 are used as scratch core registers.
// NOTE(review): LOAD8x4/STORE8x2 are defined earlier in the file; the register
// comments below assume LOAD8x4 puts byte column k of 8 rows into its k-th
// register argument, and that each call advances both base pointers so the
// second call covers the next 8 rows -- confirm against the macro definitions.
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    // Regroup the d-register halves so that each q register holds one pixel
    // column for all 16 rows.
    "vswp       d3, d24                        \n"  // q1={d2,d24} q12={d3,d25}
    "vswp       d5, d26                        \n"  // q2={d4,d26} q13={d5,d27}
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q12 q1:q13

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    // Pair up the halves of filtered p0 (q2) and q0 (q12) again:
    // {d4, d5} = low halves, {d24, d25} = high halves.
    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}
650 
651 #undef LOAD8x4
652 #undef STORE8x2
653 
654 #endif    // WEBP_USE_INTRINSICS
655 
// Runs the simple vertical filter on the three inner horizontal edges of a
// block, located 4, 8 and 12 rows below 'p'.
static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  const int rows_per_edge = 4 * stride;
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += rows_per_edge;
    SimpleVFilter16_NEON(p, stride, thresh);
  }
}
663 
// Runs the simple horizontal filter on the three inner vertical edges of a
// block, located 4, 8 and 12 pixels to the right of 'p'.
static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 0; edge < 3; ++edge) {
    p += 4;
    SimpleHFilter16_NEON(p, stride, thresh);
  }
}
671 
672 //------------------------------------------------------------------------------
673 // Complex In-loop filtering (Paragraph 15.3)
674 
// Returns a per-lane mask that is set where max(|p1 - p0|, |q1 - q0|) is
// strictly greater than 'hev_thresh' (truncated to 8 bits).
static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
                                const uint8x16_t q0, const uint8x16_t q1,
                                int hev_thresh) {
  const uint8x16_t diff_p = vabdq_u8(p1, p0);
  const uint8x16_t diff_q = vabdq_u8(q1, q0);
  const uint8x16_t limit = vdupq_n_u8((uint8_t)hev_thresh);
  return vcgtq_u8(vmaxq_u8(diff_p, diff_q), limit);
}
685 
NeedsFilter2_NEON(const uint8x16_t p3,const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,const uint8x16_t q3,int ithresh,int thresh)686 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
687                                     const uint8x16_t p1, const uint8x16_t p0,
688                                     const uint8x16_t q0, const uint8x16_t q1,
689                                     const uint8x16_t q2, const uint8x16_t q3,
690                                     int ithresh, int thresh) {
691   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
692   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
693   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
694   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
695   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
696   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
697   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
698   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
699   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
700   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
701   const uint8x16_t max12 = vmaxq_u8(max1, max2);
702   const uint8x16_t max123 = vmaxq_u8(max12, max3);
703   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
704   const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
705   const uint8x16_t mask = vandq_u8(mask1, mask2);
706   return mask;
707 }
708 
709 //  4-points filter
710 
// Applies the 4-point filter to sign-flipped pixels p1, p0, q0, q1 using the
// base value 'delta0', with saturating arithmetic:
//   a1 = (delta0 + 4) >> 3,  a2 = (delta0 + 3) >> 3,  a3 = (a1 + 1) >> 1
//   p0 += a2,  q0 -= a1,  p1 += a3,  q1 -= a3
// Each result goes through FlipSignBack_NEON() before being stored in
// *op1, *op0, *oq0 and *oq1.
static void ApplyFilter4_NEON(
    const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1,
    const int8x16_t delta0,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t q0_step =
      vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(0x04)), 3);
  const int8x16_t p0_step =
      vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(0x03)), 3);
  // Rounding shift: (q0_step + 1) >> 1.
  const int8x16_t outer_step = vrshrq_n_s8(q0_step, 1);
  const int8x16_t new_p0 = vqaddq_s8(p0, p0_step);
  const int8x16_t new_q0 = vqsubq_s8(q0, q0_step);
  const int8x16_t new_p1 = vqaddq_s8(p1, outer_step);
  const int8x16_t new_q1 = vqsubq_s8(q1, outer_step);
  *op0 = FlipSignBack_NEON(new_p0);
  *oq0 = FlipSignBack_NEON(new_q0);
  *op1 = FlipSignBack_NEON(new_p1);
  *oq1 = FlipSignBack_NEON(new_q1);
}
729 
// Filters the four pixels around an edge. Lanes selected by both 'mask' and
// 'hev_mask' first receive the simple 2-point filter on p0/q0; lanes selected
// by 'mask' only then receive the 4-point filter. The second step starts from
// the p0/q0 values produced by the first one, and writes all four outputs.
static void DoFilter4_NEON(
    const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1) {
  const int8x16_t sp1 = FlipSign_NEON(p1);
  int8x16_t sp0 = FlipSign_NEON(p0);
  int8x16_t sq0 = FlipSign_NEON(q0);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  // (mask & hev_mask) ^ mask is the same as mask & ~hev_mask.
  const uint8x16_t non_hev_lanes = veorq_u8(hev_lanes, mask);
  int8x16_t delta;

  // Step 1: simple filter, restricted to the lanes with high edge variance.
  delta = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  delta = vandq_s8(delta, vreinterpretq_s8_u8(hev_lanes));
  ApplyFilter2NoFlip_NEON(sp0, sq0, delta, &sp0, &sq0);

  // Step 2: 4-point filter, restricted to the remaining selected lanes.
  delta = GetBaseDelta0_NEON(sp0, sq0);
  delta = vandq_s8(delta, vreinterpretq_s8_u8(non_hev_lanes));
  ApplyFilter4_NEON(sp1, sp0, sq0, sq1, delta, op1, op0, oq0, oq1);
}
761 
762 //  6-points filter
763 
// Applies the 6-point filter to sign-flipped pixels p2..q2 for the filter
// value a = 'delta'. Three corrections are needed:
//   a1 = (27 * a + 63) >> 7,  a2 = (18 * a + 63) >> 7,  a3 = (9 * a + 63) >> 7
// With S = 9 * a - 1 computed on 16 bits, the rounding narrowing shift
// vqrshrn_n_s16 yields them as:
//   a3 = (S + 64) >> 7,  a2 = (S + 32) >> 6,  a1 = (S + 18 * a + 64) >> 7
// Then, with saturation: p0 += a1, q0 -= a1, p1 += a2, q1 -= a2, p2 += a3,
// q2 -= a3. Each result goes through FlipSignBack_NEON() before being stored.
static void ApplyFilter6_NEON(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  const int8x8_t nine = vdup_n_s8(9);
  const int8x8_t eighteen = vdup_n_s8(18);
  const int16x8_t minus_one = vdupq_n_s16(-1);
  const int8x8_t a_lo = vget_low_s8(delta);
  const int8x8_t a_hi = vget_high_s8(delta);
  // s = 9 * a - 1
  const int16x8_t s_lo = vmlal_s8(minus_one, nine, a_lo);
  const int16x8_t s_hi = vmlal_s8(minus_one, nine, a_hi);
  // t = s + 18 * a = 27 * a - 1
  const int16x8_t t_lo = vmlal_s8(s_lo, eighteen, a_lo);
  const int16x8_t t_hi = vmlal_s8(s_hi, eighteen, a_hi);
  const int8x16_t a1 =
      vcombine_s8(vqrshrn_n_s16(t_lo, 7), vqrshrn_n_s16(t_hi, 7));
  const int8x16_t a2 =
      vcombine_s8(vqrshrn_n_s16(s_lo, 6), vqrshrn_n_s16(s_hi, 6));
  const int8x16_t a3 =
      vcombine_s8(vqrshrn_n_s16(s_lo, 7), vqrshrn_n_s16(s_hi, 7));
  const int8x16_t new_p0 = vqaddq_s8(p0, a1);
  const int8x16_t new_q0 = vqsubq_s8(q0, a1);
  const int8x16_t new_q1 = vqsubq_s8(q1, a2);
  const int8x16_t new_p1 = vqaddq_s8(p1, a2);
  const int8x16_t new_q2 = vqsubq_s8(q2, a3);
  const int8x16_t new_p2 = vqaddq_s8(p2, a3);

  *op0 = FlipSignBack_NEON(new_p0);
  *oq0 = FlipSignBack_NEON(new_q0);
  *oq1 = FlipSignBack_NEON(new_q1);
  *op1 = FlipSignBack_NEON(new_p1);
  *oq2 = FlipSignBack_NEON(new_q2);
  *op2 = FlipSignBack_NEON(new_p2);
}
800 
// Filters the six pixels around an edge. Lanes selected by both 'mask' and
// 'hev_mask' first receive the simple 2-point filter on p0/q0; lanes selected
// by 'mask' only then receive the 6-point filter. Both steps use the same
// base delta, computed from the unfiltered pixels. The second step writes all
// six outputs.
static void DoFilter6_NEON(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  const int8x16_t sp2 = FlipSign_NEON(p2);
  const int8x16_t sp1 = FlipSign_NEON(p1);
  int8x16_t sp0 = FlipSign_NEON(p0);
  int8x16_t sq0 = FlipSign_NEON(q0);
  const int8x16_t sq1 = FlipSign_NEON(q1);
  const int8x16_t sq2 = FlipSign_NEON(q2);
  const uint8x16_t hev_lanes = vandq_u8(mask, hev_mask);
  const int8x16_t base_delta = GetBaseDelta_NEON(sp1, sp0, sq0, sq1);
  // (mask & hev_mask) ^ mask is the same as mask & ~hev_mask.
  const uint8x16_t non_hev_lanes = veorq_u8(hev_lanes, mask);
  const int8x16_t hev_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(hev_lanes));
  const int8x16_t non_hev_delta =
      vandq_s8(base_delta, vreinterpretq_s8_u8(non_hev_lanes));

  // Step 1: simple filter, restricted to the lanes with high edge variance.
  ApplyFilter2NoFlip_NEON(sp0, sq0, hev_delta, &sp0, &sq0);

  // Step 2: 6-point filter, restricted to the remaining selected lanes.
  ApplyFilter6_NEON(sp2, sp1, sp0, sq0, sq1, sq2, non_hev_delta,
                    op2, op1, op0, oq0, oq1, oq2);
}
834 
835 // on macroblock edges
836 
VFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)837 static void VFilter16_NEON(uint8_t* p, int stride,
838                            int thresh, int ithresh, int hev_thresh) {
839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
840   Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
841   {
842     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
843                                               ithresh, thresh);
844     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
845     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
846     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
847                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
848     Store16x2_NEON(op2, op1, p - 2 * stride, stride);
849     Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
850     Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
851   }
852 }
853 
HFilter16_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)854 static void HFilter16_NEON(uint8_t* p, int stride,
855                            int thresh, int ithresh, int hev_thresh) {
856   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
857   Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
858   {
859     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
860                                               ithresh, thresh);
861     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
862     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
863     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
864                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
865     Store2x16_NEON(op2, op1, p - 2, stride);
866     Store2x16_NEON(op0, oq0, p + 0, stride);
867     Store2x16_NEON(oq1, oq2, p + 2, stride);
868   }
869 }
870 
871 // on three inner edges
VFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)872 static void VFilter16i_NEON(uint8_t* p, int stride,
873                             int thresh, int ithresh, int hev_thresh) {
874   uint32_t k;
875   uint8x16_t p3, p2, p1, p0;
876   Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
877   for (k = 3; k != 0; --k) {
878     uint8x16_t q0, q1, q2, q3;
879     p += 4 * stride;
880     Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
881     {
882       const uint8x16_t mask =
883           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
884       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
885       // p3 and p2 are not just temporary variables here: they will be
886       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
887       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
888       Store16x4_NEON(p1, p0, p3, p2, p, stride);
889       p1 = q2;
890       p0 = q3;
891     }
892   }
893 }
894 
895 #if !defined(WORK_AROUND_GCC)
HFilter16i_NEON(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)896 static void HFilter16i_NEON(uint8_t* p, int stride,
897                             int thresh, int ithresh, int hev_thresh) {
898   uint32_t k;
899   uint8x16_t p3, p2, p1, p0;
900   Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
901   for (k = 3; k != 0; --k) {
902     uint8x16_t q0, q1, q2, q3;
903     p += 4;
904     Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
905     {
906       const uint8x16_t mask =
907           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
908       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
909       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
910       Store4x16_NEON(p1, p0, p3, p2, p, stride);
911       p1 = q2;
912       p0 = q3;
913     }
914   }
915 }
916 #endif  // !WORK_AROUND_GCC
917 
918 // 8-pixels wide variant, for chroma filtering
// Complex loop filter for the horizontal edge of the two planes 'u' and 'v',
// processed together: both planes are loaded with Load8x8x2_NEON(), the
// 6-point filter is applied and the six modified rows of each plane are
// written back, two at a time.
static void VFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t filter_mask, hev_mask;
  uint8x16_t op2, op1, op0, oq0, oq1, oq2;

  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_mask =
      NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
  hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
  DoFilter6_NEON(p2, p1, p0, q0, q1, q2, filter_mask, hev_mask,
                 &op2, &op1, &op0, &oq0, &oq1, &oq2);
  Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
  Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
  Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
}
// Complex loop filter for the inner horizontal edge of the two planes 'u' and
// 'v', located 4 rows below the given pointers. Uses the 4-point filter and
// writes four rows back per plane.
static void VFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t filter_mask, hev_mask;
  uint8x16_t op1, op0, oq0, oq1;

  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_mask =
      NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
  hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
  DoFilter4_NEON(p1, p0, q0, q1, filter_mask, hev_mask,
                 &op1, &op0, &oq0, &oq1);
  Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
951 
952 #if !defined(WORK_AROUND_GCC)
// Complex loop filter for the vertical edge of the two planes 'u' and 'v',
// processed together: both planes are loaded with Load8x8x2T_NEON(), the
// 6-point filter is applied and the six modified columns are written back
// with Store6x8x2_NEON().
static void HFilter8_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                          int stride, int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t filter_mask, hev_mask;
  uint8x16_t op2, op1, op0, oq0, oq1, oq2;

  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_mask =
      NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
  hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
  DoFilter6_NEON(p2, p1, p0, q0, q1, q2, filter_mask, hev_mask,
                 &op2, &op1, &op0, &oq0, &oq1, &oq2);
  Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
}
967 
// Complex loop filter for the inner vertical edge of the two planes 'u' and
// 'v', located 4 pixels to the right of the given pointers. Uses the 4-point
// filter and writes four columns back per plane.
static void HFilter8i_NEON(uint8_t* WEBP_RESTRICT u, uint8_t* WEBP_RESTRICT v,
                           int stride,
                           int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  uint8x16_t filter_mask, hev_mask;
  uint8x16_t op1, op0, oq0, oq1;

  u += 4;
  v += 4;
  Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  filter_mask =
      NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
  hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
  DoFilter4_NEON(p1, p0, q0, q1, filter_mask, hev_mask,
                 &op1, &op0, &oq0, &oq1);
  Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
}
984 #endif  // !WORK_AROUND_GCC
985 
986 //-----------------------------------------------------------------------------
987 // Inverse transforms (Paragraph 14.4)
988 
989 // Technically these are unsigned but vqdmulh is only available in signed.
990 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
991 // changing the >> 16 to >> 15 and requiring an additional >> 1.
992 // We use this to our advantage with kC2. The canonical value is 35468.
993 // However, the high bit is set so treating it as signed will give incorrect
994 // results. We avoid this by down shifting by 1 here to clear the highest bit.
995 // Combined with the doubling effect of vqdmulh we get >> 16.
996 // This can not be applied to kC1 because the lowest bit is set. Down shifting
997 // the constant would reduce precision.
998 
999 // libwebp uses a trick to avoid some extra addition that libvpx does.
1000 // Instead of:
1001 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2,
// so the code below adds the input back explicitly after the multiplication.
1004 
// 16-bit multipliers passed to vqdmulh by the inverse transform code below.
// kC1 is used as is (the doubling done by vqdmulh is undone with a >> 1 at
// the point of use), whereas kC2 is stored pre-halved.
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.
1008 
1009 #if defined(WEBP_USE_INTRINSICS)
Transpose8x2_NEON(const int16x8_t in0,const int16x8_t in1,int16x8x2_t * const out)1010 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
1011                                           const int16x8_t in1,
1012                                           int16x8x2_t* const out) {
1013   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
1014   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
1015   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
1016                                                   // b0 d0 b1 d1 b2 d2 ...
1017   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
1018 }
1019 
TransformPass_NEON(int16x8x2_t * const rows)1020 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
1021   // {rows} = in0 | in4
1022   //          in8 | in12
1023   // B1 = in4 | in12
1024   const int16x8_t B1 =
1025       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1026   // C0 = kC1 * in4 | kC1 * in12
1027   // C1 = kC2 * in4 | kC2 * in12
1028   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
1029   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1030   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1031                                 vget_low_s16(rows->val[1]));   // in0 + in8
1032   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1033                                 vget_low_s16(rows->val[1]));   // in0 - in8
1034   // c = kC2 * in4 - kC1 * in12
1035   // d = kC1 * in4 + kC2 * in12
1036   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1037   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1038   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1039   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1040   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1041   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
1042   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1043   Transpose8x2_NEON(E0, E1, rows);
1044 }
1045 
// Inverse transform of the 16 coefficients 'in', whose result is combined
// with 'dst' by Add4x4_NEON(). Intrinsics version: two transform passes, each
// ending with a transposition.
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  int pass;
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  for (pass = 0; pass < 2; ++pass) {
    TransformPass_NEON(&rows);
  }
  Add4x4_NEON(rows.val[0], rows.val[1], dst);
}
1054 
1055 #else
1056 
// Inverse transform of the 16 coefficients 'in', added with saturation to the
// 4x4 block of pixels at 'dst' (rows are BPS bytes apart). Inline-assembly
// version: two identical passes separated by a transposition, then a rounding
// shift by 3, the addition to the destination pixels and the final store.
// Registers used: q0 (constants, then packed output), q1-q2 (data), q3
// (destination pixels), q8-q11 (temporaries).
static void TransformOne_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[constants]]        \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp            d3, d4                      \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16        q8, q8, #1                  \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    /* Transpose the intermediate result for the second pass. */
    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    /* See long winded explanations prior */
    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
     */
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    /* d2 = a + d
     * d3 = b + c
     * d4 = b - c
     * d5 = a - d
     */
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    /* Load the four destination rows (4 bytes each) into d6/d7, i.e. q3. */
    "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"

    /* Rewind dst by the four rows just read. */
    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"

    /* (val) + 4 >> 3 */
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    /* Must accumulate before saturating */
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
    : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
    /* q3 is listed because d6/d7 are written by the destination loads. */
    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"  /* clobbered */
  );
}
1188 
1189 #endif    // WEBP_USE_INTRINSICS
1190 
// Inverse-transforms the block at 'in' into 'dst' and, when 'do_two' is
// non-zero, also the next block (in + 16) into the pixels 4 columns further.
static void TransformTwo_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst, int do_two) {
  TransformOne_NEON(in, dst);
  if (!do_two) return;
  TransformOne_NEON(in + 16, dst + 4);
}
1198 
// Transform for a block where only in[0] is used: the value is broadcast to
// every position and handed to Add4x4_NEON() together with 'dst'.
static void TransformDC_NEON(const int16_t* WEBP_RESTRICT in,
                             uint8_t* WEBP_RESTRICT dst) {
  const int16_t dc_value = in[0];
  const int16x8_t dc_rows = vdupq_n_s16(dc_value);
  Add4x4_NEON(dc_rows, dc_rows, dst);
}
1204 
1205 //------------------------------------------------------------------------------
1206 
// Writes lane 'col' of the four vectors rows.val[0..3] to dst[0], dst[16],
// dst[32] and dst[48], leaving 'dst' advanced by 64 elements. 'dst' must be a
// modifiable pointer lvalue and 'col' a constant lane index. All arguments
// are parenthesized so that expressions (e.g. '*ptr' for 'rows') expand
// correctly.
#define STORE_WHT(dst, col, rows) do {                          \
  *(dst) = vgetq_lane_s32((rows).val[0], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[1], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[2], (col)); (dst) += 16;   \
  *(dst) = vgetq_lane_s32((rows).val[3], (col)); (dst) += 16;   \
} while (0)
1213 
// Inverse Walsh-Hadamard transform of the 16 values at 'in'. The 16 results
// are stored 16 elements apart in 'out' (out[0], out[16], ..., out[240]).
static void TransformWHT_NEON(const int16_t* WEBP_RESTRICT in,
                              int16_t* WEBP_RESTRICT out) {
  int32x4x4_t tmp;

  // First pass: butterflies across the four rows, widened to 32 bits.
  {
    const int16x4_t row0 = vld1_s16(in + 0);
    const int16x4_t row1 = vld1_s16(in + 4);
    const int16x4_t row2 = vld1_s16(in + 8);
    const int16x4_t row3 = vld1_s16(in + 12);
    const int32x4_t outer_sum = vaddl_s16(row0, row3);    // row0 + row3
    const int32x4_t inner_sum = vaddl_s16(row1, row2);    // row1 + row2
    const int32x4_t inner_diff = vsubl_s16(row1, row2);   // row1 - row2
    const int32x4_t outer_diff = vsubl_s16(row0, row3);   // row0 - row3
    tmp.val[0] = vaddq_s32(outer_sum, inner_sum);
    tmp.val[1] = vaddq_s32(outer_diff, inner_diff);
    tmp.val[2] = vsubq_s32(outer_sum, inner_sum);
    tmp.val[3] = vsubq_s32(outer_diff, inner_diff);
    // Transpose so that the second pass works along the other dimension.
    tmp = Transpose4x4_NEON(tmp);
  }

  // Second pass: same butterflies, with the rounding constant 3 folded into
  // the first term, then a right shift by 3.
  {
    const int32x4_t dc = vaddq_s32(tmp.val[0], vdupq_n_s32(3));
    const int32x4_t outer_sum = vaddq_s32(dc, tmp.val[3]);
    const int32x4_t inner_sum = vaddq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t inner_diff = vsubq_s32(tmp.val[1], tmp.val[2]);
    const int32x4_t outer_diff = vsubq_s32(dc, tmp.val[3]);
    tmp.val[0] = vshrq_n_s32(vaddq_s32(outer_sum, inner_sum), 3);
    tmp.val[1] = vshrq_n_s32(vaddq_s32(outer_diff, inner_diff), 3);
    tmp.val[2] = vshrq_n_s32(vsubq_s32(outer_sum, inner_sum), 3);
    tmp.val[3] = vshrq_n_s32(vsubq_s32(outer_diff, inner_diff), 3);
  }

  // Each call stores one lane of the four vectors and advances 'out'.
  STORE_WHT(out, 0, tmp);
  STORE_WHT(out, 1, tmp);
  STORE_WHT(out, 2, tmp);
  STORE_WHT(out, 3, tmp);
}
1261 
1262 #undef STORE_WHT
1263 
1264 //------------------------------------------------------------------------------
1265 
// Transform variant that reads only in[0], in[1] and in[4].
// The scaled terms come from WEBP_TRANSFORM_AC3_MUL1 / WEBP_TRANSFORM_AC3_MUL2
// (defined outside this part of the file). With
//   d1 = MUL1(in[1]), c1 = MUL2(in[1]), d4 = MUL1(in[4]), c4 = MUL2(in[4])
// the base row is in[0] + { d1, c1, -c1, -d1 } and the four rows passed to
// Add4x4_NEON() are base + d4, base + c4, base - c4 and base - d4. All vector
// additions and subtractions saturate to the int16_t range.
static void TransformAC3_NEON(const int16_t* WEBP_RESTRICT in,
                              uint8_t* WEBP_RESTRICT dst) {
  const int16x4_t dc = vld1_dup_s16(in);
  const int16x4_t c4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL2(in[4]));
  const int16x4_t d4 = vdup_n_s16(WEBP_TRANSFORM_AC3_MUL1(in[4]));
  const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
  const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
  // Pack the four per-column offsets as 16-bit fields of one 64-bit word so
  // that a single vcreate_s16() builds the vector { d1, c1, -c1, -d1 }.
  const uint64_t lane0 = (uint64_t)(d1 & 0xffff);
  const uint64_t lane1 = (uint64_t)(c1 & 0xffff);
  const uint64_t lane2 = (uint64_t)(-c1 & 0xffff);
  const uint64_t lane3 = (uint64_t)(-d1 & 0xffff);
  const int16x4_t col_offsets =
      vcreate_s16(lane0 | (lane1 << 16) | (lane2 << 32) | (lane3 << 48));
  const int16x4_t base = vqadd_s16(dc, col_offsets);
  const int16x4_t row0 = vqadd_s16(base, d4);
  const int16x4_t row1 = vqadd_s16(base, c4);
  const int16x4_t row2 = vqsub_s16(base, c4);
  const int16x4_t row3 = vqsub_s16(base, d4);
  Add4x4_NEON(vcombine_s16(row0, row1), vcombine_s16(row2, row3), dst);
}
1283 
1284 //------------------------------------------------------------------------------
1285 // 4x4
1286 
// 4x4 DC predictor: every pixel of the block is set to
// (top[0..3] + left[0..3] + 4) >> 3, where 'top' is the row above 'dst' and
// 'left' is the column at x = -1.
static void DC4_NEON(uint8_t* dst) {    // DC
  // Eight bytes are loaded from the row above, but after the two pairwise
  // additions lane 0 only holds top[0] + top[1] + top[2] + top[3].
  const uint8x8_t top = vld1_u8(dst - BPS);
  const uint16x4_t top_pairs = vpaddl_u8(top);
  const uint16x4_t top_quads = vpadd_u16(top_pairs, top_pairs);
  // Each load starts at column -1, so lane 0 is the left neighbour of that
  // row. Only lane 0 of the final sum is used.
  const uint8x8_t left0 = vld1_u8(dst + 0 * BPS - 1);
  const uint8x8_t left1 = vld1_u8(dst + 1 * BPS - 1);
  const uint8x8_t left2 = vld1_u8(dst + 2 * BPS - 1);
  const uint8x8_t left3 = vld1_u8(dst + 3 * BPS - 1);
  const uint16x8_t left01 = vaddl_u8(left0, left1);
  const uint16x8_t left23 = vaddl_u8(left2, left3);
  const uint16x8_t left_sum = vaddq_u16(left01, left23);
  const uint16x8_t total =
      vaddq_u16(left_sum, vcombine_u16(top_quads, top_quads));
  const uint8x8_t rounded = vrshrn_n_u16(total, 3);  // (total + 4) >> 3
  // Broadcast lane 0 and write four bytes per row.
  const uint32x2_t fill = vreinterpret_u32_u8(vdup_lane_u8(rounded, 0));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), fill, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), fill, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), fill, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), fill, 0);
}
1306 
1307 // TrueMotion (4x4 + 8x8)
TrueMotion_NEON(uint8_t * dst,int size)1308 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
1309   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1310   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
1311   const uint16x8_t d = vsubl_u8(T, TL);  // A[c] - A[-1]
1312   int y;
1313   for (y = 0; y < size; y += 4) {
1314     // left edge
1315     const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
1316     const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
1317     const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
1318     const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
1319     // L[r] + A[c] - A[-1]
1320     const int16x8_t r0 = vreinterpretq_s16_u16(vaddw_u8(d, L0));
1321     const int16x8_t r1 = vreinterpretq_s16_u16(vaddw_u8(d, L1));
1322     const int16x8_t r2 = vreinterpretq_s16_u16(vaddw_u8(d, L2));
1323     const int16x8_t r3 = vreinterpretq_s16_u16(vaddw_u8(d, L3));
1324     // Saturate and store the result.
1325     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
1326     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
1327     const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
1328     const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
1329     if (size == 4) {
1330       vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
1331       vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
1332       vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
1333       vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
1334     } else {
1335       vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
1336       vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
1337       vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
1338       vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
1339     }
1340     dst += 4 * BPS;
1341   }
1342 }
1343 
// 4x4 TrueMotion predictor: TrueMotion_NEON() with a block size of 4.
static void TM4_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 4);
}
1345 
// 4x4 vertical predictor. The eight bytes starting at the top-left neighbour
// (dst - BPS - 1) are smoothed with a 3-tap filter and the first four
// filtered values are copied into each of the four rows.
static void VE4_NEON(uint8_t* dst) {    // vertical
  // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
  const uint8x8_t edge = vld1_u8(dst - BPS - 1);
  const uint64x1_t edge_u64 = vreinterpret_u64_u8(edge);
  // The same eight bytes with the 64-bit value shifted right by one and by
  // two bytes; zeros are shifted in at the top.
  const uint8x8_t edge_shift1 = vreinterpret_u8_u64(vshr_n_u64(edge_u64, 8));
  const uint8x8_t edge_shift2 = vreinterpret_u8_u64(vshr_n_u64(edge_u64, 16));
  // Truncating half-add of the outer taps, then rounding half-add with the
  // middle tap.
  const uint8x8_t outer = vhadd_u8(edge, edge_shift2);
  const uint32x2_t filtered =
      vreinterpret_u32_u8(vrhadd_u8(outer, edge_shift1));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), filtered, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), filtered, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), filtered, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), filtered, 0);
}
1361 
// 4x4 down-right predictor.
// Naming: X is the top-left neighbour, A..D the four pixels above the block
// and I..L the four pixels to its left (top to bottom). The nine edge pixels
// are laid out as L K J I X A B C D, smoothed with a 3-tap filter (truncating
// half-add of the outer taps followed by a rounding half-add with the middle
// tap), and each row takes four consecutive filtered values: row 3 starts at
// the first one and row 0 at the fourth.
// The letters in the vector names list the bytes from lane 0 upwards; '_'
// marks a lane whose content is zero or not meaningful.
static void RD4_NEON(uint8_t* dst) {   // Down-right
  const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  // Move X A B C into bytes 4..7 (D is shifted out and re-inserted below).
  const uint64x1_t XABC_hi = vshl_n_u64(XABCD, 32);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  // Left column, bottom pixel first, in bytes 0..3.
  const uint64x1_t LKJI_lo =
      vcreate_u64((uint64_t)L | (K << 8) | (J << 16) | (I << 24));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI_lo, XABC_hi);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  // Each row is the filtered edge shifted by one more byte.
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
}
1390 
// 4x4 down-left predictor.
// The row above the block (bytes top[0..7]) is smoothed with a 3-tap filter:
// truncating half-add of top[k] and top[k + 2], then rounding half-add with
// top[k + 1]. For the last useful tap top[7] is used in place of top[8].
// Row r of the block receives filtered values r .. r + 3.
// Note that the two offset loads read memory up to top[9]; those extra lanes
// never reach the stored output.
static void LD4_NEON(uint8_t* dst) {    // Down-left
  // Note using the same shift trick as VE4() is slower here.
  const uint8_t* const top = dst - BPS;
  const uint8x8_t tap0 = vld1_u8(top + 0);
  const uint8x8_t tap1 = vld1_u8(top + 1);
  const uint8x8_t tap2_raw = vld1_u8(top + 2);
  // Replace lane 6 (loaded from top[8]) by top[7].
  const uint8x8_t tap2 = vset_lane_u8(top[7], tap2_raw, 6);
  const uint8x8_t filtered = vrhadd_u8(vhadd_u8(tap0, tap2), tap1);
  const uint64x1_t filtered_u64 = vreinterpret_u64_u8(filtered);
  // Successive rows start one filtered value further along.
  const uint32x2_t row0 = vreinterpret_u32_u8(filtered);
  const uint32x2_t row1 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 8));
  const uint32x2_t row2 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 16));
  const uint32x2_t row3 = vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 24));
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), row0, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), row1, 0);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), row2, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), row3, 0);
}
1409 
1410 //------------------------------------------------------------------------------
1411 // Chroma
1412 
// 8x8 vertical predictor: the eight bytes of the row above 'dst' are copied
// into each of the eight rows of the block.
static void VE8uv_NEON(uint8_t* dst) {    // vertical
  const uint8x8_t top_row = vld1_u8(dst - BPS);
  int offset = 0;
  int rows_left;
  for (rows_left = 8; rows_left > 0; --rows_left) {
    vst1_u8(dst + offset, top_row);
    offset += BPS;
  }
}
1420 
// 8x8 horizontal predictor: each of the eight rows is filled with the pixel
// found immediately to its left (column -1 of that row).
static void HE8uv_NEON(uint8_t* dst) {    // horizontal
  int y;
  for (y = 0; y < 8; ++y) {
    uint8_t* const row = dst + y * BPS;
    vst1_u8(row, vld1_dup_u8(row - 1));
  }
}
1429 
DC8_NEON(uint8_t * dst,int do_top,int do_left)1430 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
1431   uint16x8_t sum_top;
1432   uint16x8_t sum_left;
1433   uint8x8_t dc0;
1434 
1435   if (do_top) {
1436     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
1437 #if WEBP_AARCH64
1438     const uint16_t p2 = vaddlv_u8(A);
1439     sum_top = vdupq_n_u16(p2);
1440 #else
1441     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
1442     const uint16x4_t p1 = vpadd_u16(p0, p0);
1443     const uint16x4_t p2 = vpadd_u16(p1, p1);
1444     sum_top = vcombine_u16(p2, p2);
1445 #endif
1446   }
1447 
1448   if (do_left) {
1449     const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
1450     const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
1451     const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
1452     const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
1453     const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
1454     const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
1455     const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
1456     const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
1457     const uint16x8_t s0 = vaddl_u8(L0, L1);
1458     const uint16x8_t s1 = vaddl_u8(L2, L3);
1459     const uint16x8_t s2 = vaddl_u8(L4, L5);
1460     const uint16x8_t s3 = vaddl_u8(L6, L7);
1461     const uint16x8_t s01 = vaddq_u16(s0, s1);
1462     const uint16x8_t s23 = vaddq_u16(s2, s3);
1463     sum_left = vaddq_u16(s01, s23);
1464   }
1465 
1466   if (do_top && do_left) {
1467     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1468     dc0 = vrshrn_n_u16(sum, 4);
1469   } else if (do_top) {
1470     dc0 = vrshrn_n_u16(sum_top, 3);
1471   } else if (do_left) {
1472     dc0 = vrshrn_n_u16(sum_left, 3);
1473   } else {
1474     dc0 = vdup_n_u8(0x80);
1475   }
1476 
1477   {
1478     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
1479     int i;
1480     for (i = 0; i < 8; ++i) {
1481       vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
1482     }
1483   }
1484 }
1485 
// 8x8 DC predictors. Each one forwards to DC8_NEON() with a fixed choice of
// which edges are read: (do_top, do_left).

// Uses both the top row and the left column.
static void DC8uv_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}

// Uses the left column only.
static void DC8uvNoTop_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}

// Uses the top row only.
static void DC8uvNoLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}

// Reads no neighbour at all; DC8_NEON() then fills the block with 0x80.
static void DC8uvNoTopLeft_NEON(uint8_t* dst) {
  DC8_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1490 
// 8x8 TrueMotion predictor: TrueMotion_NEON() with a block size of 8.
static void TM8uv_NEON(uint8_t* dst) {
  TrueMotion_NEON(dst, 8);
}
1492 
1493 //------------------------------------------------------------------------------
1494 // 16x16
1495 
// 16x16 vertical predictor: the sixteen bytes of the row above 'dst' are
// copied into each of the sixteen rows of the block.
static void VE16_NEON(uint8_t* dst) {     // vertical
  const uint8x16_t top_row = vld1q_u8(dst - BPS);
  int offset = 0;
  int rows_left;
  for (rows_left = 16; rows_left > 0; --rows_left) {
    vst1q_u8(dst + offset, top_row);
    offset += BPS;
  }
}
1503 
// 16x16 horizontal predictor: each of the sixteen rows is filled with the
// pixel found immediately to its left (column -1 of that row).
static void HE16_NEON(uint8_t* dst) {     // horizontal
  int y;
  for (y = 0; y < 16; ++y) {
    uint8_t* const row = dst + y * BPS;
    vst1q_u8(row, vld1q_dup_u8(row - 1));
  }
}
1512 
DC16_NEON(uint8_t * dst,int do_top,int do_left)1513 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
1514   uint16x8_t sum_top;
1515   uint16x8_t sum_left;
1516   uint8x8_t dc0;
1517 
1518   if (do_top) {
1519     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
1520 #if WEBP_AARCH64
1521     const uint16_t p3 = vaddlvq_u8(A);
1522     sum_top = vdupq_n_u16(p3);
1523 #else
1524     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
1525     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
1526     const uint16x4_t p2 = vpadd_u16(p1, p1);
1527     const uint16x4_t p3 = vpadd_u16(p2, p2);
1528     sum_top = vcombine_u16(p3, p3);
1529 #endif
1530   }
1531 
1532   if (do_left) {
1533     int i;
1534     sum_left = vdupq_n_u16(0);
1535     for (i = 0; i < 16; i += 8) {
1536       const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
1537       const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
1538       const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
1539       const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
1540       const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
1541       const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
1542       const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
1543       const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
1544       const uint16x8_t s0 = vaddl_u8(L0, L1);
1545       const uint16x8_t s1 = vaddl_u8(L2, L3);
1546       const uint16x8_t s2 = vaddl_u8(L4, L5);
1547       const uint16x8_t s3 = vaddl_u8(L6, L7);
1548       const uint16x8_t s01 = vaddq_u16(s0, s1);
1549       const uint16x8_t s23 = vaddq_u16(s2, s3);
1550       const uint16x8_t sum = vaddq_u16(s01, s23);
1551       sum_left = vaddq_u16(sum_left, sum);
1552     }
1553   }
1554 
1555   if (do_top && do_left) {
1556     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
1557     dc0 = vrshrn_n_u16(sum, 5);
1558   } else if (do_top) {
1559     dc0 = vrshrn_n_u16(sum_top, 4);
1560   } else if (do_left) {
1561     dc0 = vrshrn_n_u16(sum_left, 4);
1562   } else {
1563     dc0 = vdup_n_u8(0x80);
1564   }
1565 
1566   {
1567     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
1568     int i;
1569     for (i = 0; i < 16; ++i) {
1570       vst1q_u8(dst + i * BPS, dc);
1571     }
1572   }
1573 }
1574 
// 16x16 DC predictors. Each one forwards to DC16_NEON() with a fixed choice
// of which edges are read: (do_top, do_left).

// Uses both the top row and the left column.
static void DC16TopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/1);
}

// Uses the left column only.
static void DC16NoTop_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/1);
}

// Uses the top row only.
static void DC16NoLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/1, /*do_left=*/0);
}

// Reads no neighbour at all; DC16_NEON() then fills the block with 0x80.
static void DC16NoTopLeft_NEON(uint8_t* dst) {
  DC16_NEON(dst, /*do_top=*/0, /*do_left=*/0);
}
1579 
TM16_NEON(uint8_t * dst)1580 static void TM16_NEON(uint8_t* dst) {
1581   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
1582   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
1583   // A[c] - A[-1]
1584   const uint16x8_t d_lo = vsubl_u8(vget_low_u8(T), TL);
1585   const uint16x8_t d_hi = vsubl_u8(vget_high_u8(T), TL);
1586   int y;
1587   for (y = 0; y < 16; y += 4) {
1588     // left edge
1589     const uint8x8_t L0 = vld1_dup_u8(dst + 0 * BPS - 1);
1590     const uint8x8_t L1 = vld1_dup_u8(dst + 1 * BPS - 1);
1591     const uint8x8_t L2 = vld1_dup_u8(dst + 2 * BPS - 1);
1592     const uint8x8_t L3 = vld1_dup_u8(dst + 3 * BPS - 1);
1593     // L[r] + A[c] - A[-1]
1594     const int16x8_t r0_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L0));
1595     const int16x8_t r1_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L1));
1596     const int16x8_t r2_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L2));
1597     const int16x8_t r3_lo = vreinterpretq_s16_u16(vaddw_u8(d_lo, L3));
1598     const int16x8_t r0_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L0));
1599     const int16x8_t r1_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L1));
1600     const int16x8_t r2_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L2));
1601     const int16x8_t r3_hi = vreinterpretq_s16_u16(vaddw_u8(d_hi, L3));
1602     // Saturate and store the result.
1603     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
1604     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
1605     const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
1606     const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
1607     vst1q_u8(dst + 0 * BPS, row0);
1608     vst1q_u8(dst + 1 * BPS, row1);
1609     vst1q_u8(dst + 2 * BPS, row2);
1610     vst1q_u8(dst + 3 * BPS, row3);
1611     dst += 4 * BPS;
1612   }
1613 }
1614 
1615 //------------------------------------------------------------------------------
1616 // Entry point
1617 
1618 extern void VP8DspInitNEON(void);
1619 
VP8DspInitNEON(void)1620 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
1621   VP8Transform = TransformTwo_NEON;
1622   VP8TransformAC3 = TransformAC3_NEON;
1623   VP8TransformDC = TransformDC_NEON;
1624   VP8TransformWHT = TransformWHT_NEON;
1625 
1626   VP8VFilter16 = VFilter16_NEON;
1627   VP8VFilter16i = VFilter16i_NEON;
1628   VP8HFilter16 = HFilter16_NEON;
1629 #if !defined(WORK_AROUND_GCC)
1630   VP8HFilter16i = HFilter16i_NEON;
1631 #endif
1632   VP8VFilter8 = VFilter8_NEON;
1633   VP8VFilter8i = VFilter8i_NEON;
1634 #if !defined(WORK_AROUND_GCC)
1635   VP8HFilter8 = HFilter8_NEON;
1636   VP8HFilter8i = HFilter8i_NEON;
1637 #endif
1638   VP8SimpleVFilter16 = SimpleVFilter16_NEON;
1639   VP8SimpleHFilter16 = SimpleHFilter16_NEON;
1640   VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
1641   VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
1642 
1643   VP8PredLuma4[0] = DC4_NEON;
1644   VP8PredLuma4[1] = TM4_NEON;
1645   VP8PredLuma4[2] = VE4_NEON;
1646   VP8PredLuma4[4] = RD4_NEON;
1647   VP8PredLuma4[6] = LD4_NEON;
1648 
1649   VP8PredLuma16[0] = DC16TopLeft_NEON;
1650   VP8PredLuma16[1] = TM16_NEON;
1651   VP8PredLuma16[2] = VE16_NEON;
1652   VP8PredLuma16[3] = HE16_NEON;
1653   VP8PredLuma16[4] = DC16NoTop_NEON;
1654   VP8PredLuma16[5] = DC16NoLeft_NEON;
1655   VP8PredLuma16[6] = DC16NoTopLeft_NEON;
1656 
1657   VP8PredChroma8[0] = DC8uv_NEON;
1658   VP8PredChroma8[1] = TM8uv_NEON;
1659   VP8PredChroma8[2] = VE8uv_NEON;
1660   VP8PredChroma8[3] = HE8uv_NEON;
1661   VP8PredChroma8[4] = DC8uvNoTop_NEON;
1662   VP8PredChroma8[5] = DC8uvNoLeft_NEON;
1663   VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
1664 }
1665 
1666 #else  // !WEBP_USE_NEON
1667 
// Built without WEBP_USE_NEON: none of the NEON code above is compiled.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; it presumably
// expands to an empty VP8DspInitNEON() so the symbol still exists -- confirm
// against its definition.
WEBP_DSP_INIT_STUB(VP8DspInitNEON)
1669 
1670 #endif  // WEBP_USE_NEON
1671