// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of dsp functions and loop filtering.
//
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_NEON)

#include "./neon.h"
#include "../dec/vp8i.h"

//------------------------------------------------------------------------------
// NxM Loading functions

// Load/Store vertical edge
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \
  "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"

#define STORE8x2(c1, c2, p, stride) \
  "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \
  "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"

#if !defined(WORK_AROUND_GCC)

// This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
// (register alloc, probably). The variants somewhat mitigate the problem, but
// not quite. HFilter16i() remains problematic.
static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
  const uint8x8_t zero = vdup_n_u8(0);
  uint8x8x4_t out;
  INIT_VECTOR4(out, zero, zero, zero, zero);
  out = vld4_lane_u8(src + 0 * stride, out, 0);
  out = vld4_lane_u8(src + 1 * stride, out, 1);
  out = vld4_lane_u8(src + 2 * stride, out, 2);
  out = vld4_lane_u8(src + 3 * stride, out, 3);
  out = vld4_lane_u8(src + 4 * stride, out, 4);
  out = vld4_lane_u8(src + 5 * stride, out, 5);
  out = vld4_lane_u8(src + 6 * stride, out, 6);
  out = vld4_lane_u8(src + 7 * stride, out, 7);
  return out;
}

static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
  const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
  *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  *q1 = vcombine_u8(row0.val[3], row8.val[3]);
}

#else  // WORK_AROUND_GCC

#define LOADQ_LANE_32b(VALUE, LANE) do { \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE)); \
  src += stride; \
} while (0)

static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load4x16(src - 2, stride, p3, p2, p1, p0);
  Load4x16(src + 2, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  *p1 = vld1q_u8(src - 2 * stride);
  *p0 = vld1q_u8(src - 1 * stride);
  *q0 = vld1q_u8(src + 0 * stride);
  *q1 = vld1q_u8(src + 1 * stride);
}

static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
                                 uint8x16_t* const p3, uint8x16_t* const p2,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1,
                                 uint8x16_t* const q2, uint8x16_t* const q3) {
  Load16x4(src - 2 * stride, stride, p3, p2, p1, p0);
  Load16x4(src + 2 * stride, stride, q0, q1, q2, q3);
}

static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
                                  const uint8_t* const v,
                                  int stride,
                                  uint8x16_t* const p3, uint8x16_t* const p2,
                                  uint8x16_t* const p1, uint8x16_t* const p0,
                                  uint8x16_t* const q0, uint8x16_t* const q1,
                                  uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
  *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
  *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
  *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
  *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
  *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
  *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
  *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
}

#if !defined(WORK_AROUND_GCC)

#define LOAD_UV_8(ROW) \
  vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))

static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
                                   const uint8_t* const v,
                                   int stride,
                                   uint8x16_t* const p3, uint8x16_t* const p2,
                                   uint8x16_t* const p1, uint8x16_t* const p0,
                                   uint8x16_t* const q0, uint8x16_t* const q1,
                                   uint8x16_t* const q2, uint8x16_t* const q3) {
  // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples on the higher half.
  const uint8x16_t row0 = LOAD_UV_8(0);
  const uint8x16_t row1 = LOAD_UV_8(1);
  const uint8x16_t row2 = LOAD_UV_8(2);
  const uint8x16_t row3 = LOAD_UV_8(3);
  const uint8x16_t row4 = LOAD_UV_8(4);
  const uint8x16_t row5 = LOAD_UV_8(5);
  const uint8x16_t row6 = LOAD_UV_8(6);
  const uint8x16_t row7 = LOAD_UV_8(7);
  // Perform two side-by-side 8x8 transposes
  // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
  // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
  // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
  // u30 u31 u32 u33 u34 u35 u36 u37 | ...
  // u40 u41 u42 u43 u44 u45 u46 u47 | ...
  // u50 u51 u52 u53 u54 u55 u56 u57 | ...
  // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
  // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
  const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
                                                    // u01 u11 u03 u13 ...
  const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
                                                    // u21 u31 u23 u33 ...
  const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
  const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
  const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                       vreinterpretq_u16_u8(row23.val[0]));
  const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                       vreinterpretq_u16_u8(row23.val[1]));
  const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
                                       vreinterpretq_u16_u8(row67.val[0]));
  const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
                                       vreinterpretq_u16_u8(row67.val[1]));
  const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
                                       vreinterpretq_u32_u16(row46.val[0]));
  const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
                                       vreinterpretq_u32_u16(row46.val[1]));
  const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
                                       vreinterpretq_u32_u16(row57.val[0]));
  const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
                                       vreinterpretq_u32_u16(row57.val[1]));
  *p3 = vreinterpretq_u8_u32(row04.val[0]);
  *p2 = vreinterpretq_u8_u32(row15.val[0]);
  *p1 = vreinterpretq_u8_u32(row26.val[0]);
  *p0 = vreinterpretq_u8_u32(row37.val[0]);
  *q0 = vreinterpretq_u8_u32(row04.val[1]);
  *q1 = vreinterpretq_u8_u32(row15.val[1]);
  *q2 = vreinterpretq_u8_u32(row26.val[1]);
  *q3 = vreinterpretq_u8_u32(row37.val[1]);
}
#undef LOAD_UV_8

#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
                                 uint8_t* const dst, int stride) {
  vst2_lane_u8(dst + 0 * stride, v, 0);
  vst2_lane_u8(dst + 1 * stride, v, 1);
  vst2_lane_u8(dst + 2 * stride, v, 2);
  vst2_lane_u8(dst + 3 * stride, v, 3);
  vst2_lane_u8(dst + 4 * stride, v, 4);
  vst2_lane_u8(dst + 5 * stride, v, 5);
  vst2_lane_u8(dst + 6 * stride, v, 6);
  vst2_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  uint8x8x2_t lo, hi;
  lo.val[0] = vget_low_u8(p0);
  lo.val[1] = vget_low_u8(q0);
  hi.val[0] = vget_high_u8(p0);
  hi.val[1] = vget_high_u8(q0);
  Store2x8(lo, dst - 1 + 0 * stride, stride);
  Store2x8(hi, dst - 1 + 8 * stride, stride);
}

#if !defined(WORK_AROUND_GCC)
static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
                                 uint8_t* const dst, int stride) {
  vst4_lane_u8(dst + 0 * stride, v, 0);
  vst4_lane_u8(dst + 1 * stride, v, 1);
  vst4_lane_u8(dst + 2 * stride, v, 2);
  vst4_lane_u8(dst + 3 * stride, v, 3);
  vst4_lane_u8(dst + 4 * stride, v, 4);
  vst4_lane_u8(dst + 5 * stride, v, 5);
  vst4_lane_u8(dst + 6 * stride, v, 6);
  vst4_lane_u8(dst + 7 * stride, v, 7);
}

static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  uint8x8x4_t lo, hi;
  INIT_VECTOR4(lo,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(hi,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  Store4x8(lo, dst - 2 + 0 * stride, stride);
  Store4x8(hi, dst - 2 + 8 * stride, stride);
}
#endif  // !WORK_AROUND_GCC

static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
                                  uint8_t* const dst, int stride) {
  vst1q_u8(dst - stride, p0);
  vst1q_u8(dst, q0);
}

static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
                                  const uint8x16_t q0, const uint8x16_t q1,
                                  uint8_t* const dst, int stride) {
  Store16x2(p1, p0, dst - stride, stride);
  Store16x2(q0, q1, dst + stride, stride);
}

static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // p0 and q0 contain the u+v samples packed in low/high halves.
  vst1_u8(u - stride, vget_low_u8(p0));
  vst1_u8(u,          vget_low_u8(q0));
  vst1_u8(v - stride, vget_high_u8(p0));
  vst1_u8(v,          vget_high_u8(q0));
}

static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  // The p1...q1 registers contain the u+v samples packed in low/high halves.
  Store8x2x2(p1, p0, u - stride, v - stride, stride);
  Store8x2x2(q0, q1, u + stride, v + stride, stride);
}

#if !defined(WORK_AROUND_GCC)

#define STORE6_LANE(DST, VAL0, VAL1, LANE) do { \
  vst3_lane_u8((DST) - 3, (VAL0), (LANE)); \
  vst3_lane_u8((DST) + 0, (VAL1), (LANE)); \
  (DST) += stride; \
} while (0)

static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
                                   const uint8x16_t p0, const uint8x16_t q0,
                                   const uint8x16_t q1, const uint8x16_t q2,
                                   uint8_t* u, uint8_t* v,
                                   int stride) {
  uint8x8x3_t u0, u1, v0, v1;
  INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
  INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
  INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
  INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
  STORE6_LANE(u, u0, u1, 0);
  STORE6_LANE(u, u0, u1, 1);
  STORE6_LANE(u, u0, u1, 2);
  STORE6_LANE(u, u0, u1, 3);
  STORE6_LANE(u, u0, u1, 4);
  STORE6_LANE(u, u0, u1, 5);
  STORE6_LANE(u, u0, u1, 6);
  STORE6_LANE(u, u0, u1, 7);
  STORE6_LANE(v, v0, v1, 0);
  STORE6_LANE(v, v0, v1, 1);
  STORE6_LANE(v, v0, v1, 2);
  STORE6_LANE(v, v0, v1, 3);
  STORE6_LANE(v, v0, v1, 4);
  STORE6_LANE(v, v0, v1, 5);
  STORE6_LANE(v, v0, v1, 6);
  STORE6_LANE(v, v0, v1, 7);
}
#undef STORE6_LANE

static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
                                   const uint8x16_t q0, const uint8x16_t q1,
                                   uint8_t* const u, uint8_t* const v,
                                   int stride) {
  uint8x8x4_t u0, v0;
  INIT_VECTOR4(u0,
               vget_low_u8(p1), vget_low_u8(p0),
               vget_low_u8(q0), vget_low_u8(q1));
  INIT_VECTOR4(v0,
               vget_high_u8(p1), vget_high_u8(p0),
               vget_high_u8(q0), vget_high_u8(q1));
  vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
  vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
  vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
  vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
  vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
  vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
  vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
  vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
  vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
  vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
  vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
  vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
  vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
  vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
  vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
  vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
}

#endif  // !WORK_AROUND_GCC

// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
                                            const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                               uint8_t* const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the source pixels.
  dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);

    // Descale with rounding.
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
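    // (vrsraq_n_s16(a, b, 3) is a rounding shift-right-and-accumulate:
    //  per lane it computes a + ((b + 4) >> 3).)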
    // Add the inverse transform.
    SaturateAndStore4x4(dst, out01, out23);
  }
}

//-----------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
                              const uint8x16_t q0, const uint8x16_t q1,
                              int thresh) {
  const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
  const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
  const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
  const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
  const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
  const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
  const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
  return mask;
}

static int8x16_t FlipSign(const uint8x16_t v) {
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
}

static uint8x16_t FlipSignBack(const int8x16_t v) {
  const int8x16_t sign_bit = vdupq_n_s8(0x80);
  return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
}

static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
                              const int8x16_t q0, const int8x16_t q1) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
  const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
  const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
  return s3;
}

static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
  const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
  const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
  const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
  return s2;
}

//------------------------------------------------------------------------------

static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
                         const int8x16_t delta,
                         uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t kCst3 = vdupq_n_s8(0x03);
  const int8x16_t kCst4 = vdupq_n_s8(0x04);
  const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
  const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
  const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
  const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
  const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
  const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
  *op0 = FlipSignBack(sp0);
  *oq0 = FlipSignBack(sq0);
}

#if defined(USE_INTRINSICS)

static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
                      const uint8x16_t q0, const uint8x16_t q1,
                      const uint8x16_t mask,
                      uint8x16_t* const op0, uint8x16_t* const oq0) {
  const int8x16_t p1s = FlipSign(p1);
  const int8x16_t p0s = FlipSign(p0);
  const int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
  const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
  ApplyFilter2(p0s, q0s, delta1, op0, oq0);
}

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, op0, oq0;
  Load16x4(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store16x2(op0, oq0, p, stride);
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  uint8x16_t p1, p0, q0, q1, oq0, op0;
  Load4x16(p, stride, &p1, &p0, &q0, &q1);
  {
    const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
  }
  Store2x16(op0, oq0, p, stride);
}

#else

#define QRegs "q0", "q1", "q2", "q3", \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

#define FLIP_SIGN_BIT2(a, b, s) \
  "veor " #a "," #a "," #s " \n" \
  "veor " #b "," #b "," #s " \n" \

#define FLIP_SIGN_BIT4(a, b, c, d, s) \
  FLIP_SIGN_BIT2(a, b, s) \
  FLIP_SIGN_BIT2(c, d, s) \

#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \
  "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \
  "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \
  "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \
  "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \
  "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8 q14, " #thresh " \n" \
  "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */

#define GET_BASE_DELTA(p1, p0, q0, q1, o) \
  "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \
  "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \
557 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \
558 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \
559 "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */
560
561 #define DO_SIMPLE_FILTER(p0, q0, fl) \
562 "vmov.i8 q15, #0x03 \n" \
563 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \
564 "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \
565 "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \
566 \
567 "vmov.i8 q15, #0x04 \n" \
568 "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \
569 "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \
570 "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */
571
572 // Applies filter on 2 pixels (p0 and q0)
573 #define DO_FILTER2(p1, p0, q0, q1, thresh) \
574 NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \
575 "vmov.i8 q10, #0x80 \n" /* sign bit */ \
576 FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \
577 GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \
578 "vand q9, q9, q11 \n" /* apply filter mask */ \
579 DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \
580 FLIP_SIGN_BIT2(p0, q0, q10)
581
SimpleVFilter16(uint8_t * p,int stride,int thresh)582 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
583 __asm__ volatile (
584 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
585
586 "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1
587 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0
588 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0
589 "vld1.u8 {q12}, [%[p]] \n" // q1
590
591 DO_FILTER2(q1, q2, q3, q12, %[thresh])
592
593 "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride
594
595 "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0
596 "vst1.u8 {q3}, [%[p]] \n" // store oq0
597 : [p] "+r"(p)
598 : [stride] "r"(stride), [thresh] "r"(thresh)
599 : "memory", QRegs
600 );
601 }
602
SimpleHFilter16(uint8_t * p,int stride,int thresh)603 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
604 __asm__ volatile (
605 "sub r4, %[p], #2 \n" // base1 = p - 2
606 "lsl r6, %[stride], #1 \n" // r6 = 2 * stride
607 "add r5, r4, %[stride] \n" // base2 = base1 + stride
608
609 LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
610 LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
611 "vswp d3, d24 \n" // p1:q1 p0:q3
612 "vswp d5, d26 \n" // q0:q2 q1:q4
613 "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4
614
615 DO_FILTER2(q1, q2, q12, q13, %[thresh])
616
617 "sub %[p], %[p], #1 \n" // p - 1
618
619 "vswp d5, d24 \n"
620 STORE8x2(d4, d5, [%[p]], %[stride])
621 STORE8x2(d24, d25, [%[p]], %[stride])
622
623 : [p] "+r"(p)
624 : [stride] "r"(stride), [thresh] "r"(thresh)
625 : "memory", "r4", "r5", "r6", QRegs
626 );
627 }
628
629 #endif // USE_INTRINSICS
630
SimpleVFilter16i(uint8_t * p,int stride,int thresh)631 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
632 uint32_t k;
633 for (k = 3; k != 0; --k) {
634 p += 4 * stride;
635 SimpleVFilter16(p, stride, thresh);
636 }
637 }
638
SimpleHFilter16i(uint8_t * p,int stride,int thresh)639 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
640 uint32_t k;
641 for (k = 3; k != 0; --k) {
642 p += 4;
643 SimpleHFilter16(p, stride, thresh);
644 }
645 }
646
647 //------------------------------------------------------------------------------
648 // Complex In-loop filtering (Paragraph 15.3)
649
NeedsHev(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,int hev_thresh)650 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
651 const uint8x16_t q0, const uint8x16_t q1,
652 int hev_thresh) {
653 const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
654 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
655 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
656 const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
657 const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
658 const uint8x16_t mask = vorrq_u8(mask1, mask2);
659 return mask;
660 }
661
NeedsFilter2(const uint8x16_t p3,const uint8x16_t p2,const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t q2,const uint8x16_t q3,int ithresh,int thresh)662 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
663 const uint8x16_t p1, const uint8x16_t p0,
664 const uint8x16_t q0, const uint8x16_t q1,
665 const uint8x16_t q2, const uint8x16_t q3,
666 int ithresh, int thresh) {
667 const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
668 const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2); // abs(p3 - p2)
669 const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1); // abs(p2 - p1)
670 const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
671 const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2); // abs(q3 - q2)
672 const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1); // abs(q2 - q1)
673 const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
674 const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
675 const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
676 const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
677 const uint8x16_t max12 = vmaxq_u8(max1, max2);
678 const uint8x16_t max123 = vmaxq_u8(max12, max3);
679 const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
680 const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
681 const uint8x16_t mask = vandq_u8(mask1, mask2);
682 return mask;
683 }
684
685 // 4-points filter
686
ApplyFilter4(const int8x16_t p1,const int8x16_t p0,const int8x16_t q0,const int8x16_t q1,const int8x16_t delta0,uint8x16_t * const op1,uint8x16_t * const op0,uint8x16_t * const oq0,uint8x16_t * const oq1)687 static void ApplyFilter4(
688 const int8x16_t p1, const int8x16_t p0,
689 const int8x16_t q0, const int8x16_t q1,
690 const int8x16_t delta0,
691 uint8x16_t* const op1, uint8x16_t* const op0,
692 uint8x16_t* const oq0, uint8x16_t* const oq1) {
693 const int8x16_t kCst3 = vdupq_n_s8(0x03);
694 const int8x16_t kCst4 = vdupq_n_s8(0x04);
695 const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
696 const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
697 const int8x16_t a1 = vshrq_n_s8(delta1, 3);
698 const int8x16_t a2 = vshrq_n_s8(delta2, 3);
699 const int8x16_t a3 = vrshrq_n_s8(a1, 1); // a3 = (a1 + 1) >> 1
700 *op0 = FlipSignBack(vqaddq_s8(p0, a2)); // clip(p0 + a2)
701 *oq0 = FlipSignBack(vqsubq_s8(q0, a1)); // clip(q0 - a1)
702 *op1 = FlipSignBack(vqaddq_s8(p1, a3)); // clip(p1 + a3)
703 *oq1 = FlipSignBack(vqsubq_s8(q1, a3)); // clip(q1 - a3)
704 }
705
DoFilter4(const uint8x16_t p1,const uint8x16_t p0,const uint8x16_t q0,const uint8x16_t q1,const uint8x16_t mask,const uint8x16_t hev_mask,uint8x16_t * const op1,uint8x16_t * const op0,uint8x16_t * const oq0,uint8x16_t * const oq1)706 static void DoFilter4(
707 const uint8x16_t p1, const uint8x16_t p0,
708 const uint8x16_t q0, const uint8x16_t q1,
709 const uint8x16_t mask, const uint8x16_t hev_mask,
710 uint8x16_t* const op1, uint8x16_t* const op0,
711 uint8x16_t* const oq0, uint8x16_t* const oq1) {
712 // This is a fused version of DoFilter2() calling ApplyFilter2 directly
713 const int8x16_t p1s = FlipSign(p1);
714 int8x16_t p0s = FlipSign(p0);
715 int8x16_t q0s = FlipSign(q0);
716 const int8x16_t q1s = FlipSign(q1);
717 const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
718
719 // do_filter2 part (simple loopfilter on pixels with hev)
720 {
721 const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
722 const int8x16_t simple_lf_delta =
723 vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
724 uint8x16_t tmp_p0, tmp_q0;
725 ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
726 // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
727 p0s = FlipSign(tmp_p0);
728 q0s = FlipSign(tmp_q0);
729 }
730
731 // do_filter4 part (complex loopfilter on pixels without hev)
732 {
733 const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
734 // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
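    // (simple_lf_mask == mask & hev_mask can only have bits set where mask
    //  does, so XOR-ing it with mask clears exactly those bits.)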
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
  }
}

// 6-points filter

static void ApplyFilter6(
    const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    const int8x16_t delta,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  const int16x8_t kCst63 = vdupq_n_s16(63);
  const int8x8_t kCst27 = vdup_n_s8(27);
  const int8x8_t kCst18 = vdup_n_s8(18);
  const int8x8_t kCst9 = vdup_n_s8(9);
  const int8x8_t delta_lo = vget_low_s8(delta);
  const int8x8_t delta_hi = vget_high_s8(delta);
  const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo);  // 63 + 27 * a
  const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi);  // 63 + 27 * a
  const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo);  // 63 + 18 * a
  const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi);  // 63 + 18 * a
  const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo);   // 63 + 9 * a
  const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi);   // 63 + 9 * a
  const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
  const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
  const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
  const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
  const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
  const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
  const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
  const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
  const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);

  *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
  *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
  *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
  *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
  *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
}

static void DoFilter6(
    const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    const uint8x16_t mask, const uint8x16_t hev_mask,
    uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
  // This is a fused version of DoFilter2() calling ApplyFilter2 directly
  const int8x16_t p2s = FlipSign(p2);
  const int8x16_t p1s = FlipSign(p1);
  int8x16_t p0s = FlipSign(p0);
  int8x16_t q0s = FlipSign(q0);
  const int8x16_t q1s = FlipSign(q1);
  const int8x16_t q2s = FlipSign(q2);
  const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
  const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);

  // do_filter2 part (simple loopfilter on pixels with hev)
  {
    const int8x16_t simple_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    uint8x16_t tmp_p0, tmp_q0;
    ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
    // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
    p0s = FlipSign(tmp_p0);
    q0s = FlipSign(tmp_q0);
  }

  // do_filter6 part (complex loopfilter on pixels without hev)
  {
    // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    const int8x16_t complex_lf_delta =
        vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
                 op2, op1, op0, oq0, oq1, oq2);
  }
}

// on macroblock edges

static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store16x2(op2, op1, p - 2 * stride, stride);
    Store16x2(op0, oq0, p + 0 * stride, stride);
    Store16x2(oq1, oq2, p + 2 * stride, stride);
  }
}

static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store2x16(op2, op1, p - 2, stride);
    Store2x16(op0, oq0, p + 0, stride);
    Store2x16(oq1, oq2, p + 2, stride);
  }
}

// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load16x4(p + 2 * stride, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4 * stride;
    Load16x4(p + 2 * stride, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      // p3 and p2 are not just temporary variables here: they will be
      // re-used for next span. And q2/q3 will become p1/p0 accordingly.
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store16x4(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint32_t k;
  uint8x16_t p3, p2, p1, p0;
  Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
  for (k = 3; k != 0; --k) {
    uint8x16_t q0, q1, q2, q3;
    p += 4;
    Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    {
      const uint8x16_t mask =
          NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
      const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
      DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
      Store4x16(p1, p0, p3, p2, p, stride);
      p1 = q2;
      p0 = q3;
    }
  }
}
#endif  // !WORK_AROUND_GCC

// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
  }
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4 * stride;
  v += 4 * stride;
  Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
  }
}

#if !defined(WORK_AROUND_GCC)
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
              &op2, &op1, &op0, &oq0, &oq1, &oq2);
    Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
  }
}

static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
  u += 4;
  v += 4;
  Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
  {
    const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
                                         ithresh, thresh);
    const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    uint8x16_t op1, op0, oq0, oq1;
    DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
  }
}
#endif  // !WORK_AROUND_GCC

//-----------------------------------------------------------------------------
// Inverse transforms (Paragraph 14.4)

// Technically these are unsigned but vqdmulh is only available in signed.
// vqdmulh returns high half (effectively >> 16) but also doubles the value,
// changing the >> 16 to >> 15 and requiring an additional >> 1.
// We use this to our advantage with kC2. The canonical value is 35468.
// However, the high bit is set so treating it as signed will give incorrect
// results. We avoid this by down shifting by 1 here to clear the highest bit.
// Combined with the doubling effect of vqdmulh we get >> 16.
// This can not be applied to kC1 because the lowest bit is set. Down shifting
// the constant would reduce precision.

// libwebp uses a trick to avoid some extra addition that libvpx does.
// Instead of:
// temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
// same issue with kC1 and vqdmulh that we work around by down shifting kC2

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
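// Quick sanity check of the halved constant (example values, not from the
// spec): for x = 1000, (x * 35468) >> 16 == 541 and
// vqdmulh(x, 17734) == (2 * x * 17734) >> 16 == 541, so halving kC2 exactly
// compensates for vqdmulh's implicit doubling (35468 is even, so no precision
// is lost).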

#if defined(USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
}

static void TransformOne(const int16_t* in, uint8_t* dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass(&rows);
  TransformPass(&rows);
  Add4x4(rows.val[0], rows.val[1], dst);
}

#else

static void TransformOne(const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  // kC1, kC2. Padded because vld1.16 loads 8 bytes
  const int16_t constants[4] = { kC1, kC2, 0, 0 };
  /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
  __asm__ volatile (
    "vld1.16 {q1, q2}, [%[in]] \n"
    "vld1.16 {d0}, [%[constants]] \n"

    /* d2: in[0]
     * d3: in[8]
     * d4: in[4]
     * d5: in[12]
     */
    "vswp d3, d4 \n"

    /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
     * q9 = {in[4], in[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = in[0] + in[8]
     * d23 = b = in[0] - in[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* The multiplication should be x * kC1 >> 16
     * However, with vqdmulh we get x * kC1 * 2 >> 16
     * (multiply, double, return high half)
     * We avoided this in kC2 by pre-shifting the constant.
     * q8 = in[4]/[12] * kC1 >> 16
     */
    "vshr.s16 q8, q8, #1 \n"

    /* Add {in[4], in[12]} back after the multiplication. This is handled by
     * adding 1 << 16 to kC1 in the libwebp C code.
     */
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = in[4]*kC2 - in[12]*kC1
     * d21 = d = in[4]*kC1 + in[12]*kC2
     */
    "vqsub.s16 d20, d18, d17 \n"
    "vqadd.s16 d21, d19, d16 \n"

    /* d2 = tmp[0] = a + d
     * d3 = tmp[1] = b + c
     * d4 = tmp[2] = b - c
     * d5 = tmp[3] = a - d
     */
    "vqadd.s16 d2, d22, d21 \n"
    "vqadd.s16 d3, d23, d20 \n"
    "vqsub.s16 d4, d23, d20 \n"
    "vqsub.s16 d5, d22, d21 \n"

    "vzip.16 q1, q2 \n"
    "vzip.16 q1, q2 \n"

    "vswp d3, d4 \n"

    /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
     * q9 = {tmp[4], tmp[12]} * kC2 >> 16
     */
    "vqdmulh.s16 q8, q2, d0[0] \n"
    "vqdmulh.s16 q9, q2, d0[1] \n"

    /* d22 = a = tmp[0] + tmp[8]
     * d23 = b = tmp[0] - tmp[8]
     */
    "vqadd.s16 d22, d2, d3 \n"
    "vqsub.s16 d23, d2, d3 \n"

    /* See long winded explanations prior */
    "vshr.s16 q8, q8, #1 \n"
    "vqadd.s16 q8, q2, q8 \n"

    /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
     * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
     */
1116 "vqsub.s16 d20, d18, d17 \n"
1117 "vqadd.s16 d21, d19, d16 \n"
1118
1119 /* d2 = tmp[0] = a + d
1120 * d3 = tmp[1] = b + c
1121 * d4 = tmp[2] = b - c
1122 * d5 = tmp[3] = a - d
1123 */
1124 "vqadd.s16 d2, d22, d21 \n"
1125 "vqadd.s16 d3, d23, d20 \n"
1126 "vqsub.s16 d4, d23, d20 \n"
1127 "vqsub.s16 d5, d22, d21 \n"
1128
1129 "vld1.32 d6[0], [%[dst]], %[kBPS] \n"
1130 "vld1.32 d6[1], [%[dst]], %[kBPS] \n"
1131 "vld1.32 d7[0], [%[dst]], %[kBPS] \n"
1132 "vld1.32 d7[1], [%[dst]], %[kBPS] \n"
1133
1134 "sub %[dst], %[dst], %[kBPS], lsl #2 \n"
1135
1136 /* (val) + 4 >> 3 */
1137 "vrshr.s16 d2, d2, #3 \n"
1138 "vrshr.s16 d3, d3, #3 \n"
1139 "vrshr.s16 d4, d4, #3 \n"
1140 "vrshr.s16 d5, d5, #3 \n"
1141
1142 "vzip.16 q1, q2 \n"
1143 "vzip.16 q1, q2 \n"
1144
1145 /* Must accumulate before saturating */
1146 "vmovl.u8 q8, d6 \n"
1147 "vmovl.u8 q9, d7 \n"
1148
1149 "vqadd.s16 q1, q1, q8 \n"
1150 "vqadd.s16 q2, q2, q9 \n"
1151
1152 "vqmovun.s16 d0, q1 \n"
1153 "vqmovun.s16 d1, q2 \n"
1154
1155 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
1156 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
1157 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
1158 "vst1.32 d1[1], [%[dst]] \n"
1159
1160 : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */
1161 : [kBPS] "r"(kBPS), [constants] "r"(constants) /* constants */
1162 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" /* clobbered */
1163 );
1164 }
1165
1166 #endif // USE_INTRINSICS
1167
TransformTwo(const int16_t * in,uint8_t * dst,int do_two)1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
1169 TransformOne(in, dst);
1170 if (do_two) {
1171 TransformOne(in + 16, dst + 4);
1172 }
1173 }
1174
TransformDC(const int16_t * in,uint8_t * dst)1175 static void TransformDC(const int16_t* in, uint8_t* dst) {
1176 const int16x8_t DC = vdupq_n_s16(in[0]);
1177 Add4x4(DC, DC, dst);
1178 }
1179
1180 //------------------------------------------------------------------------------
1181
1182 #define STORE_WHT(dst, col, rows) do { \
1183 *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1184 *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1185 *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1186 *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1187 } while (0)
1188
TransformWHT(const int16_t * in,int16_t * out)1189 static void TransformWHT(const int16_t* in, int16_t* out) {
1190 int32x4x4_t tmp;
1191
1192 {
1193 // Load the source.
1194 const int16x4_t in00_03 = vld1_s16(in + 0);
1195 const int16x4_t in04_07 = vld1_s16(in + 4);
1196 const int16x4_t in08_11 = vld1_s16(in + 8);
1197 const int16x4_t in12_15 = vld1_s16(in + 12);
1198 const int32x4_t a0 = vaddl_s16(in00_03, in12_15); // in[0..3] + in[12..15]
1199 const int32x4_t a1 = vaddl_s16(in04_07, in08_11); // in[4..7] + in[8..11]
1200 const int32x4_t a2 = vsubl_s16(in04_07, in08_11); // in[4..7] - in[8..11]
1201 const int32x4_t a3 = vsubl_s16(in00_03, in12_15); // in[0..3] - in[12..15]
1202 tmp.val[0] = vaddq_s32(a0, a1);
1203 tmp.val[1] = vaddq_s32(a3, a2);
1204 tmp.val[2] = vsubq_s32(a0, a1);
1205 tmp.val[3] = vsubq_s32(a3, a2);
1206 // Arrange the temporary results column-wise.
1207 tmp = Transpose4x4(tmp);
1208 }
1209
1210 {
1211 const int32x4_t kCst3 = vdupq_n_s32(3);
1212 const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3); // add rounder
1213 const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1214 const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1215 const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1216 const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1217
1218 tmp.val[0] = vaddq_s32(a0, a1);
1219 tmp.val[1] = vaddq_s32(a3, a2);
1220 tmp.val[2] = vsubq_s32(a0, a1);
1221 tmp.val[3] = vsubq_s32(a3, a2);
1222
1223 // right shift the results by 3.
1224 tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1225 tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1226 tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1227 tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1228
1229 STORE_WHT(out, 0, tmp);
1230 STORE_WHT(out, 1, tmp);
1231 STORE_WHT(out, 2, tmp);
1232 STORE_WHT(out, 3, tmp);
1233 }
1234 }
1235
1236 #undef STORE_WHT
1237
1238 //------------------------------------------------------------------------------
1239
1240 #define MUL(a, b) (((a) * (b)) >> 16)
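// Presumably used when only in[0], in[1] and in[4] (the first three
// coefficients in zig-zag order) can be non-zero; only those values are read.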
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1_full = 20091 + (1 << 16);
  static const int kC2_full = 35468;
  const int16x4_t A = vdup_n_s16(in[0]);
  const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
  const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
  const int c1 = MUL(in[1], kC2_full);
  const int d1 = MUL(in[1], kC1_full);
  const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
                      (uint64_t)( c1 & 0xffff) << 16 |
                      (uint64_t)(-c1 & 0xffff) << 32 |
                      (uint64_t)(-d1 & 0xffff) << 48;
  const int16x4_t CD = vcreate_s16(cd);
  const int16x4_t B = vqadd_s16(A, CD);
  const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
  const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
  Add4x4(m0_m1, m2_m3, dst);
}
#undef MUL

#endif  // WEBP_USE_NEON

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitNEON(void);

void VP8DspInitNEON(void) {
#if defined(WEBP_USE_NEON)
  VP8Transform = TransformTwo;
  VP8TransformAC3 = TransformAC3;
  VP8TransformDC = TransformDC;
  VP8TransformWHT = TransformWHT;

  VP8VFilter16 = VFilter16;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16 = HFilter16;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter16i = HFilter16i;
#endif
  VP8VFilter8 = VFilter8;
  VP8VFilter8i = VFilter8i;
#if !defined(WORK_AROUND_GCC)
  VP8HFilter8 = HFilter8;
  VP8HFilter8i = HFilter8i;
#endif
  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;
#endif  // WEBP_USE_NEON
}