1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of speed-critical encoding functions.
11 //
12 // adapted from libvpx (https://www.webmproject.org/code/)
13
14 #include "src/dsp/dsp.h"
15
16 #if defined(WEBP_USE_NEON)
17
18 #include <assert.h>
19
20 #include "src/dsp/neon.h"
21 #include "src/enc/vp8i_enc.h"
22
23 //------------------------------------------------------------------------------
24 // Transforms (Paragraph 14.4)
25
26 // Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// that the prediction is read from *ref (and the result written to *dst).
// See the comments there for algorithmic explanations.
29
static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of the AC3_C2 constant; see note below.
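// Note: vqdmulhq_n_s16(x, k) computes (2 * x * k) >> 16. kC2 is stored halved
// so that the multiply directly yields (x * WEBP_TRANSFORM_AC3_C2) >> 16.
// kC1 is kept whole: the doubling is undone by the '>> 1' inside
// vsraq_n_s16() below, which also adds the input back, giving
// x + ((x * WEBP_TRANSFORM_AC3_C1) >> 16).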
33
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6), so it is only compiled when WEBP_USE_INTRINSICS is defined.
// With gcc-4.8, it is a little faster than the inlined assembly.
38 #if defined(WEBP_USE_INTRINSICS)
39
// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
42 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
43 }
44
45 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
46 // to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
48 const int16x8_t dst01,
49 const int16x8_t dst23) {
50 // Unsigned saturate to 8b.
51 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
52 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
53
54 // Store the results.
55 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
56 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
57 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
58 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
59 }
60
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
62 const int16x8_t row23,
63 const uint8_t* WEBP_RESTRICT const ref,
64 uint8_t* WEBP_RESTRICT const dst) {
65 uint32x2_t dst01 = vdup_n_u32(0);
66 uint32x2_t dst23 = vdup_n_u32(0);
67
68 // Load the source pixels.
69 dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
70 dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
71 dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
72 dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
73
74 {
75 // Convert to 16b.
76 const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
77 const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
78
79 // Descale with rounding.
80 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
81 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
82 // Add the inverse transform.
83 SaturateAndStore4x4_NEON(dst, out01, out23);
84 }
85 }
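// For reference, vrsraq_n_s16(dst, row, 3) used above computes, per lane,
//   dst[i] + ((row[i] + 4) >> 3)
// i.e. the rounded descaling of the inverse-transform output and the addition
// of the prediction in a single instruction.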
86
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
88 const int16x8_t in1,
89 int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
92 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
93 // b0 d0 b1 d1 b2 d2 ...
94 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
95 }
96
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
98 // {rows} = in0 | in4
99 // in8 | in12
100 // B1 = in4 | in12
101 const int16x8_t B1 =
102 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
103 // C0 = kC1 * in4 | kC1 * in12
104 // C1 = kC2 * in4 | kC2 * in12
105 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
106 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
107 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
108 vget_low_s16(rows->val[1])); // in0 + in8
109 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
110 vget_low_s16(rows->val[1])); // in0 - in8
111 // c = kC2 * in4 - kC1 * in12
112 // d = kC1 * in4 + kC2 * in12
113 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
114 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
115 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
116 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
117 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
118 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
119 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
120 Transpose8x2_NEON(E0, E1, rows);
121 }
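// A scalar sketch of one pass, for reference only (not compiled; the NEON
// path above is the real implementation and additionally saturates the
// adds/subs). Each pass works on columns and writes its output transposed.
#if 0
static void TransformPassRef(const int16_t in[16], int16_t out[16]) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int a = in[i + 0] + in[i + 8];
    const int b = in[i + 0] - in[i + 8];
    // MUL1(x) = x + ((x * kC1) >> 16), MUL2(x) = (x * 2 * kC2) >> 16
    const int c = ((in[i + 4] * 2 * kC2) >> 16) -
                  (in[i + 12] + ((in[i + 12] * kC1) >> 16));
    const int d = (in[i + 4] + ((in[i + 4] * kC1) >> 16)) +
                  ((in[i + 12] * 2 * kC2) >> 16);
    out[4 * i + 0] = (int16_t)(a + d);
    out[4 * i + 1] = (int16_t)(b + c);
    out[4 * i + 2] = (int16_t)(b - c);
    out[4 * i + 3] = (int16_t)(a - d);
  }
}
#endif  // 0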
122
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
124 const int16_t* WEBP_RESTRICT in,
125 uint8_t* WEBP_RESTRICT dst) {
126 int16x8x2_t rows;
127 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
128 TransformPass_NEON(&rows);
129 TransformPass_NEON(&rows);
130 Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
131 }
132
133 #else
134
static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
136 const int16_t* WEBP_RESTRICT in,
137 uint8_t* WEBP_RESTRICT dst) {
138 const int kBPS = BPS;
139 const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
140
141 __asm__ volatile (
142 "vld1.16 {q1, q2}, [%[in]] \n"
143 "vld1.16 {d0}, [%[kC1C2]] \n"
144
145 // d2: in[0]
146 // d3: in[8]
147 // d4: in[4]
148 // d5: in[12]
149 "vswp d3, d4 \n"
150
151 // q8 = {in[4], in[12]} * kC1 * 2 >> 16
152 // q9 = {in[4], in[12]} * kC2 >> 16
153 "vqdmulh.s16 q8, q2, d0[0] \n"
154 "vqdmulh.s16 q9, q2, d0[1] \n"
155
156 // d22 = a = in[0] + in[8]
157 // d23 = b = in[0] - in[8]
158 "vqadd.s16 d22, d2, d3 \n"
159 "vqsub.s16 d23, d2, d3 \n"
160
161 // q8 = in[4]/[12] * kC1 >> 16
162 "vshr.s16 q8, q8, #1 \n"
163
164 // Add {in[4], in[12]} back after the multiplication.
165 "vqadd.s16 q8, q2, q8 \n"
166
167 // d20 = c = in[4]*kC2 - in[12]*kC1
168 // d21 = d = in[4]*kC1 + in[12]*kC2
169 "vqsub.s16 d20, d18, d17 \n"
170 "vqadd.s16 d21, d19, d16 \n"
171
172 // d2 = tmp[0] = a + d
173 // d3 = tmp[1] = b + c
174 // d4 = tmp[2] = b - c
175 // d5 = tmp[3] = a - d
176 "vqadd.s16 d2, d22, d21 \n"
177 "vqadd.s16 d3, d23, d20 \n"
178 "vqsub.s16 d4, d23, d20 \n"
179 "vqsub.s16 d5, d22, d21 \n"
180
181 "vzip.16 q1, q2 \n"
182 "vzip.16 q1, q2 \n"
183
184 "vswp d3, d4 \n"
185
186 // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
187 // q9 = {tmp[4], tmp[12]} * kC2 >> 16
188 "vqdmulh.s16 q8, q2, d0[0] \n"
189 "vqdmulh.s16 q9, q2, d0[1] \n"
190
191 // d22 = a = tmp[0] + tmp[8]
192 // d23 = b = tmp[0] - tmp[8]
193 "vqadd.s16 d22, d2, d3 \n"
194 "vqsub.s16 d23, d2, d3 \n"
195
196 "vshr.s16 q8, q8, #1 \n"
197 "vqadd.s16 q8, q2, q8 \n"
198
    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
201 "vqsub.s16 d20, d18, d17 \n"
202 "vqadd.s16 d21, d19, d16 \n"
203
204 // d2 = tmp[0] = a + d
205 // d3 = tmp[1] = b + c
206 // d4 = tmp[2] = b - c
207 // d5 = tmp[3] = a - d
208 "vqadd.s16 d2, d22, d21 \n"
209 "vqadd.s16 d3, d23, d20 \n"
210 "vqsub.s16 d4, d23, d20 \n"
211 "vqsub.s16 d5, d22, d21 \n"
212
213 "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
214 "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
215 "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
216 "vld1.32 d7[1], [%[ref]], %[kBPS] \n"
217
218 "sub %[ref], %[ref], %[kBPS], lsl #2 \n"
219
    // (val + 4) >> 3
221 "vrshr.s16 d2, d2, #3 \n"
222 "vrshr.s16 d3, d3, #3 \n"
223 "vrshr.s16 d4, d4, #3 \n"
224 "vrshr.s16 d5, d5, #3 \n"
225
226 "vzip.16 q1, q2 \n"
227 "vzip.16 q1, q2 \n"
228
229 // Must accumulate before saturating
230 "vmovl.u8 q8, d6 \n"
231 "vmovl.u8 q9, d7 \n"
232
233 "vqadd.s16 q1, q1, q8 \n"
234 "vqadd.s16 q2, q2, q9 \n"
235
236 "vqmovun.s16 d0, q1 \n"
237 "vqmovun.s16 d1, q2 \n"
238
239 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
240 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
241 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
242 "vst1.32 d1[1], [%[dst]] \n"
243
244 : [in] "+r"(in), [dst] "+r"(dst) // modified registers
245 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
246 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
247 );
248 }
249
250 #endif // WEBP_USE_INTRINSICS
251
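// When 'do_two' is set, the second 4x4 block is the one immediately to the
// right of the first: its 16 coefficients follow in memory (in + 16) and its
// pixels sit 4 columns over (ref + 4, dst + 4).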
static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
253 const int16_t* WEBP_RESTRICT in,
254 uint8_t* WEBP_RESTRICT dst, int do_two) {
255 ITransformOne_NEON(ref, in, dst);
256 if (do_two) {
257 ITransformOne_NEON(ref + 4, in + 16, dst + 4);
258 }
259 }
260
261 // Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
263 uint32x4_t out = vdupq_n_u32(0);
264 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
265 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
266 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
267 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
268 return vreinterpretq_u8_u32(out);
269 }
270
271 // Forward transform.
272
273 #if defined(WEBP_USE_INTRINSICS)
274
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
276 const int16x4_t B,
277 const int16x4_t C,
278 const int16x4_t D,
279 int16x8_t* const out01,
280 int16x8_t* const out32) {
281 const int16x4x2_t AB = vtrn_s16(A, B);
282 const int16x4x2_t CD = vtrn_s16(C, D);
283 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
284 vreinterpret_s32_s16(CD.val[0]));
285 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
286 vreinterpret_s32_s16(CD.val[1]));
287 *out01 = vreinterpretq_s16_s64(
288 vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
289 vreinterpret_s64_s32(tmp13.val[0])));
290 *out32 = vreinterpretq_s16_s64(
291 vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
292 vreinterpret_s64_s32(tmp02.val[1])));
293 }
294
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
296 const uint8x8_t b) {
297 return vreinterpretq_s16_u16(vsubl_u8(a, b));
298 }
299
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
301 const uint8_t* WEBP_RESTRICT ref,
302 int16_t* WEBP_RESTRICT out) {
303 int16x8_t d0d1, d3d2; // working 4x4 int16 variables
304 {
305 const uint8x16_t S0 = Load4x4_NEON(src);
306 const uint8x16_t R0 = Load4x4_NEON(ref);
307 const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
308 const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
309 const int16x4_t D0 = vget_low_s16(D0D1);
310 const int16x4_t D1 = vget_high_s16(D0D1);
311 const int16x4_t D2 = vget_low_s16(D2D3);
312 const int16x4_t D3 = vget_high_s16(D2D3);
313 Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
314 }
  { // 1st pass
316 const int32x4_t kCst937 = vdupq_n_s32(937);
317 const int32x4_t kCst1812 = vdupq_n_s32(1812);
318 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
319 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
320 const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
321 const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
322 vget_high_s16(a0a1_2));
323 const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
324 vget_high_s16(a0a1_2));
325 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
326 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
327 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
328 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
329 const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
330 const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
331 Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
332 }
333 { // 2nd pass
334 // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
335 const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
336 const int32x4_t kCst51000 = vdupq_n_s32(51000);
337 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
338 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
339 const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
340 const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
341 const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
342 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
343 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
344 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
345 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
346 const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
347 const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
348 const int16x4_t a3_eq_0 =
349 vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
350 const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
351 vst1_s16(out + 0, out0);
352 vst1_s16(out + 4, out1);
353 vst1_s16(out + 8, out2);
354 vst1_s16(out + 12, out3);
355 }
356 }
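// Note on the second pass above: the C reference computes
//   out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0)
// Folding an extra (1 << 16) into kCst12000 makes the narrowing '>> 16' of
// vaddhn_s32() produce the '+1' unconditionally; vceq_s16() then yields -1 in
// the lanes where a3 == 0, and the final vadd_s16() takes that 1 back out.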
357
358 #else
359
360 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
361 static const int16_t kCoeff16[] = {
362 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
363 };
364 static const int32_t kCoeff32[] = {
365 1812, 1812, 1812, 1812,
366 937, 937, 937, 937,
367 12000, 12000, 12000, 12000,
368 51000, 51000, 51000, 51000
369 };
370
static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
372 const uint8_t* WEBP_RESTRICT ref,
373 int16_t* WEBP_RESTRICT out) {
374 const int kBPS = BPS;
375 const uint8_t* src_ptr = src;
376 const uint8_t* ref_ptr = ref;
377 const int16_t* coeff16 = kCoeff16;
378 const int32_t* coeff32 = kCoeff32;
379
380 __asm__ volatile (
381 // load src into q4, q5 in high half
382 "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
383 "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
384 "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
385 "vld1.8 {d11}, [%[src_ptr]] \n"
386
387 // load ref into q6, q7 in high half
388 "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
389 "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
390 "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
391 "vld1.8 {d15}, [%[ref_ptr]] \n"
392
    // Pack the high values into q4 and q6
394 "vtrn.32 q4, q5 \n"
395 "vtrn.32 q6, q7 \n"
396
397 // d[0-3] = src - ref
398 "vsubl.u8 q0, d8, d12 \n"
399 "vsubl.u8 q1, d9, d13 \n"
400
401 // load coeff16 into q8(d16=5352, d17=2217)
402 "vld1.16 {q8}, [%[coeff16]] \n"
403
    // load coeff32 first half into q9 = 1812, q10 = 937
405 "vld1.32 {q9, q10}, [%[coeff32]]! \n"
406
    // load coeff32 second half into q11 = 12000, q12 = 51000
408 "vld1.32 {q11,q12}, [%[coeff32]] \n"
409
410 // part 1
411 // Transpose. Register dN is the same as dN in C
412 "vtrn.32 d0, d2 \n"
413 "vtrn.32 d1, d3 \n"
414 "vtrn.16 d0, d1 \n"
415 "vtrn.16 d2, d3 \n"
416
417 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
418 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
419 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
420 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
421
422 "vadd.s16 d0, d4, d5 \n" // a0 + a1
423 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
424 "vsub.s16 d2, d4, d5 \n" // a0 - a1
    "vshl.s16 d2, d2, #3 \n" // temp[2+i*4] = (a0-a1) << 3
426
427 "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
428 "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
429 "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
430 "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
431
432 // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
433 // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
434 "vshrn.s32 d1, q9, #9 \n"
435 "vshrn.s32 d3, q10, #9 \n"
436
437 // part 2
438 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
439 "vtrn.32 d0, d2 \n"
440 "vtrn.32 d1, d3 \n"
441 "vtrn.16 d0, d1 \n"
442 "vtrn.16 d2, d3 \n"
443
444 "vmov.s16 d26, #7 \n"
445
446 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
447 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
448 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
449 "vadd.s16 d4, d4, d26 \n" // a1 + 7
450 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
451
452 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
453 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
454
455 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
456 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
457
458 "vceq.s16 d4, d7, #0 \n"
459
460 "vshr.s16 d0, d0, #4 \n"
461 "vshr.s16 d2, d2, #4 \n"
462
463 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
464 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
465
466 "vmvn d4, d4 \n" // !(d1 == 0)
467 // op[4] = (c1*2217 + d1*5352 + 12000)>>16
468 "vshrn.s32 d1, q11, #16 \n"
469 // op[4] += (d1!=0)
470 "vsub.s16 d1, d1, d4 \n"
471 // op[12]= (d1*2217 - c1*5352 + 51000)>>16
472 "vshrn.s32 d3, q12, #16 \n"
473
474 // set result to out array
475 "vst1.16 {q0, q1}, [%[out]] \n"
476 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
477 [coeff32] "+r"(coeff32) // modified registers
478 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
479 [out] "r"(out) // constants
480 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
481 "q10", "q11", "q12", "q13" // clobbered
482 );
483 }
484
485 #endif
486
487 #define LOAD_LANE_16b(VALUE, LANE) do { \
488 (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
489 src += stride; \
490 } while (0)
491
static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
493 int16_t* WEBP_RESTRICT out) {
494 const int stride = 16;
495 const int16x4_t zero = vdup_n_s16(0);
496 int32x4x4_t tmp0;
497 int16x4x4_t in;
498 INIT_VECTOR4(in, zero, zero, zero, zero);
499 LOAD_LANE_16b(in.val[0], 0);
500 LOAD_LANE_16b(in.val[1], 0);
501 LOAD_LANE_16b(in.val[2], 0);
502 LOAD_LANE_16b(in.val[3], 0);
503 LOAD_LANE_16b(in.val[0], 1);
504 LOAD_LANE_16b(in.val[1], 1);
505 LOAD_LANE_16b(in.val[2], 1);
506 LOAD_LANE_16b(in.val[3], 1);
507 LOAD_LANE_16b(in.val[0], 2);
508 LOAD_LANE_16b(in.val[1], 2);
509 LOAD_LANE_16b(in.val[2], 2);
510 LOAD_LANE_16b(in.val[3], 2);
511 LOAD_LANE_16b(in.val[0], 3);
512 LOAD_LANE_16b(in.val[1], 3);
513 LOAD_LANE_16b(in.val[2], 3);
514 LOAD_LANE_16b(in.val[3], 3);
515
516 {
517 // a0 = in[0 * 16] + in[2 * 16]
518 // a1 = in[1 * 16] + in[3 * 16]
519 // a2 = in[1 * 16] - in[3 * 16]
520 // a3 = in[0 * 16] - in[2 * 16]
521 const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
522 const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
523 const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
524 const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
525 tmp0.val[0] = vaddq_s32(a0, a1);
526 tmp0.val[1] = vaddq_s32(a3, a2);
527 tmp0.val[2] = vsubq_s32(a3, a2);
528 tmp0.val[3] = vsubq_s32(a0, a1);
529 }
530 {
531 const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
532 // a0 = tmp[0 + i] + tmp[ 8 + i]
533 // a1 = tmp[4 + i] + tmp[12 + i]
534 // a2 = tmp[4 + i] - tmp[12 + i]
535 // a3 = tmp[0 + i] - tmp[ 8 + i]
536 const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
537 const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
538 const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
539 const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
540 const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
541 const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
542 const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
543 const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
544 const int16x4_t out0 = vmovn_s32(b0);
545 const int16x4_t out1 = vmovn_s32(b1);
546 const int16x4_t out2 = vmovn_s32(b2);
547 const int16x4_t out3 = vmovn_s32(b3);
548
549 vst1_s16(out + 0, out0);
550 vst1_s16(out + 4, out1);
551 vst1_s16(out + 8, out2);
552 vst1_s16(out + 12, out3);
553 }
554 }
555 #undef LOAD_LANE_16b
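// Scalar view of FTransformWHT_NEON() above (illustrative): 'src' points to
// the DC coefficient of each of the sixteen 4x4 luma blocks, stored 16 values
// apart (hence the stride of 16 in LOAD_LANE_16b). Both passes are plain
// Hadamard butterflies; the final halving ((a +/- b) >> 1, done here with
// vhaddq_s32/vhsubq_s32) matches the '>> 1' of the C reference.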
556
557 //------------------------------------------------------------------------------
558 // Texture distortion
559 //
560 // We try to match the spectral content (weighted) between source and
561 // reconstructed samples.
562
563 // a 0123, b 0123
564 // a 4567, b 4567
565 // a 89ab, b 89ab
566 // a cdef, b cdef
567 //
568 // transpose
569 //
570 // a 048c, b 048c
571 // a 159d, b 159d
572 // a 26ae, b 26ae
573 // a 37bf, b 37bf
574 //
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
576 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
577 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
578 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
579 vreinterpretq_s32_s16(q2_tmp1.val[0]));
580 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
581 vreinterpretq_s32_s16(q2_tmp1.val[1]));
582 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
583 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
584 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
585 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
586 return q4_in;
587 }
588
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
590 const int16x8x4_t q4_in) {
591 // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
592 // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
593 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
594 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
595 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
596 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
597 int16x8x4_t q4_out;
598 // tmp[0] = a0 + a1
599 // tmp[1] = a3 + a2
600 // tmp[2] = a3 - a2
601 // tmp[3] = a0 - a1
602 INIT_VECTOR4(q4_out,
603 vabsq_s16(vaddq_s16(q_a0, q_a1)),
604 vabsq_s16(vaddq_s16(q_a3, q_a2)),
605 vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
606 return q4_out;
607 }
608
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
610 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
611 q4_in.val[2]));
612 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
613 q4_in.val[3]));
614 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
615 q4_in.val[3]));
616 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
617 q4_in.val[2]));
618 int16x8x4_t q4_out;
619
620 INIT_VECTOR4(q4_out,
621 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
622 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
623 return q4_out;
624 }
625
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
627 const uint16x8_t q_w07 = vld1q_u16(&w[0]);
628 const uint16x8_t q_w8f = vld1q_u16(&w[8]);
629 int16x4x4_t d4_w;
630 INIT_VECTOR4(d4_w,
631 vget_low_s16(vreinterpretq_s16_u16(q_w07)),
632 vget_high_s16(vreinterpretq_s16_u16(q_w07)),
633 vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
634 vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
635 return d4_w;
636 }
637
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
639 const int16x4x4_t d4_w) {
640 int32x2_t d_sum;
641 // sum += w[ 0] * abs(b0);
642 // sum += w[ 4] * abs(b1);
643 // sum += w[ 8] * abs(b2);
644 // sum += w[12] * abs(b3);
645 int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
646 int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
647 int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
648 int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
649 q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
650 q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
651 q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
652 q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
653
654 q_sum0 = vaddq_s32(q_sum0, q_sum1);
655 q_sum2 = vaddq_s32(q_sum2, q_sum3);
656 q_sum2 = vaddq_s32(q_sum0, q_sum2);
657 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
658 d_sum = vpadd_s32(d_sum, d_sum);
659 return d_sum;
660 }
661
662 #define LOAD_LANE_32b(src, VALUE, LANE) \
663 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
664
665 // Hadamard transform
666 // Returns the weighted sum of the absolute value of transformed coefficients.
667 // w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
669 const uint8_t* WEBP_RESTRICT const b,
670 const uint16_t* WEBP_RESTRICT const w) {
671 uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
672 uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
673 uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
674 uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
675 uint8x8x4_t d4_in;
676
677 // load data a, b
678 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
679 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
680 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
681 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
682 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
683 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
684 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
685 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
686 INIT_VECTOR4(d4_in,
687 vreinterpret_u8_u32(d_in_ab_0123),
688 vreinterpret_u8_u32(d_in_ab_4567),
689 vreinterpret_u8_u32(d_in_ab_89ab),
690 vreinterpret_u8_u32(d_in_ab_cdef));
691
692 {
693 // Vertical pass first to avoid a transpose (vertical and horizontal passes
694 // are commutative because w/kWeightY is symmetric) and subsequent
695 // transpose.
696 const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
697 const int16x4x4_t d4_w = DistoLoadW_NEON(w);
698 // horizontal pass
699 const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
700 const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
701 int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
702
703 // abs(sum2 - sum1) >> 5
704 d_sum = vabs_s32(d_sum);
705 d_sum = vshr_n_s32(d_sum, 5);
706 return vget_lane_s32(d_sum, 0);
707 }
708 }
709 #undef LOAD_LANE_32b
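// In scalar terms (illustrative), Disto4x4_NEON() above evaluates
//   | sum_i w[i] * |H(a)[i]|  -  sum_i w[i] * |H(b)[i]| |  >> 5
// where H() is the 4x4 Hadamard transform of the block, i.e. the same value
// as abs(TTransform(b, w) - TTransform(a, w)) >> 5 in the C version.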
710
static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
712 const uint8_t* WEBP_RESTRICT const b,
713 const uint16_t* WEBP_RESTRICT const w) {
714 int D = 0;
715 int x, y;
716 for (y = 0; y < 16 * BPS; y += 4 * BPS) {
717 for (x = 0; x < 16; x += 4) {
718 D += Disto4x4_NEON(a + x + y, b + x + y, w);
719 }
720 }
721 return D;
722 }
723
724 //------------------------------------------------------------------------------
725
static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
727 const uint8_t* WEBP_RESTRICT pred,
728 int start_block, int end_block,
729 VP8Histogram* WEBP_RESTRICT const histo) {
730 const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
731 int j;
732 int distribution[MAX_COEFF_THRESH + 1] = { 0 };
733 for (j = start_block; j < end_block; ++j) {
734 int16_t out[16];
735 FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
736 {
737 int k;
738 const int16x8_t a0 = vld1q_s16(out + 0);
739 const int16x8_t b0 = vld1q_s16(out + 8);
740 const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
741 const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
742 const uint16x8_t a2 = vshrq_n_u16(a1, 3);
743 const uint16x8_t b2 = vshrq_n_u16(b1, 3);
744 const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
745 const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
746 vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
747 vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
748 // Convert coefficients to bin.
749 for (k = 0; k < 16; ++k) {
750 ++distribution[out[k]];
751 }
752 }
753 }
754 VP8SetHistogramData(distribution, histo);
755 }
756
757 //------------------------------------------------------------------------------
758
static WEBP_INLINE void AccumulateSSE16_NEON(
760 const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
761 uint32x4_t* const sum) {
762 const uint8x16_t a0 = vld1q_u8(a);
763 const uint8x16_t b0 = vld1q_u8(b);
764 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
765 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
766 vget_low_u8(abs_diff));
767 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
768 vget_high_u8(abs_diff));
769 /* pair-wise adds and widen */
770 const uint32x4_t sum1 = vpaddlq_u16(prod1);
771 const uint32x4_t sum2 = vpaddlq_u16(prod2);
772 *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
773 }
774
775 // Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
777 #if WEBP_AARCH64
778 return (int)vaddvq_u32(sum);
779 #else
780 const uint64x2_t sum2 = vpaddlq_u32(sum);
781 const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
782 vreinterpret_u32_u64(vget_high_u64(sum2)));
783 return (int)vget_lane_u32(sum3, 0);
784 #endif
785 }
786
static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
788 const uint8_t* WEBP_RESTRICT b) {
789 uint32x4_t sum = vdupq_n_u32(0);
790 int y;
791 for (y = 0; y < 16; ++y) {
792 AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
793 }
794 return SumToInt_NEON(sum);
795 }
796
static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
798 const uint8_t* WEBP_RESTRICT b) {
799 uint32x4_t sum = vdupq_n_u32(0);
800 int y;
801 for (y = 0; y < 8; ++y) {
802 AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
803 }
804 return SumToInt_NEON(sum);
805 }
806
static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
808 const uint8_t* WEBP_RESTRICT b) {
809 uint32x4_t sum = vdupq_n_u32(0);
810 int y;
811 for (y = 0; y < 8; ++y) {
812 const uint8x8_t a0 = vld1_u8(a + y * BPS);
813 const uint8x8_t b0 = vld1_u8(b + y * BPS);
814 const uint8x8_t abs_diff = vabd_u8(a0, b0);
815 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
816 sum = vpadalq_u16(sum, prod);
817 }
818 return SumToInt_NEON(sum);
819 }
820
static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
822 const uint8_t* WEBP_RESTRICT b) {
823 const uint8x16_t a0 = Load4x4_NEON(a);
824 const uint8x16_t b0 = Load4x4_NEON(b);
825 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
826 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
827 vget_low_u8(abs_diff));
828 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
829 vget_high_u8(abs_diff));
830 /* pair-wise adds and widen */
831 const uint32x4_t sum1 = vpaddlq_u16(prod1);
832 const uint32x4_t sum2 = vpaddlq_u16(prod2);
833 return SumToInt_NEON(vaddq_u32(sum1, sum2));
834 }
835
836 //------------------------------------------------------------------------------
837
838 // Compilation with gcc-4.6.x is problematic for now.
839 #if !defined(WORK_AROUND_GCC)
840
static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
842 const VP8Matrix* WEBP_RESTRICT const mtx,
843 int offset) {
844 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
845 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
846 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
847 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
848 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
849
850 const int16x8_t a = vld1q_s16(in + offset); // in
851 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
852 const int16x8_t sign = vshrq_n_s16(a, 15); // sign
853 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
854 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
855 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
856 const uint32x4_t m2 = vhaddq_u32(m0, bias0);
857 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
858 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
859 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
860 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
861 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
862 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
863 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
864 vst1q_s16(in + offset, c4);
865 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
866 return c3;
867 }
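// A scalar sketch of the quantization above, for reference only (not
// compiled; 'QuantizeOneRef' is an illustrative name). Assumes QFIX == 17.
#if 0
static int QuantizeOneRef(int16_t* const in, const VP8Matrix* const mtx,
                          int j) {
  const int sign = (in[j] < 0);
  const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
  int level = (int)(((uint64_t)coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX);
  if (level > MAX_LEVEL) level = MAX_LEVEL;
  if (sign) level = -level;
  in[j] = level * mtx->q_[j];   // dequantized value, written back
  return level;                 // what Quantize_NEON() returns per coefficient
}
#endif  // 0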
868
869 static const uint8_t kShuffles[4][8] = {
870 { 0, 1, 2, 3, 8, 9, 16, 17 },
871 { 10, 11, 4, 5, 6, 7, 12, 13 },
872 { 18, 19, 24, 25, 26, 27, 20, 21 },
873 { 14, 15, 22, 23, 28, 29, 30, 31 }
874 };
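// The shuffle indices above are byte offsets into the pair of int16x8_t
// results (bytes 0..15 address out0, bytes 16..31 address out1); each pair of
// bytes selects one 16-bit coefficient. E.g. { 0, 1, 2, 3, 8, 9, 16, 17 }
// picks coefficients 0, 1, 4 and 8, the first four entries of the zigzag
// scan order.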
875
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
877 const VP8Matrix* WEBP_RESTRICT const mtx) {
878 const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
879 const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
880 uint8x8x4_t shuffles;
881 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
882 // non-standard versions there.
883 #if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
885 uint8x16x2_t all_out;
886 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
887 INIT_VECTOR4(shuffles,
888 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
889 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
890 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
891 vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
892 #else
893 uint8x8x4_t all_out;
894 INIT_VECTOR4(all_out,
895 vreinterpret_u8_s16(vget_low_s16(out0)),
896 vreinterpret_u8_s16(vget_high_s16(out0)),
897 vreinterpret_u8_s16(vget_low_s16(out1)),
898 vreinterpret_u8_s16(vget_high_s16(out1)));
899 INIT_VECTOR4(shuffles,
900 vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
901 vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
902 vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
903 vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
904 #endif
905 // Zigzag reordering
906 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
907 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
908 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
909 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
910 // test zeros
911 if (*(uint64_t*)(out + 0) != 0) return 1;
912 if (*(uint64_t*)(out + 4) != 0) return 1;
913 if (*(uint64_t*)(out + 8) != 0) return 1;
914 if (*(uint64_t*)(out + 12) != 0) return 1;
915 return 0;
916 }
917
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
919 const VP8Matrix* WEBP_RESTRICT const mtx) {
920 int nz;
921 nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
922 nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
923 return nz;
924 }
925
926 #endif // !WORK_AROUND_GCC
927
928 #if WEBP_AARCH64
929
930 #if BPS == 32
931 #define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane) \
932 do { \
933 uint8x16_t r; \
934 r = vqtbl2q_u8(qcombined, tbl); \
935 r = vreinterpretq_u8_u32( \
936 vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane), \
937 vreinterpretq_u32_u8(r), 1)); \
938 vst1q_u8(dst, r); \
939 } while (0)
940
941 #define RD4_VR4_LD4_VL4_NEON(dst, tbl) \
942 do { \
943 uint8x16_t r; \
944 r = vqtbl2q_u8(qcombined, tbl); \
945 vst1q_u8(dst, r); \
946 } while (0)
947
static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
949 const uint8_t* WEBP_RESTRICT top) {
950 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13
951 // L K J I X A B C D E F G H
952 // -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
953 static const uint8_t kLookupTbl1[64] = {
954 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12,
955 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0,
956 4, 20, 21, 22, 3, 18, 2, 17, 3, 19, 4, 20, 2, 17, 1, 16,
957 2, 18, 3, 19, 1, 16, 31, 31, 1, 17, 2, 18, 31, 31, 31, 31
958 };
959
960 static const uint8_t kLookupTbl2[64] = {
961 20, 21, 22, 23, 5, 6, 7, 8, 22, 23, 24, 25, 6, 7, 8, 9,
962 19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
963 18, 19, 20, 21, 19, 5, 6, 7, 24, 25, 26, 27, 7, 8, 9, 26,
964 17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
965 };
966
967 static const uint8_t kLookupTbl3[64] = {
968 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 19, 19, 19, 19,
969 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 18, 18, 18, 18,
970 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 17, 17, 17, 17,
971 30, 30, 30, 30, 0, 0, 0, 0, 21, 22, 23, 24, 16, 16, 16, 16
972 };
973
974 const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
975 const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
976 const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);
977
978 const uint8x16_t preload = vld1q_u8(top - 5);
979 uint8x16x2_t qcombined;
980 uint8x16_t result0, result1;
981
982 uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
983 uint8x16_t b = preload;
984 uint8x16_t c = vextq_u8(a, a, 2);
985
986 uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
987 uint8x16_t avg2_all = vrhaddq_u8(a, b);
988
989 uint8x8_t preload_x8, sub_a, sub_c;
990 uint8_t result_u8;
991 uint8x8_t res_lo, res_hi;
992 uint8x16_t full_b;
993 uint16x8_t sub, sum_lo, sum_hi;
994
995 preload_x8 = vget_low_u8(c);
996 preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);
997
998 result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;
999
1000 avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
1001 avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);
1002
1003 qcombined.val[0] = avg2_all;
1004 qcombined.val[1] = avg3_all;
1005
1006 sub_a = vdup_laneq_u8(preload, 4);
1007
1008 // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
1009 full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
1010 // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
1011 sub_c = vreinterpret_u8_u32(vdup_n_u32(
1012 vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));
1013
1014 sub = vsubl_u8(sub_c, sub_a);
1015 sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
1016 res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
1017
1018 sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
1019 res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
1020
1021 // DC4, VE4, HE4, TM4
1022 DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
1023 DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
1024 DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
1025 DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);
1026
1027 // RD4, VR4, LD4, VL4
1028 RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
1029 RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
1030 RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
1031 RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);
1032
1033 // HD4, HU4
1034 result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
1035 result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);
1036
1037 vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
1038 vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
1039 vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
1040 vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
1041 }
1042 #endif // BPS == 32
1043
static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
1045 uint8x16_t a = vdupq_n_u8(value);
1046 int i;
1047 for (i = 0; i < 16; i++) {
1048 vst1q_u8(dst + BPS * i, a);
1049 }
1050 }
1051
static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
1053 uint8x16_t a = vld1q_u8(src);
1054 int i;
1055 for (i = 0; i < 16; i++) {
1056 vst1q_u8(dst + BPS * i, a);
1057 }
1058 }
1059
static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
1061 const uint8_t* left) {
1062 uint8x16_t a;
1063
1064 if (left == NULL) {
1065 Fill_NEON(dst, 129);
1066 return;
1067 }
1068
1069 a = vld1q_u8(left + 0);
1070 vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
1071 vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
1072 vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
1073 vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
1074 vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
1075 vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
1076 vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
1077 vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
1078 vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
1079 vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
1080 vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
1081 vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
1082 vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
1083 vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
1084 vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
1085 vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
1086 }
1087
static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
1089 if (top != NULL) {
1090 Fill16_NEON(dst, top);
1091 } else {
1092 Fill_NEON(dst, 127);
1093 }
1094 }
1095
static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
1097 const uint8_t* top) {
1098 uint8_t s;
1099
1100 if (top != NULL) {
1101 uint16_t dc;
1102 dc = vaddlvq_u8(vld1q_u8(top));
1103 if (left != NULL) {
1104 // top and left present.
1105 dc += vaddlvq_u8(vld1q_u8(left));
1106 s = vqrshrnh_n_u16(dc, 5);
1107 } else {
1108 // top but no left.
1109 s = vqrshrnh_n_u16(dc, 4);
1110 }
1111 } else {
1112 if (left != NULL) {
1113 uint16_t dc;
1114 // left but no top.
1115 dc = vaddlvq_u8(vld1q_u8(left));
1116 s = vqrshrnh_n_u16(dc, 4);
1117 } else {
1118 // No top, no left, nothing.
1119 s = 0x80;
1120 }
1121 }
1122 Fill_NEON(dst, s);
1123 }
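// For reference: with both edges available the predicted DC value above is
//   (sum(top[0..15]) + sum(left[0..15]) + 16) >> 5
// and with a single edge it is (sum(edge) + 8) >> 4; these are what the
// rounding narrowing shifts vqrshrnh_n_u16(dc, 5) and vqrshrnh_n_u16(dc, 4)
// compute.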
1124
static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
1126 const uint8x8_t outer,
1127 const uint8x8x2_t inner,
1128 const uint16x8_t a, int i,
1129 const int n) {
1130 uint8x8_t d1, d2;
1131 uint16x8_t r1, r2;
1132
1133 r1 = vaddl_u8(outer, inner.val[0]);
1134 r1 = vqsubq_u16(r1, a);
1135 d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
1136 r2 = vaddl_u8(outer, inner.val[1]);
1137 r2 = vqsubq_u16(r2, a);
1138 d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
1139 vst1_u8(dst + BPS * (i * 4 + n), d1);
1140 vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
1141 }
1142
static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
1144 const uint8_t* top) {
1145 int i;
1146 uint16x8_t a;
1147 uint8x8x2_t inner;
1148
1149 if (left == NULL) {
1150 // True motion without left samples (hence: with default 129 value) is
1151 // equivalent to VE prediction where you just copy the top samples.
1152 // Note that if top samples are not available, the default value is then
1153 // 129, and not 127 as in the VerticalPred case.
1154 if (top != NULL) {
1155 VerticalPred16_NEON(dst, top);
1156 } else {
1157 Fill_NEON(dst, 129);
1158 }
1159 return;
1160 }
1161
1162 // left is not NULL.
1163 if (top == NULL) {
1164 HorizontalPred16_NEON(dst, left);
1165 return;
1166 }
1167
1168 // Neither left nor top are NULL.
1169 a = vdupq_n_u16(left[-1]);
1170 inner = vld1_u8_x2(top);
1171
1172 for (i = 0; i < 4; i++) {
1173 const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);
1174
1175 TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
1176 TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
1177 TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
1178 TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
1179 }
1180 }
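// Scalar form of the TrueMotion prediction computed above (illustrative):
//   dst[y * BPS + x] = clip_8b(top[x] + left[y] - left[-1])
// The widened sum top[x] + left[y] is formed with vaddl_u8(), left[-1] is
// removed with a saturating vqsubq_u16(), and vqmovun_s16() does the final
// clip to [0, 255].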
1181
static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
1183 const uint8_t* WEBP_RESTRICT left,
1184 const uint8_t* WEBP_RESTRICT top) {
1185 DCMode_NEON(I16DC16 + dst, left, top);
1186 VerticalPred16_NEON(I16VE16 + dst, top);
1187 HorizontalPred16_NEON(I16HE16 + dst, left);
1188 TrueMotion_NEON(I16TM16 + dst, left, top);
1189 }
1190
1191 #endif // WEBP_AARCH64
1192
1193 //------------------------------------------------------------------------------
1194 // Entry point
1195
1196 extern void VP8EncDspInitNEON(void);
1197
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
1199 VP8ITransform = ITransform_NEON;
1200 VP8FTransform = FTransform_NEON;
1201
1202 VP8FTransformWHT = FTransformWHT_NEON;
1203
1204 VP8TDisto4x4 = Disto4x4_NEON;
1205 VP8TDisto16x16 = Disto16x16_NEON;
1206 VP8CollectHistogram = CollectHistogram_NEON;
1207
1208 VP8SSE16x16 = SSE16x16_NEON;
1209 VP8SSE16x8 = SSE16x8_NEON;
1210 VP8SSE8x8 = SSE8x8_NEON;
1211 VP8SSE4x4 = SSE4x4_NEON;
1212
1213 #if WEBP_AARCH64
1214 #if BPS == 32
1215 VP8EncPredLuma4 = Intra4Preds_NEON;
1216 #endif
1217 VP8EncPredLuma16 = Intra16Preds_NEON;
1218 #endif
1219
1220 #if !defined(WORK_AROUND_GCC)
1221 VP8EncQuantizeBlock = QuantizeBlock_NEON;
1222 VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
1223 VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
1224 #endif
1225 }
1226
1227 #else // !WEBP_USE_NEON
1228
1229 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
1230
1231 #endif // WEBP_USE_NEON
1232