1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of speed-critical encoding functions.
11 //
12 // adapted from libvpx (https://www.webmproject.org/code/)
13
14 #include "src/dsp/dsp.h"
15
16 #if defined(WEBP_USE_NEON)
17
18 #include <assert.h>
19
20 #include "src/dsp/neon.h"
21 #include "src/enc/vp8i_enc.h"
22
23 //------------------------------------------------------------------------------
24 // Transforms (Paragraph 14.4)
25
26 // Inverse transform.
27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
28 // for subtraction to *ref. See the comments there for algorithmic explanations.
29
// Fixed-point multipliers of the inverse transform. Both are fed to the
// doubling high-half multiply (vqdmulh), which computes (x * k * 2) >> 16.
static const int16_t kC1 = 20091;
// Stored as half of 35468: the full value does not fit in a signed 16-bit
// lane, and the doubling done by the multiply makes up for the halving.
static const int16_t kC2 = 35468 / 2;
32
33 // This code works but is *slower* than the inlined-asm version below
34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
35 // WEBP_USE_INTRINSICS define.
// With gcc-4.8, it is slightly faster than the inlined-assembly version.
37 #if defined(WEBP_USE_INTRINSICS)
38
39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
ConvertU8ToS16_NEON(uint32x2_t v)40 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
41 return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
42 }
43
44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45 // to the corresponding rows of 'dst'.
SaturateAndStore4x4_NEON(uint8_t * const dst,const int16x8_t dst01,const int16x8_t dst23)46 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
47 const int16x8_t dst01,
48 const int16x8_t dst23) {
49 // Unsigned saturate to 8b.
50 const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
51 const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
52
53 // Store the results.
54 vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
55 vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
56 vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
57 vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
58 }
59
Add4x4_NEON(const int16x8_t row01,const int16x8_t row23,const uint8_t * const ref,uint8_t * const dst)60 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
61 const int16x8_t row23,
62 const uint8_t* const ref,
63 uint8_t* const dst) {
64 uint32x2_t dst01 = vdup_n_u32(0);
65 uint32x2_t dst23 = vdup_n_u32(0);
66
67 // Load the source pixels.
68 dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
69 dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
70 dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
71 dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
72
73 {
74 // Convert to 16b.
75 const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
76 const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
77
78 // Descale with rounding.
79 const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
80 const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
81 // Add the inverse transform.
82 SaturateAndStore4x4_NEON(dst, out01, out23);
83 }
84 }
85
Transpose8x2_NEON(const int16x8_t in0,const int16x8_t in1,int16x8x2_t * const out)86 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
87 const int16x8_t in1,
88 int16x8x2_t* const out) {
89 // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
90 // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
91 const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
92 // b0 d0 b1 d1 b2 d2 ...
93 *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
94 }
95
TransformPass_NEON(int16x8x2_t * const rows)96 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
97 // {rows} = in0 | in4
98 // in8 | in12
99 // B1 = in4 | in12
100 const int16x8_t B1 =
101 vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
102 // C0 = kC1 * in4 | kC1 * in12
103 // C1 = kC2 * in4 | kC2 * in12
104 const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
105 const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
106 const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
107 vget_low_s16(rows->val[1])); // in0 + in8
108 const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
109 vget_low_s16(rows->val[1])); // in0 - in8
110 // c = kC2 * in4 - kC1 * in12
111 // d = kC1 * in4 + kC2 * in12
112 const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
113 const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
114 const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
115 const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
116 const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
117 const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
118 const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
119 Transpose8x2_NEON(E0, E1, rows);
120 }
121
ITransformOne_NEON(const uint8_t * ref,const int16_t * in,uint8_t * dst)122 static void ITransformOne_NEON(const uint8_t* ref,
123 const int16_t* in, uint8_t* dst) {
124 int16x8x2_t rows;
125 INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
126 TransformPass_NEON(&rows);
127 TransformPass_NEON(&rows);
128 Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
129 }
130
131 #else
132
ITransformOne_NEON(const uint8_t * ref,const int16_t * in,uint8_t * dst)133 static void ITransformOne_NEON(const uint8_t* ref,
134 const int16_t* in, uint8_t* dst) {
135 const int kBPS = BPS;
136 const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
137
138 __asm__ volatile (
139 "vld1.16 {q1, q2}, [%[in]] \n"
140 "vld1.16 {d0}, [%[kC1C2]] \n"
141
142 // d2: in[0]
143 // d3: in[8]
144 // d4: in[4]
145 // d5: in[12]
146 "vswp d3, d4 \n"
147
148 // q8 = {in[4], in[12]} * kC1 * 2 >> 16
149 // q9 = {in[4], in[12]} * kC2 >> 16
150 "vqdmulh.s16 q8, q2, d0[0] \n"
151 "vqdmulh.s16 q9, q2, d0[1] \n"
152
153 // d22 = a = in[0] + in[8]
154 // d23 = b = in[0] - in[8]
155 "vqadd.s16 d22, d2, d3 \n"
156 "vqsub.s16 d23, d2, d3 \n"
157
158 // q8 = in[4]/[12] * kC1 >> 16
159 "vshr.s16 q8, q8, #1 \n"
160
161 // Add {in[4], in[12]} back after the multiplication.
162 "vqadd.s16 q8, q2, q8 \n"
163
164 // d20 = c = in[4]*kC2 - in[12]*kC1
165 // d21 = d = in[4]*kC1 + in[12]*kC2
166 "vqsub.s16 d20, d18, d17 \n"
167 "vqadd.s16 d21, d19, d16 \n"
168
169 // d2 = tmp[0] = a + d
170 // d3 = tmp[1] = b + c
171 // d4 = tmp[2] = b - c
172 // d5 = tmp[3] = a - d
173 "vqadd.s16 d2, d22, d21 \n"
174 "vqadd.s16 d3, d23, d20 \n"
175 "vqsub.s16 d4, d23, d20 \n"
176 "vqsub.s16 d5, d22, d21 \n"
177
178 "vzip.16 q1, q2 \n"
179 "vzip.16 q1, q2 \n"
180
181 "vswp d3, d4 \n"
182
183 // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
184 // q9 = {tmp[4], tmp[12]} * kC2 >> 16
185 "vqdmulh.s16 q8, q2, d0[0] \n"
186 "vqdmulh.s16 q9, q2, d0[1] \n"
187
188 // d22 = a = tmp[0] + tmp[8]
189 // d23 = b = tmp[0] - tmp[8]
190 "vqadd.s16 d22, d2, d3 \n"
191 "vqsub.s16 d23, d2, d3 \n"
192
193 "vshr.s16 q8, q8, #1 \n"
194 "vqadd.s16 q8, q2, q8 \n"
195
196 // d20 = c = in[4]*kC2 - in[12]*kC1
197 // d21 = d = in[4]*kC1 + in[12]*kC2
198 "vqsub.s16 d20, d18, d17 \n"
199 "vqadd.s16 d21, d19, d16 \n"
200
201 // d2 = tmp[0] = a + d
202 // d3 = tmp[1] = b + c
203 // d4 = tmp[2] = b - c
204 // d5 = tmp[3] = a - d
205 "vqadd.s16 d2, d22, d21 \n"
206 "vqadd.s16 d3, d23, d20 \n"
207 "vqsub.s16 d4, d23, d20 \n"
208 "vqsub.s16 d5, d22, d21 \n"
209
210 "vld1.32 d6[0], [%[ref]], %[kBPS] \n"
211 "vld1.32 d6[1], [%[ref]], %[kBPS] \n"
212 "vld1.32 d7[0], [%[ref]], %[kBPS] \n"
213 "vld1.32 d7[1], [%[ref]], %[kBPS] \n"
214
215 "sub %[ref], %[ref], %[kBPS], lsl #2 \n"
216
217 // (val) + 4 >> 3
218 "vrshr.s16 d2, d2, #3 \n"
219 "vrshr.s16 d3, d3, #3 \n"
220 "vrshr.s16 d4, d4, #3 \n"
221 "vrshr.s16 d5, d5, #3 \n"
222
223 "vzip.16 q1, q2 \n"
224 "vzip.16 q1, q2 \n"
225
226 // Must accumulate before saturating
227 "vmovl.u8 q8, d6 \n"
228 "vmovl.u8 q9, d7 \n"
229
230 "vqadd.s16 q1, q1, q8 \n"
231 "vqadd.s16 q2, q2, q9 \n"
232
233 "vqmovun.s16 d0, q1 \n"
234 "vqmovun.s16 d1, q2 \n"
235
236 "vst1.32 d0[0], [%[dst]], %[kBPS] \n"
237 "vst1.32 d0[1], [%[dst]], %[kBPS] \n"
238 "vst1.32 d1[0], [%[dst]], %[kBPS] \n"
239 "vst1.32 d1[1], [%[dst]] \n"
240
241 : [in] "+r"(in), [dst] "+r"(dst) // modified registers
242 : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
243 : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
244 );
245 }
246
247 #endif // WEBP_USE_INTRINSICS
248
// Inverse-transforms one 4x4 block, or two horizontally adjacent ones when
// 'do_two' is non-zero. The second block is 4 pixels to the right in 'ref'
// and 'dst' and uses the next 16 coefficients of 'in'.
static void ITransform_NEON(const uint8_t* ref,
                            const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne_NEON(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
256
257 // Load all 4x4 pixels into a single uint8x16_t variable.
Load4x4_NEON(const uint8_t * src)258 static uint8x16_t Load4x4_NEON(const uint8_t* src) {
259 uint32x4_t out = vdupq_n_u32(0);
260 out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
261 out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
262 out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
263 out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
264 return vreinterpretq_u8_u32(out);
265 }
266
267 // Forward transform.
268
269 #if defined(WEBP_USE_INTRINSICS)
270
Transpose4x4_S16_NEON(const int16x4_t A,const int16x4_t B,const int16x4_t C,const int16x4_t D,int16x8_t * const out01,int16x8_t * const out32)271 static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
272 const int16x4_t B,
273 const int16x4_t C,
274 const int16x4_t D,
275 int16x8_t* const out01,
276 int16x8_t* const out32) {
277 const int16x4x2_t AB = vtrn_s16(A, B);
278 const int16x4x2_t CD = vtrn_s16(C, D);
279 const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
280 vreinterpret_s32_s16(CD.val[0]));
281 const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
282 vreinterpret_s32_s16(CD.val[1]));
283 *out01 = vreinterpretq_s16_s64(
284 vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
285 vreinterpret_s64_s32(tmp13.val[0])));
286 *out32 = vreinterpretq_s16_s64(
287 vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
288 vreinterpret_s64_s32(tmp02.val[1])));
289 }
290
DiffU8ToS16_NEON(const uint8x8_t a,const uint8x8_t b)291 static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
292 const uint8x8_t b) {
293 return vreinterpretq_s16_u16(vsubl_u8(a, b));
294 }
295
FTransform_NEON(const uint8_t * src,const uint8_t * ref,int16_t * out)296 static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
297 int16_t* out) {
298 int16x8_t d0d1, d3d2; // working 4x4 int16 variables
299 {
300 const uint8x16_t S0 = Load4x4_NEON(src);
301 const uint8x16_t R0 = Load4x4_NEON(ref);
302 const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
303 const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
304 const int16x4_t D0 = vget_low_s16(D0D1);
305 const int16x4_t D1 = vget_high_s16(D0D1);
306 const int16x4_t D2 = vget_low_s16(D2D3);
307 const int16x4_t D3 = vget_high_s16(D2D3);
308 Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
309 }
310 { // 1rst pass
311 const int32x4_t kCst937 = vdupq_n_s32(937);
312 const int32x4_t kCst1812 = vdupq_n_s32(1812);
313 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
314 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
315 const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
316 const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
317 vget_high_s16(a0a1_2));
318 const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
319 vget_high_s16(a0a1_2));
320 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
321 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
322 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
323 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
324 const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
325 const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
326 Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
327 }
328 { // 2nd pass
329 // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
330 const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
331 const int32x4_t kCst51000 = vdupq_n_s32(51000);
332 const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
333 const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
334 const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
335 const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
336 const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
337 const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
338 const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
339 const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
340 const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
341 const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
342 const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
343 const int16x4_t a3_eq_0 =
344 vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
345 const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
346 vst1_s16(out + 0, out0);
347 vst1_s16(out + 4, out1);
348 vst1_s16(out + 8, out2);
349 vst1_s16(out + 12, out3);
350 }
351 }
352
353 #else
354
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Multipliers of the forward transform, each repeated over four 16-bit lanes
// so that one 128-bit load fills a q register.
static const int16_t kCoeff16[8] = {
  5352, 5352, 5352, 5352,
  2217, 2217, 2217, 2217
};
// Rounding constants, each repeated over four 32-bit lanes. The first two
// rows are used by the first pass, the last two by the second pass.
static const int32_t kCoeff32[16] = {
  1812, 1812, 1812, 1812,
  937, 937, 937, 937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};
365
FTransform_NEON(const uint8_t * src,const uint8_t * ref,int16_t * out)366 static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
367 int16_t* out) {
368 const int kBPS = BPS;
369 const uint8_t* src_ptr = src;
370 const uint8_t* ref_ptr = ref;
371 const int16_t* coeff16 = kCoeff16;
372 const int32_t* coeff32 = kCoeff32;
373
374 __asm__ volatile (
375 // load src into q4, q5 in high half
376 "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
377 "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
378 "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
379 "vld1.8 {d11}, [%[src_ptr]] \n"
380
381 // load ref into q6, q7 in high half
382 "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
383 "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
384 "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
385 "vld1.8 {d15}, [%[ref_ptr]] \n"
386
387 // Pack the high values in to q4 and q6
388 "vtrn.32 q4, q5 \n"
389 "vtrn.32 q6, q7 \n"
390
391 // d[0-3] = src - ref
392 "vsubl.u8 q0, d8, d12 \n"
393 "vsubl.u8 q1, d9, d13 \n"
394
395 // load coeff16 into q8(d16=5352, d17=2217)
396 "vld1.16 {q8}, [%[coeff16]] \n"
397
398 // load coeff32 high half into q9 = 1812, q10 = 937
399 "vld1.32 {q9, q10}, [%[coeff32]]! \n"
400
401 // load coeff32 low half into q11=12000, q12=51000
402 "vld1.32 {q11,q12}, [%[coeff32]] \n"
403
404 // part 1
405 // Transpose. Register dN is the same as dN in C
406 "vtrn.32 d0, d2 \n"
407 "vtrn.32 d1, d3 \n"
408 "vtrn.16 d0, d1 \n"
409 "vtrn.16 d2, d3 \n"
410
411 "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
412 "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
413 "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
414 "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
415
416 "vadd.s16 d0, d4, d5 \n" // a0 + a1
417 "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
418 "vsub.s16 d2, d4, d5 \n" // a0 - a1
419 "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3
420
421 "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
422 "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
423 "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
424 "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
425
426 // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
427 // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
428 "vshrn.s32 d1, q9, #9 \n"
429 "vshrn.s32 d3, q10, #9 \n"
430
431 // part 2
432 // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
433 "vtrn.32 d0, d2 \n"
434 "vtrn.32 d1, d3 \n"
435 "vtrn.16 d0, d1 \n"
436 "vtrn.16 d2, d3 \n"
437
438 "vmov.s16 d26, #7 \n"
439
440 "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
441 "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
442 "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
443 "vadd.s16 d4, d4, d26 \n" // a1 + 7
444 "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
445
446 "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
447 "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
448
449 "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
450 "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
451
452 "vceq.s16 d4, d7, #0 \n"
453
454 "vshr.s16 d0, d0, #4 \n"
455 "vshr.s16 d2, d2, #4 \n"
456
457 "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
458 "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
459
460 "vmvn d4, d4 \n" // !(d1 == 0)
461 // op[4] = (c1*2217 + d1*5352 + 12000)>>16
462 "vshrn.s32 d1, q11, #16 \n"
463 // op[4] += (d1!=0)
464 "vsub.s16 d1, d1, d4 \n"
465 // op[12]= (d1*2217 - c1*5352 + 51000)>>16
466 "vshrn.s32 d3, q12, #16 \n"
467
468 // set result to out array
469 "vst1.16 {q0, q1}, [%[out]] \n"
470 : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
471 [coeff32] "+r"(coeff32) // modified registers
472 : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
473 [out] "r"(out) // constants
474 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
475 "q10", "q11", "q12", "q13" // clobbered
476 );
477 }
478
479 #endif
480
481 #define LOAD_LANE_16b(VALUE, LANE) do { \
482 (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
483 src += stride; \
484 } while (0)
485
FTransformWHT_NEON(const int16_t * src,int16_t * out)486 static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
487 const int stride = 16;
488 const int16x4_t zero = vdup_n_s16(0);
489 int32x4x4_t tmp0;
490 int16x4x4_t in;
491 INIT_VECTOR4(in, zero, zero, zero, zero);
492 LOAD_LANE_16b(in.val[0], 0);
493 LOAD_LANE_16b(in.val[1], 0);
494 LOAD_LANE_16b(in.val[2], 0);
495 LOAD_LANE_16b(in.val[3], 0);
496 LOAD_LANE_16b(in.val[0], 1);
497 LOAD_LANE_16b(in.val[1], 1);
498 LOAD_LANE_16b(in.val[2], 1);
499 LOAD_LANE_16b(in.val[3], 1);
500 LOAD_LANE_16b(in.val[0], 2);
501 LOAD_LANE_16b(in.val[1], 2);
502 LOAD_LANE_16b(in.val[2], 2);
503 LOAD_LANE_16b(in.val[3], 2);
504 LOAD_LANE_16b(in.val[0], 3);
505 LOAD_LANE_16b(in.val[1], 3);
506 LOAD_LANE_16b(in.val[2], 3);
507 LOAD_LANE_16b(in.val[3], 3);
508
509 {
510 // a0 = in[0 * 16] + in[2 * 16]
511 // a1 = in[1 * 16] + in[3 * 16]
512 // a2 = in[1 * 16] - in[3 * 16]
513 // a3 = in[0 * 16] - in[2 * 16]
514 const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
515 const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
516 const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
517 const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
518 tmp0.val[0] = vaddq_s32(a0, a1);
519 tmp0.val[1] = vaddq_s32(a3, a2);
520 tmp0.val[2] = vsubq_s32(a3, a2);
521 tmp0.val[3] = vsubq_s32(a0, a1);
522 }
523 {
524 const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
525 // a0 = tmp[0 + i] + tmp[ 8 + i]
526 // a1 = tmp[4 + i] + tmp[12 + i]
527 // a2 = tmp[4 + i] - tmp[12 + i]
528 // a3 = tmp[0 + i] - tmp[ 8 + i]
529 const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
530 const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
531 const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
532 const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
533 const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
534 const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
535 const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
536 const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
537 const int16x4_t out0 = vmovn_s32(b0);
538 const int16x4_t out1 = vmovn_s32(b1);
539 const int16x4_t out2 = vmovn_s32(b2);
540 const int16x4_t out3 = vmovn_s32(b3);
541
542 vst1_s16(out + 0, out0);
543 vst1_s16(out + 4, out1);
544 vst1_s16(out + 8, out2);
545 vst1_s16(out + 12, out3);
546 }
547 }
548 #undef LOAD_LANE_16b
549
550 //------------------------------------------------------------------------------
551 // Texture distortion
552 //
553 // We try to match the spectral content (weighted) between source and
554 // reconstructed samples.
555
556 // a 0123, b 0123
557 // a 4567, b 4567
558 // a 89ab, b 89ab
559 // a cdef, b cdef
560 //
561 // transpose
562 //
563 // a 048c, b 048c
564 // a 159d, b 159d
565 // a 26ae, b 26ae
566 // a 37bf, b 37bf
567 //
DistoTranspose4x4S16_NEON(int16x8x4_t q4_in)568 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
569 const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
570 const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
571 const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
572 vreinterpretq_s32_s16(q2_tmp1.val[0]));
573 const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
574 vreinterpretq_s32_s16(q2_tmp1.val[1]));
575 q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
576 q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
577 q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
578 q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
579 return q4_in;
580 }
581
DistoHorizontalPass_NEON(const int16x8x4_t q4_in)582 static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
583 const int16x8x4_t q4_in) {
584 // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
585 // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
586 const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
587 const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
588 const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
589 const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
590 int16x8x4_t q4_out;
591 // tmp[0] = a0 + a1
592 // tmp[1] = a3 + a2
593 // tmp[2] = a3 - a2
594 // tmp[3] = a0 - a1
595 INIT_VECTOR4(q4_out,
596 vabsq_s16(vaddq_s16(q_a0, q_a1)),
597 vabsq_s16(vaddq_s16(q_a3, q_a2)),
598 vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
599 return q4_out;
600 }
601
DistoVerticalPass_NEON(const uint8x8x4_t q4_in)602 static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
603 const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
604 q4_in.val[2]));
605 const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
606 q4_in.val[3]));
607 const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
608 q4_in.val[3]));
609 const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
610 q4_in.val[2]));
611 int16x8x4_t q4_out;
612
613 INIT_VECTOR4(q4_out,
614 vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
615 vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
616 return q4_out;
617 }
618
DistoLoadW_NEON(const uint16_t * w)619 static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
620 const uint16x8_t q_w07 = vld1q_u16(&w[0]);
621 const uint16x8_t q_w8f = vld1q_u16(&w[8]);
622 int16x4x4_t d4_w;
623 INIT_VECTOR4(d4_w,
624 vget_low_s16(vreinterpretq_s16_u16(q_w07)),
625 vget_high_s16(vreinterpretq_s16_u16(q_w07)),
626 vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
627 vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
628 return d4_w;
629 }
630
DistoSum_NEON(const int16x8x4_t q4_in,const int16x4x4_t d4_w)631 static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
632 const int16x4x4_t d4_w) {
633 int32x2_t d_sum;
634 // sum += w[ 0] * abs(b0);
635 // sum += w[ 4] * abs(b1);
636 // sum += w[ 8] * abs(b2);
637 // sum += w[12] * abs(b3);
638 int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
639 int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
640 int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
641 int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
642 q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
643 q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
644 q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
645 q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
646
647 q_sum0 = vaddq_s32(q_sum0, q_sum1);
648 q_sum2 = vaddq_s32(q_sum2, q_sum3);
649 q_sum2 = vaddq_s32(q_sum0, q_sum2);
650 d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
651 d_sum = vpadd_s32(d_sum, d_sum);
652 return d_sum;
653 }
654
655 #define LOAD_LANE_32b(src, VALUE, LANE) \
656 (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
657
658 // Hadamard transform
659 // Returns the weighted sum of the absolute value of transformed coefficients.
660 // w[] contains a row-major 4 by 4 symmetric matrix.
Disto4x4_NEON(const uint8_t * const a,const uint8_t * const b,const uint16_t * const w)661 static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
662 const uint16_t* const w) {
663 uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
664 uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
665 uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
666 uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
667 uint8x8x4_t d4_in;
668
669 // load data a, b
670 LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
671 LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
672 LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
673 LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
674 LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
675 LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
676 LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
677 LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
678 INIT_VECTOR4(d4_in,
679 vreinterpret_u8_u32(d_in_ab_0123),
680 vreinterpret_u8_u32(d_in_ab_4567),
681 vreinterpret_u8_u32(d_in_ab_89ab),
682 vreinterpret_u8_u32(d_in_ab_cdef));
683
684 {
685 // Vertical pass first to avoid a transpose (vertical and horizontal passes
686 // are commutative because w/kWeightY is symmetric) and subsequent
687 // transpose.
688 const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
689 const int16x4x4_t d4_w = DistoLoadW_NEON(w);
690 // horizontal pass
691 const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
692 const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
693 int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
694
695 // abs(sum2 - sum1) >> 5
696 d_sum = vabs_s32(d_sum);
697 d_sum = vshr_n_s32(d_sum, 5);
698 return vget_lane_s32(d_sum, 0);
699 }
700 }
701 #undef LOAD_LANE_32b
702
Disto16x16_NEON(const uint8_t * const a,const uint8_t * const b,const uint16_t * const w)703 static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
704 const uint16_t* const w) {
705 int D = 0;
706 int x, y;
707 for (y = 0; y < 16 * BPS; y += 4 * BPS) {
708 for (x = 0; x < 16; x += 4) {
709 D += Disto4x4_NEON(a + x + y, b + x + y, w);
710 }
711 }
712 return D;
713 }
714
715 //------------------------------------------------------------------------------
716
// Builds the histogram of transformed-coefficient magnitudes for the blocks
// start_block .. end_block - 1 and hands it to VP8SetHistogramData().
// Block j is located at offset VP8DspScan[j] in both 'ref' and 'pred'. Each
// coefficient falls into bin min(abs(coeff) >> 3, MAX_COEFF_THRESH).
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
                                  int start_block, int end_block,
                                  VP8Histogram* const histo) {
  const uint16x8_t max_bin = vdupq_n_u16(MAX_COEFF_THRESH);
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  int block;
  for (block = start_block; block < end_block; ++block) {
    int16_t out[16];
    int half, k;
    FTransform_NEON(ref + VP8DspScan[block], pred + VP8DspScan[block], out);
    // Turn the coefficients into bin indices in place, eight at a time.
    for (half = 0; half < 16; half += 8) {
      const int16x8_t coeffs = vld1q_s16(out + half);
      const uint16x8_t magnitudes = vreinterpretq_u16_s16(vabsq_s16(coeffs));
      const uint16x8_t bins = vminq_u16(vshrq_n_u16(magnitudes, 3), max_bin);
      vst1q_s16(out + half, vreinterpretq_s16_u16(bins));
    }
    // Count one hit per coefficient.
    for (k = 0; k < 16; ++k) {
      ++distribution[out[k]];
    }
  }
  VP8SetHistogramData(distribution, histo);
}
746
747 //------------------------------------------------------------------------------
748
AccumulateSSE16_NEON(const uint8_t * const a,const uint8_t * const b,uint32x4_t * const sum)749 static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
750 const uint8_t* const b,
751 uint32x4_t* const sum) {
752 const uint8x16_t a0 = vld1q_u8(a);
753 const uint8x16_t b0 = vld1q_u8(b);
754 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
755 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
756 vget_low_u8(abs_diff));
757 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
758 vget_high_u8(abs_diff));
759 /* pair-wise adds and widen */
760 const uint32x4_t sum1 = vpaddlq_u16(prod1);
761 const uint32x4_t sum2 = vpaddlq_u16(prod2);
762 *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
763 }
764
765 // Horizontal sum of all four uint32_t values in 'sum'.
SumToInt_NEON(uint32x4_t sum)766 static int SumToInt_NEON(uint32x4_t sum) {
767 #if defined(__aarch64__)
768 return (int)vaddvq_u32(sum);
769 #else
770 const uint64x2_t sum2 = vpaddlq_u32(sum);
771 const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
772 vreinterpret_u32_u64(vget_high_u64(sum2)));
773 return (int)vget_lane_u32(sum3, 0);
774 #endif
775 }
776
SSE16x16_NEON(const uint8_t * a,const uint8_t * b)777 static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
778 uint32x4_t sum = vdupq_n_u32(0);
779 int y;
780 for (y = 0; y < 16; ++y) {
781 AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
782 }
783 return SumToInt_NEON(sum);
784 }
785
SSE16x8_NEON(const uint8_t * a,const uint8_t * b)786 static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
787 uint32x4_t sum = vdupq_n_u32(0);
788 int y;
789 for (y = 0; y < 8; ++y) {
790 AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
791 }
792 return SumToInt_NEON(sum);
793 }
794
SSE8x8_NEON(const uint8_t * a,const uint8_t * b)795 static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
796 uint32x4_t sum = vdupq_n_u32(0);
797 int y;
798 for (y = 0; y < 8; ++y) {
799 const uint8x8_t a0 = vld1_u8(a + y * BPS);
800 const uint8x8_t b0 = vld1_u8(b + y * BPS);
801 const uint8x8_t abs_diff = vabd_u8(a0, b0);
802 const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
803 sum = vpadalq_u16(sum, prod);
804 }
805 return SumToInt_NEON(sum);
806 }
807
SSE4x4_NEON(const uint8_t * a,const uint8_t * b)808 static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
809 const uint8x16_t a0 = Load4x4_NEON(a);
810 const uint8x16_t b0 = Load4x4_NEON(b);
811 const uint8x16_t abs_diff = vabdq_u8(a0, b0);
812 const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
813 vget_low_u8(abs_diff));
814 const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
815 vget_high_u8(abs_diff));
816 /* pair-wise adds and widen */
817 const uint32x4_t sum1 = vpaddlq_u16(prod1);
818 const uint32x4_t sum2 = vpaddlq_u16(prod2);
819 return SumToInt_NEON(vaddq_u32(sum1, sum2));
820 }
821
822 //------------------------------------------------------------------------------
823
824 // Compilation with gcc-4.6.x is problematic for now.
825 #if !defined(WORK_AROUND_GCC)
826
Quantize_NEON(int16_t * const in,const VP8Matrix * const mtx,int offset)827 static int16x8_t Quantize_NEON(int16_t* const in,
828 const VP8Matrix* const mtx, int offset) {
829 const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
830 const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
831 const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
832 const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
833 const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
834
835 const int16x8_t a = vld1q_s16(in + offset); // in
836 const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
837 const int16x8_t sign = vshrq_n_s16(a, 15); // sign
838 const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
839 const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
840 const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
841 const uint32x4_t m2 = vhaddq_u32(m0, bias0);
842 const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
843 const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
844 vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
845 const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
846 const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
847 const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
848 const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
849 vst1q_s16(in + offset, c4);
850 assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
851 return c3;
852 }
853
// Byte-level table-lookup indices used by QuantizeBlock_NEON().
// The 16 quantized int16 levels are viewed as 32 consecutive bytes; each pair
// of byte indices {2 * k, 2 * k + 1} below selects level 'k'. Row 'i' produces
// the four levels stored at out[4 * i .. 4 * i + 3], i.e. the zigzag order
// 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15.
static const uint8_t kShuffles[4][8] = {
  // levels 0, 1, 4, 8
  {  0,  1,
     2,  3,
     8,  9,
    16, 17 },
  // levels 5, 2, 3, 6
  { 10, 11,
     4,  5,
     6,  7,
    12, 13 },
  // levels 9, 12, 13, 10
  { 18, 19,
    24, 25,
    26, 27,
    20, 21 },
  // levels 7, 11, 14, 15
  { 14, 15,
    22, 23,
    28, 29,
    30, 31 }
};
860
QuantizeBlock_NEON(int16_t in[16],int16_t out[16],const VP8Matrix * const mtx)861 static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
862 const VP8Matrix* const mtx) {
863 const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
864 const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
865 uint8x8x4_t shuffles;
866 // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
867 // non-standard versions there.
868 #if defined(__APPLE__) && defined(__aarch64__) && \
869 defined(__apple_build_version__) && (__apple_build_version__< 6020037)
870 uint8x16x2_t all_out;
871 INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
872 INIT_VECTOR4(shuffles,
873 vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
874 vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
875 vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
876 vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
877 #else
878 uint8x8x4_t all_out;
879 INIT_VECTOR4(all_out,
880 vreinterpret_u8_s16(vget_low_s16(out0)),
881 vreinterpret_u8_s16(vget_high_s16(out0)),
882 vreinterpret_u8_s16(vget_low_s16(out1)),
883 vreinterpret_u8_s16(vget_high_s16(out1)));
884 INIT_VECTOR4(shuffles,
885 vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
886 vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
887 vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
888 vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
889 #endif
890 // Zigzag reordering
891 vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
892 vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
893 vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
894 vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
895 // test zeros
896 if (*(uint64_t*)(out + 0) != 0) return 1;
897 if (*(uint64_t*)(out + 4) != 0) return 1;
898 if (*(uint64_t*)(out + 8) != 0) return 1;
899 if (*(uint64_t*)(out + 12) != 0) return 1;
900 return 0;
901 }
902
// Quantizes two consecutive 16-coefficient blocks with the same matrix.
// Block 0 uses in[0..15] / out[0..15], block 1 uses in[16..31] / out[16..31].
// Returns a 2-bit mask: bit 0 is the QuantizeBlock_NEON() result for block 0,
// bit 1 the result for block 1.
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* const mtx) {
  const int nz_first = QuantizeBlock_NEON(in, out, mtx);
  const int nz_second = QuantizeBlock_NEON(in + 16, out + 16, mtx);
  return nz_first | (nz_second << 1);
}
910
911 #endif // !WORK_AROUND_GCC
912
913 //------------------------------------------------------------------------------
914 // Entry point
915
916 extern void VP8EncDspInitNEON(void);
917
VP8EncDspInitNEON(void)918 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
919 VP8ITransform = ITransform_NEON;
920 VP8FTransform = FTransform_NEON;
921
922 VP8FTransformWHT = FTransformWHT_NEON;
923
924 VP8TDisto4x4 = Disto4x4_NEON;
925 VP8TDisto16x16 = Disto16x16_NEON;
926 VP8CollectHistogram = CollectHistogram_NEON;
927
928 VP8SSE16x16 = SSE16x16_NEON;
929 VP8SSE16x8 = SSE16x8_NEON;
930 VP8SSE8x8 = SSE8x8_NEON;
931 VP8SSE4x4 = SSE4x4_NEON;
932
933 #if !defined(WORK_AROUND_GCC)
934 VP8EncQuantizeBlock = QuantizeBlock_NEON;
935 VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
936 #endif
937 }
938
939 #else // !WEBP_USE_NEON
940
941 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
942
943 #endif // WEBP_USE_NEON
944