1 // Copyright 2012 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // ARM NEON version of speed-critical encoding functions.
11 //
12 // adapted from libvpx (http://www.webmproject.org/code/)
13
14 #include "./dsp.h"
15
16 #if defined(WEBP_USE_NEON)
17
18 #include <assert.h>
19
20 #include "./neon.h"
21 #include "../enc/vp8enci.h"
22
23 //------------------------------------------------------------------------------
24 // Transforms (Paragraph 14.4)
25
26 // Inverse transform.
27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
28 // for subtraction to *ref. See the comments there for algorithmic explanations.
29
// Fixed-point constants of the VP8 transform, Q16:
// kC1 ~ sqrt(2) * cos(pi / 8) - 1 (the "- 1" is compensated in TransformPass
// by adding the input back after the vqdmulh+shift).
static const int16_t kC1 = 20091;
// Half of the true constant (sqrt(2) * sin(pi / 8) in Q16 = 35468), so it
// fits a signed 16b immediate; the doubling multiply (vqdmulh) compensates.
static const int16_t kC2 = 17734;
32
// Two implementations follow: an intrinsics version (selected when
// WEBP_USE_INTRINSICS is defined) and an inline-assembly fallback.
// With gcc-4.6 the intrinsics version is *slower* than the inline assembly;
// with gcc-4.8 it is slightly faster.
37 #if defined(WEBP_USE_INTRINSICS)
38
39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
// Treats 'v' as an uint8x8_t and zero-extends its bytes to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  const uint8x8_t bytes = vreinterpret_u8_u32(v);
  const uint16x8_t widened = vmovl_u8(bytes);
  return vreinterpretq_s16_u16(widened);
}
43
44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
45 // to the corresponding rows of 'dst'.
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst' (stride BPS, one 4-pixel row per lane).
static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
                                            const int16x8_t dst01,
                                            const int16x8_t dst23) {
  // Narrow with unsigned saturation: 16b -> 8b, then view each 4-pixel row
  // as one 32b lane for the stores.
  const uint32x2_t rows01 = vreinterpret_u32_u8(vqmovun_s16(dst01));
  const uint32x2_t rows23 = vreinterpret_u32_u8(vqmovun_s16(dst23));

  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), rows01, 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), rows01, 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), rows23, 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), rows23, 1);
}
59
// Adds the descaled residual rows (row01, row23) to the 4x4 prediction block
// 'ref' and stores the saturated result at 'dst' (both with stride BPS).
static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
                               const uint8_t* const ref, uint8_t* const dst) {
  uint32x2_t ref01 = vdup_n_u32(0);
  uint32x2_t ref23 = vdup_n_u32(0);

  // Gather the four 4-pixel reference rows, two rows per 64b register.
  ref01 = vld1_lane_u32((const uint32_t*)(ref + 0 * BPS), ref01, 0);
  ref01 = vld1_lane_u32((const uint32_t*)(ref + 1 * BPS), ref01, 1);
  ref23 = vld1_lane_u32((const uint32_t*)(ref + 2 * BPS), ref23, 0);
  ref23 = vld1_lane_u32((const uint32_t*)(ref + 3 * BPS), ref23, 1);

  {
    // Widen the reference pixels to 16b.
    const int16x8_t ref01_s16 = ConvertU8ToS16(ref01);
    const int16x8_t ref23_s16 = ConvertU8ToS16(ref23);
    // Accumulate the residual with rounding descale: ref + ((row + 4) >> 3).
    const int16x8_t sum01 = vrsraq_n_s16(ref01_s16, row01, 3);
    const int16x8_t sum23 = vrsraq_n_s16(ref23_s16, row23, 3);
    // Saturate back to 8b and write out.
    SaturateAndStore4x4(dst, sum01, sum23);
  }
}
83
// Transposes the 4x4 block held as two rows per vector: (in0, in1) =
// (rows 0|1, rows 2|3) becomes '*out' = (columns 0|1, columns 2|3).
// The two vzipq steps implement the 4x4 transpose; their order matters.
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
                                     int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
92
// One 1-D pass of the inverse transform on 'rows' (two 4-coeff rows per
// int16x8_t), followed by a transpose (Transpose8x2) so that applying it
// twice yields the full 2-D inverse transform. All adds/subs saturate.
static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12  (vqdmulh doubles, the vsraq adds the input
  //                               back: compensates kC1 being constant - 1)
  // C1 = kC2 * in4 | kC2 * in12  (kC2 is stored halved, vqdmulh doubles)
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  // Swap the halves of E_tmp to get the output row order (b-c | a-d).
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2(E0, E1, rows);
}
118
// Reconstructs one 4x4 block at 'dst' as ref + inverse-transform(in).
static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  int16x8x2_t coeffs;
  INIT_VECTOR2(coeffs, vld1q_s16(&in[0]), vld1q_s16(&in[8]));
  TransformPass(&coeffs);   // first 1-D pass (transposes on exit)
  TransformPass(&coeffs);   // second 1-D pass -> full 2-D transform
  Add4x4(coeffs.val[0], coeffs.val[1], ref, dst);
}
127
128 #else
129
// Inline-assembly inverse transform: reconstructs dst = clip(ref + IDCT(in))
// for one 4x4 block (stride BPS). Two 1-D passes with a vzip/vswp transpose
// in between, then a rounded descale by 3 bits and a saturated add of the
// 'ref' pixels.
static void ITransformOne(const uint8_t* ref,
                          const int16_t* in, uint8_t* dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    // q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val) + 4 >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    // NOTE(review): [ref] is advanced by the post-incremented loads and then
    // restored by the 'sub' above, so its value is unchanged on exit.
    // Strictly, GCC extended-asm input operands must not be modified at all;
    // move [ref] to the output list if this ever miscompiles.
    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q3",  // q3 (d6/d7) holds the loaded ref
      "q8", "q9", "q10", "q11"           // pixels and must be clobbered too
  );
}
243
244 #endif // WEBP_USE_INTRINSICS
245
// Inverse-transforms one 4x4 block, or two horizontally-adjacent blocks
// (coefficients at in[0..15] and in[16..31]) when 'do_two' is non-zero.
static void ITransform(const uint8_t* ref,
                       const int16_t* in, uint8_t* dst, int do_two) {
  ITransformOne(ref, in, dst);
  if (do_two != 0) {
    ITransformOne(&ref[4], &in[16], &dst[4]);
  }
}
253
254 // Load all 4x4 pixels into a single uint8x16_t variable.
// Gathers a 4x4 pixel block (stride BPS) into a single uint8x16_t,
// one 4-pixel row per 32b lane.
static uint8x16_t Load4x4(const uint8_t* src) {
  uint32x4_t rows = vdupq_n_u32(0);
  rows = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), rows, 0);
  rows = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), rows, 1);
  rows = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), rows, 2);
  rows = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), rows, 3);
  return vreinterpretq_u8_u32(rows);
}
263
264 // Forward transform.
265
266 #if defined(WEBP_USE_INTRINSICS)
267
// 4x4 transpose of the rows (A, B, C, D). Output packs transposed rows 0|1
// into '*out01' and rows 3|2 -- note the swapped order -- into '*out32'
// (matching the d0d1/d3d2 layout used by FTransform).
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
                                         const int16x4_t C, const int16x4_t D,
                                         int16x8_t* const out01,
                                         int16x8_t* const out32) {
  // Interleave 16b then 32b elements: standard two-step NEON 4x4 transpose.
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}
285
// Widening subtraction (a - b), returning the differences as signed 16b.
static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
                                         const uint8x8_t b) {
  const uint16x8_t diff = vsubl_u8(a, b);
  return vreinterpretq_s16_u16(diff);
}
290
// Forward 4x4 transform of the residual (src - ref), both with stride BPS,
// writing the 16 coefficients to 'out'. Two 1-D passes with a transpose
// after each; pass 1 uses rounding biases 1812/937 with a >>9 descale,
// pass 2 uses 12000/51000 with a >>16 descale (via vaddhn).
static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4(src);
    const uint8x16_t R0 = Load4x4(ref);
    const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {   // 1st pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);  // (a0|a1) << 3
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    // Rotation by the 2217/5352 constant pair.
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {   // 2nd pass
    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    // vaddhn == (x + bias) >> 16, keeping the high half.
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    // a3_eq_0 is 0 or -1 (all-ones): this undoes the extra (1<<16) above
    // when a3 == 0, implementing "+= (a3 != 0)".
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
347
348 #else
349
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
// Rotation constants of the forward transform (broadcast 4x for vld1):
static const int16_t kCoeff16[] = {
  5352,  5352, 5352, 5352, 2217, 2217, 2217, 2217
};
// Rounding biases: 1812/937 for pass 1 (>>9), 12000/51000 for pass 2 (>>16).
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};
360
// Inline-assembly forward transform (same contract as the intrinsics
// version above): computes the 4x4 transform of (src - ref), stride BPS,
// and stores the 16 coefficients to 'out'.
static void FTransform(const uint8_t* src, const uint8_t* ref,
                       int16_t* out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values in to q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}
473
474 #endif
475
// Loads lane 'LANE' of the int16x4_t 'VALUE' from 'src' and advances 'src'
// by 'stride' elements ('src' and 'stride' come from the enclosing scope).
#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)
480
// Forward Walsh-Hadamard transform of 16 input values spaced 16 int16_t
// apart (the per-block DC coefficients), writing 16 results to 'out'.
static void FTransformWHT(const int16_t* src, int16_t* out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  // Gather column-major: in.val[i] lane j holds src[(4 * j + i) * 16],
  // i.e. each in.val[i] is one column of the 4x4 input.
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // First pass, widened to 32b to avoid overflow:
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    // Second pass on the transposed intermediate, with a halving final step.
    const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
543 #undef LOAD_LANE_16b
544
545 //------------------------------------------------------------------------------
546 // Texture distortion
547 //
548 // We try to match the spectral content (weighted) between source and
549 // reconstructed samples.
550
551 // a 0123, b 0123
552 // a 4567, b 4567
553 // a 89ab, b 89ab
554 // a cdef, b cdef
555 //
556 // transpose
557 //
558 // a 048c, b 048c
559 // a 159d, b 159d
560 // a 26ae, b 26ae
561 // a 37bf, b 37bf
562 //
// Transposes the two interleaved 4x4 blocks in 'd4_in' at once: each row
// carries the a-block in its low 32b and the b-block in its high 32b
// (see the diagram above), and both halves are transposed in parallel.
static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
  const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
  const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
  const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
                                        vreinterpret_u16_u8(d2_tmp1.val[0]));
  const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
                                        vreinterpret_u16_u8(d2_tmp1.val[1]));

  // Note the 1 <-> 2 row swap: vtrn produces the rows out of order.
  d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
  d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
  d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
  d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
  return d4_in;
}
577
// Same dual transpose as DistoTranspose4x4U8, but on the widened 16b
// coefficients (a-block in the low half of each q register, b-block high).
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  // Note the 1 <-> 2 row swap, as in the u8 variant.
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}
591
// First (horizontal) Hadamard pass, widening the 8b inputs to 16b.
// Processes the interleaved a/b blocks together (each d register holds
// one row of block a in its low half and of block b in its high half).
static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
                                                        d4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
                                                        d4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
                                                        d4_in.val[2]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
                                                        d4_in.val[3]));
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}
613
// Second (vertical) Hadamard pass; the output rows hold the *absolute*
// values of the transformed coefficients, ready for weighting in DistoSum.
static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);

  // Rows 2 and 3 fuse the subtraction and abs via vabdq (|x - y|);
  // rows 0 and 1 need a separate vabsq after the add.
  q4_in.val[0] = vaddq_s16(q_a0, q_a1);
  q4_in.val[1] = vaddq_s16(q_a3, q_a2);
  q4_in.val[2] = vabdq_s16(q_a3, q_a2);
  q4_in.val[3] = vabdq_s16(q_a0, q_a1);
  q4_in.val[0] = vabsq_s16(q4_in.val[0]);
  q4_in.val[1] = vabsq_s16(q4_in.val[1]);
  return q4_in;
}
628
// Loads the 16 weights w[0..15] as four int16x4_t quarters.
static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
  const int16x8_t q_w07 = vreinterpretq_s16_u16(vld1q_u16(w + 0));
  const int16x8_t q_w8f = vreinterpretq_s16_u16(vld1q_u16(w + 8));
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(q_w07), vget_high_s16(q_w07),
               vget_low_s16(q_w8f), vget_high_s16(q_w8f));
  return d4_w;
}
640
// Computes the weighted-sum difference between the two interleaved blocks:
// sum_a(w[i] * |coeff_a[i]|) - sum_b(w[i] * |coeff_b[i]|). Per the layout
// set up in Disto4x4, the low half of each q4_in row holds the a-block
// values and the high half the b-block values, hence the vmull/vmlsl pairs.
static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
                                      const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  // Subtract the b-block contributions (high halves).
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  // Horizontal reduction of the four 32b accumulators to a single value
  // (replicated in both lanes of the result).
  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}
664
// Loads one 32b word from 'src' into lane 'LANE' of 'VALUE' (an uint32x2_t).
#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
667
668 // Hadamard transform
669 // Returns the weighted sum of the absolute value of transformed coefficients.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b: each d register holds one row of 'a' in its low 32b
  // and the matching row of 'b' in its high 32b, so both blocks go through
  // the transform together (see the diagram above DistoTranspose4x4U8).
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // horizontal pass
    const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
    const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
    const int16x4x4_t d4_w = DistoLoadW(w);
    // vertical pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
    const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
    // DistoSum yields sum(w * |coeffs(a)|) - sum(w * |coeffs(b)|).
    int32x2_t d_sum = DistoSum(q4_v, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
709 #undef LOAD_LANE_32b
710
Disto16x16(const uint8_t * const a,const uint8_t * const b,const uint16_t * const w)711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
712 const uint16_t* const w) {
713 int D = 0;
714 int x, y;
715 for (y = 0; y < 16 * BPS; y += 4 * BPS) {
716 for (x = 0; x < 16; x += 4) {
717 D += Disto4x4(a + x + y, b + x + y, w);
718 }
719 }
720 return D;
721 }
722
723 //------------------------------------------------------------------------------
724
// Builds the histogram of quantization bins over the transformed residuals
// of blocks [start_block, end_block) and hands it to VP8SetHistogramData.
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
                             int start_block, int end_block,
                             VP8Histogram* const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  int j;
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      // bin = min(|coeff| >> 3, MAX_COEFF_THRESH), eight lanes at a time.
      int k;
      const int16x8_t lo = vld1q_s16(&out[0]);
      const int16x8_t hi = vld1q_s16(&out[8]);
      const uint16x8_t abs_lo = vreinterpretq_u16_s16(vabsq_s16(lo));
      const uint16x8_t abs_hi = vreinterpretq_u16_s16(vabsq_s16(hi));
      const uint16x8_t bin_lo =
          vminq_u16(vshrq_n_u16(abs_lo, 3), max_coeff_thresh);
      const uint16x8_t bin_hi =
          vminq_u16(vshrq_n_u16(abs_hi, 3), max_coeff_thresh);
      vst1q_s16(&out[0], vreinterpretq_s16_u16(bin_lo));
      vst1q_s16(&out[8], vreinterpretq_s16_u16(bin_hi));
      // Tally each of the 16 bins.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}
754
755 //------------------------------------------------------------------------------
756
// Accumulates into '*sum' the squared differences of 16 pixel pairs.
static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
                                        const uint8_t* const b,
                                        uint32x4_t* const sum) {
  const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(a), vld1q_u8(b));
  const uint8x8_t lo = vget_low_u8(abs_diff);
  const uint8x8_t hi = vget_high_u8(abs_diff);
  uint16x8_t prod = vmull_u8(lo, lo);     // square the low 8 differences...
  prod = vmlal_u8(prod, hi, hi);          // ...and accumulate the high 8
  *sum = vpadalq_u16(*sum, prod);         // pair-wise add and accumulate
}
767
768 // Horizontal sum of all four uint32_t values in 'sum'.
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt(uint32x4_t sum) {
  const uint64x2_t pairs = vpaddlq_u32(sum);
  const uint64_t total = vgetq_lane_u64(pairs, 0) + vgetq_lane_u64(pairs, 1);
  return (int)total;
}
774
// Sum of squared errors over a 16x16 block (stride BPS).
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int row;
  for (row = 0; row < 16; ++row) {
    AccumulateSSE16(&a[row * BPS], &b[row * BPS], &sum);
  }
  return SumToInt(sum);
}
783
// Sum of squared errors over a 16x8 block (stride BPS).
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int row;
  for (row = 0; row < 8; ++row) {
    AccumulateSSE16(&a[row * BPS], &b[row * BPS], &sum);
  }
  return SumToInt(sum);
}
792
// Sum of squared errors over an 8x8 block (stride BPS).
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int row;
  for (row = 0; row < 8; ++row) {
    const uint8x8_t abs_diff =
        vabd_u8(vld1_u8(&a[row * BPS]), vld1_u8(&b[row * BPS]));
    // Square the 8 differences and fold the 16b products into 32b lanes.
    sum = vpadalq_u16(sum, vmull_u8(abs_diff, abs_diff));
  }
  return SumToInt(sum);
}
805
// Sum of squared errors over a 4x4 block (stride BPS).
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const uint8x16_t abs_diff = vabdq_u8(Load4x4(a), Load4x4(b));
  const uint8x8_t lo = vget_low_u8(abs_diff);
  const uint8x8_t hi = vget_high_u8(abs_diff);
  const uint16x8_t prod = vmlal_u8(vmull_u8(lo, lo), hi, hi);
  return SumToInt(vpaddlq_u16(prod));
}
814
815 //------------------------------------------------------------------------------
816
817 // Compilation with gcc-4.6.x is problematic for now.
818 #if !defined(WORK_AROUND_GCC)
819
// Quantizes the 8 coefficients in[offset..offset+7]: stores the
// *dequantized* values back into 'in' and returns the quantized levels
// (sign-restored, capped to MAX_LEVEL). Requires QFIX == 17 (asserted).
static int16x8_t Quantize(int16_t* const in,
                          const VP8Matrix* const mtx, int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                 // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));   // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                  // sign (0 or -1)
  const uint16x8_t c = vaddq_u16(b, sharp);                   // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  // vhaddq halves the sum, providing the extra ">> 1" so that the
  // vshrn_n(..., 16) below yields a total shift of QFIX == 17.
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);    // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  // Restore the sign via the (x ^ s) - s trick (s is 0 or -1).
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                   // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
846
// Byte-level table-lookup indices used by QuantizeBlock to store the 16
// quantized coefficients (2 bytes each, spread over out0/out1) in zigzag
// order. Row i selects the bytes for output coefficients [4*i, 4*i+3].
static const uint8_t kShuffles[4][8] = {
  { 0, 1, 2, 3, 8, 9, 16, 17 },
  { 10, 11, 4, 5, 6, 7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
853
// Quantizes one 4x4 block: 'in' holds 16 coefficients in natural order and is
// overwritten with their dequantized values; 'out' receives the quantized
// levels in zigzag order. Returns 1 if any level is non-zero, 0 otherwise.
static int QuantizeBlock(int16_t in[16], int16_t out[16],
                         const VP8Matrix* const mtx) {
  const int16x8_t out0 = Quantize(in, mtx, 0);   // first 8 coefficients
  const int16x8_t out1 = Quantize(in, mtx, 8);   // last 8 coefficients
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
    defined(__apple_build_version__) && (__apple_build_version__< 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  // NOTE(review): these uint64_t loads type-pun the int16_t output buffer,
  // which violates strict aliasing in standard C; presumably relied upon as a
  // compiler-specific fast path — confirm against the project's build flags.
  if (*(uint64_t*)(out + 0) != 0) return 1;
  if (*(uint64_t*)(out + 4) != 0) return 1;
  if (*(uint64_t*)(out + 8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}
895
// Quantizes two consecutive 4x4 blocks. Bit i of the result is set when
// block i has at least one non-zero quantized coefficient.
static int Quantize2Blocks(int16_t in[32], int16_t out[32],
                           const VP8Matrix* const mtx) {
  const int nz0 = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx);
  const int nz1 = QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx);
  return nz0 | (nz1 << 1);
}
903
904 #endif // !WORK_AROUND_GCC
905
906 //------------------------------------------------------------------------------
907 // Entry point
908
909 extern void VP8EncDspInitNEON(void);
910
// Installs the NEON implementations into the encoder's function pointers.
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  // Forward/inverse transforms.
  VP8ITransform = ITransform;
  VP8FTransform = FTransform;
  VP8FTransformWHT = FTransformWHT;

  // Distortion metric and histogram collection.
  VP8TDisto4x4 = Disto4x4;
  VP8TDisto16x16 = Disto16x16;
  VP8CollectHistogram = CollectHistogram;

  // Sum-of-squared-error helpers.
  VP8SSE16x16 = SSE16x16;
  VP8SSE16x8 = SSE16x8;
  VP8SSE8x8 = SSE8x8;
  VP8SSE4x4 = SSE4x4;
#if !defined(WORK_AROUND_GCC)
  // Quantization (disabled on compilers with the gcc-4.6.x issue above).
  VP8EncQuantizeBlock = QuantizeBlock;
  VP8EncQuantize2Blocks = Quantize2Blocks;
#endif
}
929
930 #else // !WEBP_USE_NEON
931
932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
933
934 #endif // WEBP_USE_NEON
935