// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (https://www.webmproject.org/code/)

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_NEON)

#include <assert.h>

#include "src/dsp/neon.h"
#include "src/enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Inverse transform.
// This code is pretty much the same as TransformOne in dec_neon.c, except
// for the subtraction to *ref. See the comments there for algorithmic
// explanations.

static const int16_t kC1 = WEBP_TRANSFORM_AC3_C1;
static const int16_t kC2 =
    WEBP_TRANSFORM_AC3_C2 / 2;  // half of kC2, actually. See comment above.

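// For reference, a scalar sketch of the arithmetic implemented below
// (mirroring TransformOne in dec_neon.c; MUL1/MUL2 are illustrative macros,
// not part of this file):
//   #define MUL1(x) ((((x) * kC1) >> 16) + (x))
//   #define MUL2(x) (((x) * 2 * kC2) >> 16)        // kC2 is pre-halved
//   a = in[0] + in[8];
//   b = in[0] - in[8];
//   c = MUL2(in[4]) - MUL1(in[12]);
//   d = MUL1(in[4]) + MUL2(in[12]);
//   out[0..3] = { a + d, b + c, b - c, a - d }
// One such pass is applied to the columns and one to the rows; the result is
// then descaled by ((v + 4) >> 3) and added to the 'ref' prediction.
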
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional on
// the WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
#if defined(WEBP_USE_INTRINSICS)

// Treats 'v' as a uint8x8_t and zero-extends it to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}

// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
                                                 const int16x8_t dst01,
                                                 const int16x8_t dst23) {
  // Unsigned saturate to 8b.
  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);

  // Store the results.
  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}

static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
                                    const int16x8_t row23,
                                    const uint8_t* WEBP_RESTRICT const ref,
                                    uint8_t* WEBP_RESTRICT const dst) {
  uint32x2_t dst01 = vdup_n_u32(0);
  uint32x2_t dst23 = vdup_n_u32(0);

  // Load the reference pixels.
  dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);

  {
    // Convert to 16b.
    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);

    // Descale with rounding: dst + ((row + 4) >> 3).
    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    // Add the inverse transform.
    SaturateAndStore4x4_NEON(dst, out01, out23);
  }
}

static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
                                          const int16x8_t in1,
                                          int16x8x2_t* const out) {
  // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
                                                  // b0 d0 b1 d1 b2 d2 ...
  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}

static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
  // {rows} = in0 | in4
  //          in8 | in12
  // B1 = in4 | in12
  const int16x8_t B1 =
      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  // C0 = kC1 * in4 | kC1 * in12
  // C1 = kC2 * in4 | kC2 * in12
  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 + in8
  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
                                vget_low_s16(rows->val[1]));   // in0 - in8
  // c = kC2 * in4 - kC1 * in12
  // d = kC1 * in4 + kC2 * in12
  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
  Transpose8x2_NEON(E0, E1, rows);
}

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  int16x8x2_t rows;
  INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
  TransformPass_NEON(&rows);
  TransformPass_NEON(&rows);
  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}

#else

static void ITransformOne_NEON(const uint8_t* WEBP_RESTRICT ref,
                               const int16_t* WEBP_RESTRICT in,
                               uint8_t* WEBP_RESTRICT dst) {
  const int kBPS = BPS;
  const int16_t kC1C2[] = { kC1, kC2, 0, 0 };

  __asm__ volatile (
    "vld1.16         {q1, q2}, [%[in]]           \n"
    "vld1.16         {d0}, [%[kC1C2]]            \n"

    // d2: in[0]
    // d3: in[8]
    // d4: in[4]
    // d5: in[12]
    "vswp            d3, d4                      \n"

    // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    // q9 = {in[4], in[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = in[0] + in[8]
    // d23 = b = in[0] - in[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    //  q8 = in[4]/[12] * kC1 >> 16
    "vshr.s16        q8, q8, #1                  \n"

    // Add {in[4], in[12]} back after the multiplication.
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = in[4]*kC2 - in[12]*kC1
    // d21 = d = in[4]*kC1 + in[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    "vswp            d3, d4                      \n"

    // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    "vqdmulh.s16     q8, q2, d0[0]               \n"
    "vqdmulh.s16     q9, q2, d0[1]               \n"

    // d22 = a = tmp[0] + tmp[8]
    // d23 = b = tmp[0] - tmp[8]
    "vqadd.s16       d22, d2, d3                 \n"
    "vqsub.s16       d23, d2, d3                 \n"

    "vshr.s16        q8, q8, #1                  \n"
    "vqadd.s16       q8, q2, q8                  \n"

    // d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    // d21 = d = tmp[4]*kC1 + tmp[12]*kC2
    "vqsub.s16       d20, d18, d17               \n"
    "vqadd.s16       d21, d19, d16               \n"

    // d2 = tmp[0] = a + d
    // d3 = tmp[1] = b + c
    // d4 = tmp[2] = b - c
    // d5 = tmp[3] = a - d
    "vqadd.s16       d2, d22, d21                \n"
    "vqadd.s16       d3, d23, d20                \n"
    "vqsub.s16       d4, d23, d20                \n"
    "vqsub.s16       d5, d22, d21                \n"

    "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"

    "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"

    // (val + 4) >> 3
    "vrshr.s16       d2, d2, #3                  \n"
    "vrshr.s16       d3, d3, #3                  \n"
    "vrshr.s16       d4, d4, #3                  \n"
    "vrshr.s16       d5, d5, #3                  \n"

    "vzip.16         q1, q2                      \n"
    "vzip.16         q1, q2                      \n"

    // Must accumulate before saturating.
    "vmovl.u8        q8, d6                      \n"
    "vmovl.u8        q9, d7                      \n"

    "vqadd.s16       q1, q1, q8                  \n"
    "vqadd.s16       q2, q2, q9                  \n"

    "vqmovun.s16     d0, q1                      \n"
    "vqmovun.s16     d1, q2                      \n"

    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    "vst1.32         d1[1], [%[dst]]             \n"

    : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
  );
}

#endif    // WEBP_USE_INTRINSICS

static void ITransform_NEON(const uint8_t* WEBP_RESTRICT ref,
                            const int16_t* WEBP_RESTRICT in,
                            uint8_t* WEBP_RESTRICT dst, int do_two) {
  ITransformOne_NEON(ref, in, dst);
  if (do_two) {
    // The second block's coefficients follow in in[16..31]; its output lands
    // 4 pixels to the right.
    ITransformOne_NEON(ref + 4, in + 16, dst + 4);
  }
}

// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
  uint32x4_t out = vdupq_n_u32(0);
  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
  return vreinterpretq_u8_u32(out);
}

// Forward transform.

#if defined(WEBP_USE_INTRINSICS)

static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
                                              const int16x4_t B,
                                              const int16x4_t C,
                                              const int16x4_t D,
                                              int16x8_t* const out01,
                                              int16x8_t* const out32) {
  const int16x4x2_t AB = vtrn_s16(A, B);
  const int16x4x2_t CD = vtrn_s16(C, D);
  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
                                     vreinterpret_s32_s16(CD.val[0]));
  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
                                     vreinterpret_s32_s16(CD.val[1]));
  *out01 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
                   vreinterpret_s64_s32(tmp13.val[0])));
  *out32 = vreinterpretq_s16_s64(
      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
                   vreinterpret_s64_s32(tmp02.val[1])));
}

static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
                                              const uint8x8_t b) {
  return vreinterpretq_s16_u16(vsubl_u8(a, b));
}

static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
  {
    const uint8x16_t S0 = Load4x4_NEON(src);
    const uint8x16_t R0 = Load4x4_NEON(ref);
    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
    const int16x4_t D0 = vget_low_s16(D0D1);
    const int16x4_t D1 = vget_high_s16(D0D1);
    const int16x4_t D2 = vget_low_s16(D2D3);
    const int16x4_t D3 = vget_high_s16(D2D3);
    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
  }
  {    // first pass
    const int32x4_t kCst937 = vdupq_n_s32(937);
    const int32x4_t kCst1812 = vdupq_n_s32(1812);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
                                    vget_high_s16(a0a1_2));
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
  }
  {    // second pass
    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    const int32x4_t kCst51000 = vdupq_n_s32(51000);
    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    const int16x4_t a3_eq_0 =
        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}

#else

// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
  5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
};
static const int32_t kCoeff32[] = {
   1812,  1812,  1812,  1812,
    937,   937,   937,   937,
  12000, 12000, 12000, 12000,
  51000, 51000, 51000, 51000
};

static void FTransform_NEON(const uint8_t* WEBP_RESTRICT src,
                            const uint8_t* WEBP_RESTRICT ref,
                            int16_t* WEBP_RESTRICT out) {
  const int kBPS = BPS;
  const uint8_t* src_ptr = src;
  const uint8_t* ref_ptr = ref;
  const int16_t* coeff16 = kCoeff16;
  const int32_t* coeff32 = kCoeff32;

  __asm__ volatile (
    // load src into q4, q5 in high half
    "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    "vld1.8 {d11}, [%[src_ptr]]               \n"

    // load ref into q6, q7 in high half
    "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    "vld1.8 {d15}, [%[ref_ptr]]               \n"

    // Pack the high values in to q4 and q6
    "vtrn.32     q4, q5                       \n"
    "vtrn.32     q6, q7                       \n"

    // d[0-3] = src - ref
    "vsubl.u8    q0, d8, d12                  \n"
    "vsubl.u8    q1, d9, d13                  \n"

    // load coeff16 into q8(d16=5352, d17=2217)
    "vld1.16     {q8}, [%[coeff16]]           \n"

    // load coeff32 high half into q9 = 1812, q10 = 937
    "vld1.32     {q9, q10}, [%[coeff32]]!     \n"

    // load coeff32 low half into q11=12000, q12=51000
    "vld1.32     {q11,q12}, [%[coeff32]]      \n"

    // part 1
    // Transpose. Register dN is the same as dN in C
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3

    "vadd.s16        d0, d4, d5               \n" // a0 + a1
    "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    "vsub.s16        d2, d4, d5               \n" // a0 - a1
    "vshl.s16        d2, d2, #3               \n" // temp[2+i*4] = (a0-a1) << 3

    "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352

    // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    "vshrn.s32       d1, q9, #9               \n"
    "vshrn.s32       d3, q10, #9              \n"

    // part 2
    // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    "vtrn.32         d0, d2                   \n"
    "vtrn.32         d1, d3                   \n"
    "vtrn.16         d0, d1                   \n"
    "vtrn.16         d2, d3                   \n"

    "vmov.s16        d26, #7                  \n"

    "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    "vadd.s16        d4, d4, d26              \n" // a1 + 7
    "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]

    "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7

    "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000

    "vceq.s16        d4, d7, #0               \n"

    "vshr.s16        d0, d0, #4               \n"
    "vshr.s16        d2, d2, #4               \n"

    "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000

    "vmvn            d4, d4                   \n" // !(d1 == 0)
    // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    "vshrn.s32       d1, q11, #16             \n"
    // op[4] += (d1!=0)
    "vsub.s16        d1, d1, d4               \n"
    // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    "vshrn.s32       d3, q12, #16             \n"

    // set result to out array
    "vst1.16         {q0, q1}, [%[out]]   \n"
    : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
      [coeff32] "+r"(coeff32)          // modified registers
    : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
      [out] "r"(out)                   // constants
    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
      "q10", "q11", "q12", "q13"       // clobbered
  );
}

#endif  // WEBP_USE_INTRINSICS
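
// For reference, a scalar sketch of the forward transform that both variants
// above implement (cf. FTransform_C in src/dsp/enc.c):
//   int tmp[16];
//   for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {   // rows
//     const int d0 = src[0] - ref[0], d1 = src[1] - ref[1];
//     const int d2 = src[2] - ref[2], d3 = src[3] - ref[3];
//     const int a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3;
//     tmp[0 + i * 4] = (a0 + a1) * 8;
//     tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
//     tmp[2 + i * 4] = (a0 - a1) * 8;
//     tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
//   }
//   for (i = 0; i < 4; ++i) {                           // columns
//     const int a0 = tmp[0 + i] + tmp[12 + i], a1 = tmp[4 + i] + tmp[8 + i];
//     const int a2 = tmp[4 + i] - tmp[8 + i], a3 = tmp[0 + i] - tmp[12 + i];
//     out[0 + i]  = (a0 + a1 + 7) >> 4;
//     out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
//     out[8 + i]  = (a0 - a1 + 7) >> 4;
//     out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;
//   }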

#define LOAD_LANE_16b(VALUE, LANE) do {             \
  (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
  src += stride;                                    \
} while (0)

static void FTransformWHT_NEON(const int16_t* WEBP_RESTRICT src,
                               int16_t* WEBP_RESTRICT out) {
  const int stride = 16;
  const int16x4_t zero = vdup_n_s16(0);
  int32x4x4_t tmp0;
  int16x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  LOAD_LANE_16b(in.val[0], 0);
  LOAD_LANE_16b(in.val[1], 0);
  LOAD_LANE_16b(in.val[2], 0);
  LOAD_LANE_16b(in.val[3], 0);
  LOAD_LANE_16b(in.val[0], 1);
  LOAD_LANE_16b(in.val[1], 1);
  LOAD_LANE_16b(in.val[2], 1);
  LOAD_LANE_16b(in.val[3], 1);
  LOAD_LANE_16b(in.val[0], 2);
  LOAD_LANE_16b(in.val[1], 2);
  LOAD_LANE_16b(in.val[2], 2);
  LOAD_LANE_16b(in.val[3], 2);
  LOAD_LANE_16b(in.val[0], 3);
  LOAD_LANE_16b(in.val[1], 3);
  LOAD_LANE_16b(in.val[2], 3);
  LOAD_LANE_16b(in.val[3], 3);

  {
    // a0 = in[0 * 16] + in[2 * 16]
    // a1 = in[1 * 16] + in[3 * 16]
    // a2 = in[1 * 16] - in[3 * 16]
    // a3 = in[0 * 16] - in[2 * 16]
    const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    tmp0.val[0] = vaddq_s32(a0, a1);
    tmp0.val[1] = vaddq_s32(a3, a2);
    tmp0.val[2] = vsubq_s32(a3, a2);
    tmp0.val[3] = vsubq_s32(a0, a1);
  }
  {
    const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
    // a0 = tmp[0 + i] + tmp[ 8 + i]
    // a1 = tmp[4 + i] + tmp[12 + i]
    // a2 = tmp[4 + i] - tmp[12 + i]
    // a3 = tmp[0 + i] - tmp[ 8 + i]
    const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    const int16x4_t out0 = vmovn_s32(b0);
    const int16x4_t out1 = vmovn_s32(b1);
    const int16x4_t out2 = vmovn_s32(b2);
    const int16x4_t out3 = vmovn_s32(b3);

    vst1_s16(out +  0, out0);
    vst1_s16(out +  4, out1);
    vst1_s16(out +  8, out2);
    vst1_s16(out + 12, out3);
  }
}
#undef LOAD_LANE_16b
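
// For reference, a scalar sketch of the above (cf. FTransformWHT_C in
// src/dsp/enc.c). The input is the DC coefficients of the 16 luma blocks,
// hence the stride of 16 between consecutive inputs:
//   for (i = 0; i < 4; ++i, in += 64) {                 // vertical pass
//     const int a0 = in[0 * 16] + in[2 * 16], a1 = in[1 * 16] + in[3 * 16];
//     const int a2 = in[1 * 16] - in[3 * 16], a3 = in[0 * 16] - in[2 * 16];
//     tmp[0 + i * 4] = a0 + a1;  tmp[1 + i * 4] = a3 + a2;
//     tmp[2 + i * 4] = a3 - a2;  tmp[3 + i * 4] = a0 - a1;
//   }
//   for (i = 0; i < 4; ++i) {                           // horizontal pass
//     const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
//     const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
//     out[0 + i]  = (a0 + a1) >> 1;  out[4 + i]  = (a3 + a2) >> 1;
//     out[8 + i]  = (a3 - a2) >> 1;  out[12 + i] = (a0 - a1) >> 1;
//   }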

//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.

// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
  const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
  const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
  const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[0]));
  const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
                                        vreinterpretq_s32_s16(q2_tmp1.val[1]));
  q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
  q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
  q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
  q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
  return q4_in;
}

static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
    const int16x8x4_t q4_in) {
  // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
  // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
  const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
  const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
  const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
  int16x8x4_t q4_out;
  // tmp[0] = a0 + a1
  // tmp[1] = a3 + a2
  // tmp[2] = a3 - a2
  // tmp[3] = a0 - a1
  // The absolute value needed by the weighted sum is folded into this last
  // pass (vabsq/vabdq).
  INIT_VECTOR4(q4_out,
               vabsq_s16(vaddq_s16(q_a0, q_a1)),
               vabsq_s16(vaddq_s16(q_a3, q_a2)),
               vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
  const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
                                                        q4_in.val[3]));
  const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
                                                        q4_in.val[2]));
  int16x8x4_t q4_out;

  INIT_VECTOR4(q4_out,
               vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
               vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
  return q4_out;
}

static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
  int16x4x4_t d4_w;
  INIT_VECTOR4(d4_w,
               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
  return d4_w;
}

static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
                                           const int16x4x4_t d4_w) {
  int32x2_t d_sum;
  // sum += w[ 0] * abs(b0);
  // sum += w[ 4] * abs(b1);
  // sum += w[ 8] * abs(b2);
  // sum += w[12] * abs(b3);
  // Each register holds block 'a' in its low half and block 'b' in its high
  // half, so the multiply/multiply-subtract pairs below accumulate
  // (sum1 - sum2) directly.
  int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
  int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
  int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
  int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
  q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
  q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
  q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
  q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));

  q_sum0 = vaddq_s32(q_sum0, q_sum1);
  q_sum2 = vaddq_s32(q_sum2, q_sum3);
  q_sum2 = vaddq_s32(q_sum0, q_sum2);
  d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
  d_sum = vpadd_s32(d_sum, d_sum);
  return d_sum;
}

#define LOAD_LANE_32b(src, VALUE, LANE) \
    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))

// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* WEBP_RESTRICT const a,
                         const uint8_t* WEBP_RESTRICT const b,
                         const uint16_t* WEBP_RESTRICT const w) {
  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
  uint8x8x4_t d4_in;

  // load data a, b
  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
  LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
  LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
  LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
  LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
  INIT_VECTOR4(d4_in,
               vreinterpret_u8_u32(d_in_ab_0123),
               vreinterpret_u8_u32(d_in_ab_4567),
               vreinterpret_u8_u32(d_in_ab_89ab),
               vreinterpret_u8_u32(d_in_ab_cdef));

  {
    // Vertical pass first to avoid a transpose (vertical and horizontal passes
    // are commutative because w/kWeightY is symmetric) and subsequent
    // transpose.
    const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
    const int16x4x4_t d4_w = DistoLoadW_NEON(w);
    // horizontal pass
    const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
    const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
    int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);

    // abs(sum2 - sum1) >> 5
    d_sum = vabs_s32(d_sum);
    d_sum = vshr_n_s32(d_sum, 5);
    return vget_lane_s32(d_sum, 0);
  }
}
#undef LOAD_LANE_32b
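
// For reference, a scalar sketch of what Disto4x4 computes (cf. TTransform
// and Disto4x4_C in src/dsp/enc.c):
//   static int TTransform(const uint8_t* in, const uint16_t* w) {
//     int i, sum = 0, tmp[16];
//     for (i = 0; i < 4; ++i, in += BPS) {       // horizontal pass
//       const int a0 = in[0] + in[2], a1 = in[1] + in[3];
//       const int a2 = in[1] - in[3], a3 = in[0] - in[2];
//       tmp[0 + i * 4] = a0 + a1;  tmp[1 + i * 4] = a3 + a2;
//       tmp[2 + i * 4] = a3 - a2;  tmp[3 + i * 4] = a0 - a1;
//     }
//     for (i = 0; i < 4; ++i, ++w) {             // vertical pass
//       const int a0 = tmp[0 + i] + tmp[8 + i], a1 = tmp[4 + i] + tmp[12 + i];
//       const int a2 = tmp[4 + i] - tmp[12 + i], a3 = tmp[0 + i] - tmp[8 + i];
//       sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)
//            + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
//     }
//     return sum;
//   }
//   // Disto4x4(a, b, w) == abs(TTransform(b, w) - TTransform(a, w)) >> 5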

static int Disto16x16_NEON(const uint8_t* WEBP_RESTRICT const a,
                           const uint8_t* WEBP_RESTRICT const b,
                           const uint16_t* WEBP_RESTRICT const w) {
  int D = 0;
  int x, y;
  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    for (x = 0; x < 16; x += 4) {
      D += Disto4x4_NEON(a + x + y, b + x + y, w);
    }
  }
  return D;
}

//------------------------------------------------------------------------------

static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
                                  const uint8_t* WEBP_RESTRICT pred,
                                  int start_block, int end_block,
                                  VP8Histogram* WEBP_RESTRICT const histo) {
  const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
  int j;
  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
  for (j = start_block; j < end_block; ++j) {
    int16_t out[16];
    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    {
      int k;
      // Convert coefficients to bins: bin = min(abs(coeff) >> 3, thresh).
      const int16x8_t a0 = vld1q_s16(out + 0);
      const int16x8_t b0 = vld1q_s16(out + 8);
      const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
      const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
      const uint16x8_t a2 = vshrq_n_u16(a1, 3);
      const uint16x8_t b2 = vshrq_n_u16(b1, 3);
      const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
      const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
      vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
      vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
      // 'out' now holds the bin indices; accumulate them.
      for (k = 0; k < 16; ++k) {
        ++distribution[out[k]];
      }
    }
  }
  VP8SetHistogramData(distribution, histo);
}

//------------------------------------------------------------------------------

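// Each helper below accumulates the sum of squared differences,
// sum((a[i] - b[i])^2): |a - b| fits in 8 bits, so its square fits in 16 bits
// (vabd + vmull), and the products are then widened pairwise into 32-bit
// accumulator lanes.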
static WEBP_INLINE void AccumulateSSE16_NEON(
    const uint8_t* WEBP_RESTRICT const a, const uint8_t* WEBP_RESTRICT const b,
    uint32x4_t* const sum) {
  const uint8x16_t a0 = vld1q_u8(a);
  const uint8x16_t b0 = vld1q_u8(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}

// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
#if WEBP_AARCH64
  return (int)vaddvq_u32(sum);
#else
  const uint64x2_t sum2 = vpaddlq_u32(sum);
  const uint32x2_t sum3 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(sum2)),
                                   vreinterpret_u32_u64(vget_high_u64(sum2)));
  return (int)vget_lane_u32(sum3, 0);
#endif
}

static int SSE16x16_NEON(const uint8_t* WEBP_RESTRICT a,
                         const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 16; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE16x8_NEON(const uint8_t* WEBP_RESTRICT a,
                        const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
  }
  return SumToInt_NEON(sum);
}

static int SSE8x8_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  uint32x4_t sum = vdupq_n_u32(0);
  int y;
  for (y = 0; y < 8; ++y) {
    const uint8x8_t a0 = vld1_u8(a + y * BPS);
    const uint8x8_t b0 = vld1_u8(b + y * BPS);
    const uint8x8_t abs_diff = vabd_u8(a0, b0);
    const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    sum = vpadalq_u16(sum, prod);
  }
  return SumToInt_NEON(sum);
}

static int SSE4x4_NEON(const uint8_t* WEBP_RESTRICT a,
                       const uint8_t* WEBP_RESTRICT b) {
  const uint8x16_t a0 = Load4x4_NEON(a);
  const uint8x16_t b0 = Load4x4_NEON(b);
  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
                                    vget_low_u8(abs_diff));
  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
                                    vget_high_u8(abs_diff));
  /* pair-wise adds and widen */
  const uint32x4_t sum1 = vpaddlq_u16(prod1);
  const uint32x4_t sum2 = vpaddlq_u16(prod2);
  return SumToInt_NEON(vaddq_u32(sum1, sum2));
}

//------------------------------------------------------------------------------

// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)

static int16x8_t Quantize_NEON(int16_t* WEBP_RESTRICT const in,
                               const VP8Matrix* WEBP_RESTRICT const mtx,
                               int offset) {
  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);

  const int16x8_t a = vld1q_s16(in + offset);                // in
  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
  const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
  vst1q_s16(in + offset, c4);  // write back the dequantized coefficients
  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
  return c3;
}
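
// Per coefficient, the above is roughly this scalar computation (cf.
// QuantizeBlock_C in src/dsp/enc.c, minus the zero-threshold shortcut):
//   sign  = (in[j] < 0);
//   coeff = abs(in[j]) + mtx->sharpen_[j];
//   level = min((coeff * mtx->iq_[j] + mtx->bias_[j]) >> QFIX, MAX_LEVEL);
//   level = sign ? -level : level;      // returned value
//   in[j] = level * mtx->q_[j];         // dequantized, written back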

static const uint8_t kShuffles[4][8] = {
  { 0,   1,  2,  3,  8,  9, 16, 17 },
  { 10, 11,  4,  5,  6,  7, 12, 13 },
  { 18, 19, 24, 25, 26, 27, 20, 21 },
  { 14, 15, 22, 23, 28, 29, 30, 31 }
};
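
// These tables implement the zigzag scan order,
//   kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 },
// at byte granularity: 16-bit coefficient n occupies bytes (2n, 2n + 1) of
// the concatenated out0|out1 registers.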

static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
                              const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
  const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
  uint8x8x4_t shuffles;
  // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
  // non-standard versions there.
#if defined(__APPLE__) && WEBP_AARCH64 && \
    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
  uint8x16x2_t all_out;
  INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
  INIT_VECTOR4(shuffles,
               vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
  uint8x8x4_t all_out;
  INIT_VECTOR4(all_out,
               vreinterpret_u8_s16(vget_low_s16(out0)),
               vreinterpret_u8_s16(vget_high_s16(out0)),
               vreinterpret_u8_s16(vget_low_s16(out1)),
               vreinterpret_u8_s16(vget_high_s16(out1)));
  INIT_VECTOR4(shuffles,
               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
  // Zigzag reordering
  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
  // test zeros
  if (*(uint64_t*)(out +  0) != 0) return 1;
  if (*(uint64_t*)(out +  4) != 0) return 1;
  if (*(uint64_t*)(out +  8) != 0) return 1;
  if (*(uint64_t*)(out + 12) != 0) return 1;
  return 0;
}

// Returns a bitfield: bit n is set if block n has non-zero coefficients.
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
                                const VP8Matrix* WEBP_RESTRICT const mtx) {
  int nz;
  nz  = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
  nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
  return nz;
}

#endif   // !WORK_AROUND_GCC

#if WEBP_AARCH64

#if BPS == 32
#define DC4_VE4_HE4_TM4_NEON(dst, tbl, res, lane)                              \
  do {                                                                         \
    uint8x16_t r;                                                              \
    r = vqtbl2q_u8(qcombined, tbl);                                            \
    r = vreinterpretq_u8_u32(                                                  \
        vsetq_lane_u32(vget_lane_u32(vreinterpret_u32_u8(res), lane),          \
                       vreinterpretq_u32_u8(r), 1));                           \
    vst1q_u8(dst, r);                                                          \
  } while (0)

#define RD4_VR4_LD4_VL4_NEON(dst, tbl)                                         \
  do {                                                                         \
    uint8x16_t r;                                                              \
    r = vqtbl2q_u8(qcombined, tbl);                                            \
    vst1q_u8(dst, r);                                                          \
  } while (0)

static void Intra4Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                             const uint8_t* WEBP_RESTRICT top) {
  // 0   1   2   3   4   5   6   7   8   9  10  11  12  13
  //     L   K   J   I   X   A   B   C   D   E   F   G   H
  //    -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
  static const uint8_t kLookupTbl1[64] = {
    0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 12, 12,
    3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0,
    4, 20, 21, 22,  3, 18,  2, 17,  3, 19,  4, 20,  2, 17,  1, 16,
    2, 18,  3, 19,  1, 16, 31, 31,  1, 17,  2, 18, 31, 31, 31, 31
  };

  static const uint8_t kLookupTbl2[64] = {
    20, 21, 22, 23,  5,  6,  7,  8, 22, 23, 24, 25,  6,  7,  8,  9,
    19, 20, 21, 22, 20, 21, 22, 23, 23, 24, 25, 26, 22, 23, 24, 25,
    18, 19, 20, 21, 19,  5,  6,  7, 24, 25, 26, 27,  7,  8,  9, 26,
    17, 18, 19, 20, 18, 20, 21, 22, 25, 26, 27, 28, 23, 24, 25, 27
  };

  static const uint8_t kLookupTbl3[64] = {
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 19, 19, 19, 19,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 18, 18, 18, 18,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 17, 17, 17, 17,
    30, 30, 30, 30,  0,  0,  0,  0, 21, 22, 23, 24, 16, 16, 16, 16
  };

  const uint8x16x4_t lookup_avgs1 = vld1q_u8_x4(kLookupTbl1);
  const uint8x16x4_t lookup_avgs2 = vld1q_u8_x4(kLookupTbl2);
  const uint8x16x4_t lookup_avgs3 = vld1q_u8_x4(kLookupTbl3);

  const uint8x16_t preload = vld1q_u8(top - 5);
  uint8x16x2_t qcombined;
  uint8x16_t result0, result1;

  uint8x16_t a = vqtbl1q_u8(preload, lookup_avgs1.val[0]);
  uint8x16_t b = preload;
  uint8x16_t c = vextq_u8(a, a, 2);

  uint8x16_t avg3_all = vrhaddq_u8(vhaddq_u8(a, c), b);
  uint8x16_t avg2_all = vrhaddq_u8(a, b);

  uint8x8_t preload_x8, sub_a, sub_c;
  uint8_t result_u8;
  uint8x8_t res_lo, res_hi;
  uint8x16_t full_b;
  uint16x8_t sub, sum_lo, sum_hi;

  preload_x8 = vget_low_u8(c);
  preload_x8 = vset_lane_u8(vgetq_lane_u8(preload, 0), preload_x8, 3);

  result_u8 = (vaddlv_u8(preload_x8) + 4) >> 3;

  avg3_all = vsetq_lane_u8(vgetq_lane_u8(preload, 0), avg3_all, 15);
  avg3_all = vsetq_lane_u8(result_u8, avg3_all, 14);

  qcombined.val[0] = avg2_all;
  qcombined.val[1] = avg3_all;

  sub_a = vdup_laneq_u8(preload, 4);

  // preload = {a,b,c,d,...} => full_b = {d,d,d,d,c,c,c,c,b,b,b,b,a,a,a,a}
  full_b = vqtbl1q_u8(preload, lookup_avgs1.val[1]);
  // preload = {a,b,c,d,...} => sub_c = {a,b,c,d,a,b,c,d,a,b,c,d,a,b,c,d}
  sub_c = vreinterpret_u8_u32(vdup_n_u32(
      vgetq_lane_u32(vreinterpretq_u32_u8(vextq_u8(preload, preload, 5)), 0)));

  sub = vsubl_u8(sub_c, sub_a);
  sum_lo = vaddw_u8(sub, vget_low_u8(full_b));
  res_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));

  sum_hi = vaddw_u8(sub, vget_high_u8(full_b));
  res_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));

  // DC4, VE4, HE4, TM4
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 0, lookup_avgs3.val[0], res_lo, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 1, lookup_avgs3.val[1], res_lo, 1);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 2, lookup_avgs3.val[2], res_hi, 0);
  DC4_VE4_HE4_TM4_NEON(dst + I4DC4 + BPS * 3, lookup_avgs3.val[3], res_hi, 1);

  // RD4, VR4, LD4, VL4
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 0, lookup_avgs2.val[0]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 1, lookup_avgs2.val[1]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 2, lookup_avgs2.val[2]);
  RD4_VR4_LD4_VL4_NEON(dst + I4RD4 + BPS * 3, lookup_avgs2.val[3]);

  // HD4, HU4
  result0 = vqtbl2q_u8(qcombined, lookup_avgs1.val[2]);
  result1 = vqtbl2q_u8(qcombined, lookup_avgs1.val[3]);

  vst1_u8(dst + I4HD4 + BPS * 0, vget_low_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 1, vget_high_u8(result0));
  vst1_u8(dst + I4HD4 + BPS * 2, vget_low_u8(result1));
  vst1_u8(dst + I4HD4 + BPS * 3, vget_high_u8(result1));
}
#endif  // BPS == 32

static WEBP_INLINE void Fill_NEON(uint8_t* dst, const uint8_t value) {
  uint8x16_t a = vdupq_n_u8(value);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void Fill16_NEON(uint8_t* dst, const uint8_t* src) {
  uint8x16_t a = vld1q_u8(src);
  int i;
  for (i = 0; i < 16; i++) {
    vst1q_u8(dst + BPS * i, a);
  }
}

static WEBP_INLINE void HorizontalPred16_NEON(uint8_t* dst,
                                              const uint8_t* left) {
  uint8x16_t a;

  if (left == NULL) {
    Fill_NEON(dst, 129);
    return;
  }

  a = vld1q_u8(left + 0);
  vst1q_u8(dst + BPS * 0, vdupq_laneq_u8(a, 0));
  vst1q_u8(dst + BPS * 1, vdupq_laneq_u8(a, 1));
  vst1q_u8(dst + BPS * 2, vdupq_laneq_u8(a, 2));
  vst1q_u8(dst + BPS * 3, vdupq_laneq_u8(a, 3));
  vst1q_u8(dst + BPS * 4, vdupq_laneq_u8(a, 4));
  vst1q_u8(dst + BPS * 5, vdupq_laneq_u8(a, 5));
  vst1q_u8(dst + BPS * 6, vdupq_laneq_u8(a, 6));
  vst1q_u8(dst + BPS * 7, vdupq_laneq_u8(a, 7));
  vst1q_u8(dst + BPS * 8, vdupq_laneq_u8(a, 8));
  vst1q_u8(dst + BPS * 9, vdupq_laneq_u8(a, 9));
  vst1q_u8(dst + BPS * 10, vdupq_laneq_u8(a, 10));
  vst1q_u8(dst + BPS * 11, vdupq_laneq_u8(a, 11));
  vst1q_u8(dst + BPS * 12, vdupq_laneq_u8(a, 12));
  vst1q_u8(dst + BPS * 13, vdupq_laneq_u8(a, 13));
  vst1q_u8(dst + BPS * 14, vdupq_laneq_u8(a, 14));
  vst1q_u8(dst + BPS * 15, vdupq_laneq_u8(a, 15));
}

static WEBP_INLINE void VerticalPred16_NEON(uint8_t* dst, const uint8_t* top) {
  if (top != NULL) {
    Fill16_NEON(dst, top);
  } else {
    Fill_NEON(dst, 127);
  }
}

static WEBP_INLINE void DCMode_NEON(uint8_t* dst, const uint8_t* left,
                                    const uint8_t* top) {
  uint8_t s;

  if (top != NULL) {
    uint16_t dc;
    dc = vaddlvq_u8(vld1q_u8(top));
    if (left != NULL) {
      // top and left present: s = (dc + 16) >> 5.
      dc += vaddlvq_u8(vld1q_u8(left));
      s = vqrshrnh_n_u16(dc, 5);
    } else {
      // top, but no left: s = (dc + 8) >> 4.
      s = vqrshrnh_n_u16(dc, 4);
    }
  } else {
    if (left != NULL) {
      uint16_t dc;
      // left, but no top: s = (dc + 8) >> 4.
      dc = vaddlvq_u8(vld1q_u8(left));
      s = vqrshrnh_n_u16(dc, 4);
    } else {
      // No top, no left: use a mid-gray default.
      s = 0x80;
    }
  }
  Fill_NEON(dst, s);
}

static WEBP_INLINE void TrueMotionHelper_NEON(uint8_t* dst,
                                              const uint8x8_t outer,
                                              const uint8x8x2_t inner,
                                              const uint16x8_t a, int i,
                                              const int n) {
  uint8x8_t d1, d2;
  uint16x8_t r1, r2;

  r1 = vaddl_u8(outer, inner.val[0]);
  r1 = vqsubq_u16(r1, a);
  d1 = vqmovun_s16(vreinterpretq_s16_u16(r1));
  r2 = vaddl_u8(outer, inner.val[1]);
  r2 = vqsubq_u16(r2, a);
  d2 = vqmovun_s16(vreinterpretq_s16_u16(r2));
  vst1_u8(dst + BPS * (i * 4 + n), d1);
  vst1_u8(dst + BPS * (i * 4 + n) + 8, d2);
}
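
// Scalar equivalent of the helper above: for each pixel,
//   dst[x + y * BPS] = clip(left[y] + top[x] - left[-1], 0, 255),
// where 'outer' broadcasts left[y], 'inner' holds top[0..15] and 'a' holds
// left[-1].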

static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, const uint8_t* left,
                                        const uint8_t* top) {
  int i;
  uint16x8_t a;
  uint8x8x2_t inner;

  if (left == NULL) {
    // True motion without left samples (hence: with default 129 value) is
    // equivalent to VE prediction where you just copy the top samples.
    // Note that if top samples are not available, the default value is then
    // 129, and not 127 as in the VerticalPred case.
    if (top != NULL) {
      VerticalPred16_NEON(dst, top);
    } else {
      Fill_NEON(dst, 129);
    }
    return;
  }

  // left is not NULL.
  if (top == NULL) {
    HorizontalPred16_NEON(dst, left);
    return;
  }

  // Neither left nor top is NULL.
  a = vdupq_n_u16(left[-1]);
  inner = vld1_u8_x2(top);

  for (i = 0; i < 4; i++) {
    const uint8x8x4_t outer = vld4_dup_u8(&left[i * 4]);

    TrueMotionHelper_NEON(dst, outer.val[0], inner, a, i, 0);
    TrueMotionHelper_NEON(dst, outer.val[1], inner, a, i, 1);
    TrueMotionHelper_NEON(dst, outer.val[2], inner, a, i, 2);
    TrueMotionHelper_NEON(dst, outer.val[3], inner, a, i, 3);
  }
}

static void Intra16Preds_NEON(uint8_t* WEBP_RESTRICT dst,
                              const uint8_t* WEBP_RESTRICT left,
                              const uint8_t* WEBP_RESTRICT top) {
  DCMode_NEON(I16DC16 + dst, left, top);
  VerticalPred16_NEON(I16VE16 + dst, top);
  HorizontalPred16_NEON(I16HE16 + dst, left);
  TrueMotion_NEON(I16TM16 + dst, left, top);
}

#endif  // WEBP_AARCH64

//------------------------------------------------------------------------------
// Entry point

extern void VP8EncDspInitNEON(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
  VP8ITransform = ITransform_NEON;
  VP8FTransform = FTransform_NEON;

  VP8FTransformWHT = FTransformWHT_NEON;

  VP8TDisto4x4 = Disto4x4_NEON;
  VP8TDisto16x16 = Disto16x16_NEON;
  VP8CollectHistogram = CollectHistogram_NEON;

  VP8SSE16x16 = SSE16x16_NEON;
  VP8SSE16x8 = SSE16x8_NEON;
  VP8SSE8x8 = SSE8x8_NEON;
  VP8SSE4x4 = SSE4x4_NEON;

#if WEBP_AARCH64
#if BPS == 32
  VP8EncPredLuma4 = Intra4Preds_NEON;
#endif
  VP8EncPredLuma16 = Intra16Preds_NEON;
#endif

#if !defined(WORK_AROUND_GCC)
  VP8EncQuantizeBlock = QuantizeBlock_NEON;
  VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
  VP8EncQuantizeBlockWHT = QuantizeBlock_NEON;
#endif
}

#else  // !WEBP_USE_NEON

WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)

#endif  // WEBP_USE_NEON