• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale.h"
13 #include "libyuv/scale_row.h"
14 
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19 
20 // This module is for GCC Neon armv8 64 bit.
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22 
23 // Read 32x1 throw away even pixels, and write 16x1.
ScaleRowDown2_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)24 void ScaleRowDown2_NEON(const uint8* src_ptr,
25                         ptrdiff_t src_stride,
26                         uint8* dst,
27                         int dst_width) {
28   (void)src_stride;
29   asm volatile (
30   "1:                                          \n"
31     // load even pixels into v0, odd into v1
32     MEMACCESS(0)
33     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
34     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
35     MEMACCESS(1)
36     "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
37     "b.gt       1b                             \n"
38   : "+r"(src_ptr),          // %0
39     "+r"(dst),              // %1
40     "+r"(dst_width)         // %2
41   :
42   : "v0", "v1"              // Clobber List
43   );
44 }
45 
46 // Read 32x1 average down and write 16x1.
ScaleRowDown2Linear_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)47 void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
48                               ptrdiff_t src_stride,
49                               uint8* dst,
50                               int dst_width) {
51   (void)src_stride;
52   asm volatile (
53   "1:                                          \n"
54     MEMACCESS(0)
55     "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
56     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
57     "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
58     "uaddlp     v1.8h, v1.16b                  \n"
59     "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
60     "rshrn2     v0.16b, v1.8h, #1              \n"
61     MEMACCESS(1)
62     "st1        {v0.16b}, [%1], #16            \n"
63     "b.gt       1b                             \n"
64   : "+r"(src_ptr),          // %0
65     "+r"(dst),              // %1
66     "+r"(dst_width)         // %2
67   :
68   : "v0", "v1"     // Clobber List
69   );
70 }
71 
72 // Read 32x2 average down and write 16x1.
ScaleRowDown2Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)73 void ScaleRowDown2Box_NEON(const uint8* src_ptr,
74                            ptrdiff_t src_stride,
75                            uint8* dst,
76                            int dst_width) {
77   asm volatile (
78     // change the stride to row 2 pointer
79     "add        %1, %1, %0                     \n"
80   "1:                                          \n"
81     MEMACCESS(0)
82     "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
83     MEMACCESS(1)
84     "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
85     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
86     "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
87     "uaddlp     v1.8h, v1.16b                  \n"
88     "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
89     "uadalp     v1.8h, v3.16b                  \n"
90     "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
91     "rshrn2     v0.16b, v1.8h, #2              \n"
92     MEMACCESS(2)
93     "st1        {v0.16b}, [%2], #16            \n"
94     "b.gt       1b                             \n"
95   : "+r"(src_ptr),          // %0
96     "+r"(src_stride),       // %1
97     "+r"(dst),              // %2
98     "+r"(dst_width)         // %3
99   :
100   : "v0", "v1", "v2", "v3"     // Clobber List
101   );
102 }
103 
ScaleRowDown4_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)104 void ScaleRowDown4_NEON(const uint8* src_ptr,
105                         ptrdiff_t src_stride,
106                         uint8* dst_ptr,
107                         int dst_width) {
108   (void)src_stride;
109   asm volatile (
110   "1:                                          \n"
111     MEMACCESS(0)
112     "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
113     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
114     MEMACCESS(1)
115     "st1     {v2.8b}, [%1], #8                 \n"
116     "b.gt       1b                             \n"
117   : "+r"(src_ptr),          // %0
118     "+r"(dst_ptr),          // %1
119     "+r"(dst_width)         // %2
120   :
121   : "v0", "v1", "v2", "v3", "memory", "cc"
122   );
123 }
124 
ScaleRowDown4Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)125 void ScaleRowDown4Box_NEON(const uint8* src_ptr,
126                            ptrdiff_t src_stride,
127                            uint8* dst_ptr,
128                            int dst_width) {
129   const uint8* src_ptr1 = src_ptr + src_stride;
130   const uint8* src_ptr2 = src_ptr + src_stride * 2;
131   const uint8* src_ptr3 = src_ptr + src_stride * 3;
132   asm volatile (
133   "1:                                          \n"
134     MEMACCESS(0)
135     "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
136     MEMACCESS(3)
137     "ld1     {v1.16b}, [%2], #16               \n"
138     MEMACCESS(4)
139     "ld1     {v2.16b}, [%3], #16               \n"
140     MEMACCESS(5)
141     "ld1     {v3.16b}, [%4], #16               \n"
142     "subs    %w5, %w5, #4                      \n"
143     "uaddlp  v0.8h, v0.16b                     \n"
144     "uadalp  v0.8h, v1.16b                     \n"
145     "uadalp  v0.8h, v2.16b                     \n"
146     "uadalp  v0.8h, v3.16b                     \n"
147     "addp    v0.8h, v0.8h, v0.8h               \n"
148     "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
149     MEMACCESS(1)
150     "st1    {v0.s}[0], [%1], #4                \n"
151     "b.gt       1b                             \n"
152   : "+r"(src_ptr),   // %0
153     "+r"(dst_ptr),   // %1
154     "+r"(src_ptr1),  // %2
155     "+r"(src_ptr2),  // %3
156     "+r"(src_ptr3),  // %4
157     "+r"(dst_width)  // %5
158   :
159   : "v0", "v1", "v2", "v3", "memory", "cc"
160   );
161 }
162 
163 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
164 // to load up the every 4th pixel into a 4 different registers.
165 // Point samples 32 pixels to 24 pixels.
ScaleRowDown34_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)166 void ScaleRowDown34_NEON(const uint8* src_ptr,
167                          ptrdiff_t src_stride,
168                          uint8* dst_ptr,
169                          int dst_width) {
170   (void)src_stride;
171   asm volatile (
172   "1:                                                  \n"
173     MEMACCESS(0)
174     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
175     "subs      %w2, %w2, #24                           \n"
176     "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
177     MEMACCESS(1)
178     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
179     "b.gt      1b                                      \n"
180   : "+r"(src_ptr),          // %0
181     "+r"(dst_ptr),          // %1
182     "+r"(dst_width)         // %2
183   :
184   : "v0", "v1", "v2", "v3", "memory", "cc"
185   );
186 }
187 
ScaleRowDown34_0_Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)188 void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
189                                ptrdiff_t src_stride,
190                                uint8* dst_ptr,
191                                int dst_width) {
192   asm volatile (
193     "movi      v20.8b, #3                              \n"
194     "add       %3, %3, %0                              \n"
195   "1:                                                  \n"
196     MEMACCESS(0)
197     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
198     MEMACCESS(3)
199     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
200     "subs         %w2, %w2, #24                        \n"
201 
202     // filter src line 0 with src line 1
203     // expand chars to shorts to allow for room
204     // when adding lines together
205     "ushll     v16.8h, v4.8b, #0                       \n"
206     "ushll     v17.8h, v5.8b, #0                       \n"
207     "ushll     v18.8h, v6.8b, #0                       \n"
208     "ushll     v19.8h, v7.8b, #0                       \n"
209 
210     // 3 * line_0 + line_1
211     "umlal     v16.8h, v0.8b, v20.8b                   \n"
212     "umlal     v17.8h, v1.8b, v20.8b                   \n"
213     "umlal     v18.8h, v2.8b, v20.8b                   \n"
214     "umlal     v19.8h, v3.8b, v20.8b                   \n"
215 
216     // (3 * line_0 + line_1) >> 2
217     "uqrshrn   v0.8b, v16.8h, #2                       \n"
218     "uqrshrn   v1.8b, v17.8h, #2                       \n"
219     "uqrshrn   v2.8b, v18.8h, #2                       \n"
220     "uqrshrn   v3.8b, v19.8h, #2                       \n"
221 
222     // a0 = (src[0] * 3 + s[1] * 1) >> 2
223     "ushll     v16.8h, v1.8b, #0                       \n"
224     "umlal     v16.8h, v0.8b, v20.8b                   \n"
225     "uqrshrn   v0.8b, v16.8h, #2                       \n"
226 
227     // a1 = (src[1] * 1 + s[2] * 1) >> 1
228     "urhadd    v1.8b, v1.8b, v2.8b                     \n"
229 
230     // a2 = (src[2] * 1 + s[3] * 3) >> 2
231     "ushll     v16.8h, v2.8b, #0                       \n"
232     "umlal     v16.8h, v3.8b, v20.8b                   \n"
233     "uqrshrn   v2.8b, v16.8h, #2                       \n"
234 
235     MEMACCESS(1)
236     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
237 
238     "b.gt      1b                                      \n"
239   : "+r"(src_ptr),          // %0
240     "+r"(dst_ptr),          // %1
241     "+r"(dst_width),        // %2
242     "+r"(src_stride)        // %3
243   :
244   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
245     "v20", "memory", "cc"
246   );
247 }
248 
ScaleRowDown34_1_Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)249 void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
250                                ptrdiff_t src_stride,
251                                uint8* dst_ptr,
252                                int dst_width) {
253   asm volatile (
254     "movi      v20.8b, #3                              \n"
255     "add       %3, %3, %0                              \n"
256   "1:                                                  \n"
257     MEMACCESS(0)
258     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
259     MEMACCESS(3)
260     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
261     "subs         %w2, %w2, #24                        \n"
262     // average src line 0 with src line 1
263     "urhadd    v0.8b, v0.8b, v4.8b                     \n"
264     "urhadd    v1.8b, v1.8b, v5.8b                     \n"
265     "urhadd    v2.8b, v2.8b, v6.8b                     \n"
266     "urhadd    v3.8b, v3.8b, v7.8b                     \n"
267 
268     // a0 = (src[0] * 3 + s[1] * 1) >> 2
269     "ushll     v4.8h, v1.8b, #0                        \n"
270     "umlal     v4.8h, v0.8b, v20.8b                    \n"
271     "uqrshrn   v0.8b, v4.8h, #2                        \n"
272 
273     // a1 = (src[1] * 1 + s[2] * 1) >> 1
274     "urhadd    v1.8b, v1.8b, v2.8b                     \n"
275 
276     // a2 = (src[2] * 1 + s[3] * 3) >> 2
277     "ushll     v4.8h, v2.8b, #0                        \n"
278     "umlal     v4.8h, v3.8b, v20.8b                    \n"
279     "uqrshrn   v2.8b, v4.8h, #2                        \n"
280 
281     MEMACCESS(1)
282     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
283     "b.gt      1b                                      \n"
284   : "+r"(src_ptr),          // %0
285     "+r"(dst_ptr),          // %1
286     "+r"(dst_width),        // %2
287     "+r"(src_stride)        // %3
288   :
289   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
290   );
291 }
292 
293 static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
294 static uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
295                           34, 6,  22, 35, 0,  0,  0, 0};
296 static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
297                              65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
298 static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
299                              65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
300 
301 // 32 -> 12
ScaleRowDown38_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)302 void ScaleRowDown38_NEON(const uint8* src_ptr,
303                          ptrdiff_t src_stride,
304                          uint8* dst_ptr,
305                          int dst_width) {
306   (void)src_stride;
307   asm volatile (
308     MEMACCESS(3)
309     "ld1       {v3.16b}, [%3]                          \n"
310   "1:                                                  \n"
311     MEMACCESS(0)
312     "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
313     "subs      %w2, %w2, #12                           \n"
314     "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
315     MEMACCESS(1)
316     "st1       {v2.8b}, [%1], #8                       \n"
317     MEMACCESS(1)
318     "st1       {v2.s}[2], [%1], #4                     \n"
319     "b.gt      1b                                      \n"
320   : "+r"(src_ptr),          // %0
321     "+r"(dst_ptr),          // %1
322     "+r"(dst_width)         // %2
323   : "r"(&kShuf38)           // %3
324   : "v0", "v1", "v2", "v3", "memory", "cc"
325   );
326 }
327 
328 // 32x3 -> 12x1
ScaleRowDown38_3_Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)329 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
330                                       ptrdiff_t src_stride,
331                                       uint8* dst_ptr,
332                                       int dst_width) {
333   const uint8* src_ptr1 = src_ptr + src_stride * 2;
334   ptrdiff_t tmp_src_stride = src_stride;
335 
336   asm volatile (
337     MEMACCESS(5)
338     "ld1       {v29.8h}, [%5]                          \n"
339     MEMACCESS(6)
340     "ld1       {v30.16b}, [%6]                         \n"
341     MEMACCESS(7)
342     "ld1       {v31.8h}, [%7]                          \n"
343     "add       %2, %2, %0                              \n"
344   "1:                                                  \n"
345 
346     // 00 40 01 41 02 42 03 43
347     // 10 50 11 51 12 52 13 53
348     // 20 60 21 61 22 62 23 63
349     // 30 70 31 71 32 72 33 73
350     MEMACCESS(0)
351     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
352     MEMACCESS(3)
353     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
354     MEMACCESS(4)
355     "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
356     "subs      %w4, %w4, #12                           \n"
357 
358     // Shuffle the input data around to get align the data
359     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
360     // 00 10 01 11 02 12 03 13
361     // 40 50 41 51 42 52 43 53
362     "trn1      v20.8b, v0.8b, v1.8b                    \n"
363     "trn2      v21.8b, v0.8b, v1.8b                    \n"
364     "trn1      v22.8b, v4.8b, v5.8b                    \n"
365     "trn2      v23.8b, v4.8b, v5.8b                    \n"
366     "trn1      v24.8b, v16.8b, v17.8b                  \n"
367     "trn2      v25.8b, v16.8b, v17.8b                  \n"
368 
369     // 20 30 21 31 22 32 23 33
370     // 60 70 61 71 62 72 63 73
371     "trn1      v0.8b, v2.8b, v3.8b                     \n"
372     "trn2      v1.8b, v2.8b, v3.8b                     \n"
373     "trn1      v4.8b, v6.8b, v7.8b                     \n"
374     "trn2      v5.8b, v6.8b, v7.8b                     \n"
375     "trn1      v16.8b, v18.8b, v19.8b                  \n"
376     "trn2      v17.8b, v18.8b, v19.8b                  \n"
377 
378     // 00+10 01+11 02+12 03+13
379     // 40+50 41+51 42+52 43+53
380     "uaddlp    v20.4h, v20.8b                          \n"
381     "uaddlp    v21.4h, v21.8b                          \n"
382     "uaddlp    v22.4h, v22.8b                          \n"
383     "uaddlp    v23.4h, v23.8b                          \n"
384     "uaddlp    v24.4h, v24.8b                          \n"
385     "uaddlp    v25.4h, v25.8b                          \n"
386 
387     // 60+70 61+71 62+72 63+73
388     "uaddlp    v1.4h, v1.8b                            \n"
389     "uaddlp    v5.4h, v5.8b                            \n"
390     "uaddlp    v17.4h, v17.8b                          \n"
391 
392     // combine source lines
393     "add       v20.4h, v20.4h, v22.4h                  \n"
394     "add       v21.4h, v21.4h, v23.4h                  \n"
395     "add       v20.4h, v20.4h, v24.4h                  \n"
396     "add       v21.4h, v21.4h, v25.4h                  \n"
397     "add       v2.4h, v1.4h, v5.4h                     \n"
398     "add       v2.4h, v2.4h, v17.4h                    \n"
399 
400     // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
401     //             + s[6 + st * 1] + s[7 + st * 1]
402     //             + s[6 + st * 2] + s[7 + st * 2]) / 6
403     "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
404     "xtn       v2.8b,  v2.8h                           \n"
405 
406     // Shuffle 2,3 reg around so that 2 can be added to the
407     //  0,1 reg and 3 can be added to the 4,5 reg. This
408     //  requires expanding from u8 to u16 as the 0,1 and 4,5
409     //  registers are already expanded. Then do transposes
410     //  to get aligned.
411     // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
412     "ushll     v16.8h, v16.8b, #0                      \n"
413     "uaddl     v0.8h, v0.8b, v4.8b                     \n"
414 
415     // combine source lines
416     "add       v0.8h, v0.8h, v16.8h                    \n"
417 
418     // xx 20 xx 21 xx 22 xx 23
419     // xx 30 xx 31 xx 32 xx 33
420     "trn1      v1.8h, v0.8h, v0.8h                     \n"
421     "trn2      v4.8h, v0.8h, v0.8h                     \n"
422     "xtn       v0.4h, v1.4s                            \n"
423     "xtn       v4.4h, v4.4s                            \n"
424 
425     // 0+1+2, 3+4+5
426     "add       v20.8h, v20.8h, v0.8h                   \n"
427     "add       v21.8h, v21.8h, v4.8h                   \n"
428 
429     // Need to divide, but can't downshift as the the value
430     //  isn't a power of 2. So multiply by 65536 / n
431     //  and take the upper 16 bits.
432     "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
433     "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
434 
435     // Align for table lookup, vtbl requires registers to
436     //  be adjacent
437     "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
438 
439     MEMACCESS(1)
440     "st1       {v3.8b}, [%1], #8                       \n"
441     MEMACCESS(1)
442     "st1       {v3.s}[2], [%1], #4                     \n"
443     "b.gt      1b                                      \n"
444   : "+r"(src_ptr),          // %0
445     "+r"(dst_ptr),          // %1
446     "+r"(tmp_src_stride),   // %2
447     "+r"(src_ptr1),         // %3
448     "+r"(dst_width)         // %4
449   : "r"(&kMult38_Div6),     // %5
450     "r"(&kShuf38_2),        // %6
451     "r"(&kMult38_Div9)      // %7
452   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
453     "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
454     "v30", "v31", "memory", "cc"
455   );
456 }
457 
458 // 32x2 -> 12x1
ScaleRowDown38_2_Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)459 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
460                                ptrdiff_t src_stride,
461                                uint8* dst_ptr,
462                                int dst_width) {
463   // TODO(fbarchard): use src_stride directly for clang 3.5+.
464   ptrdiff_t tmp_src_stride = src_stride;
465   asm volatile (
466     MEMACCESS(4)
467     "ld1       {v30.8h}, [%4]                          \n"
468     MEMACCESS(5)
469     "ld1       {v31.16b}, [%5]                         \n"
470     "add       %2, %2, %0                              \n"
471   "1:                                                  \n"
472 
473     // 00 40 01 41 02 42 03 43
474     // 10 50 11 51 12 52 13 53
475     // 20 60 21 61 22 62 23 63
476     // 30 70 31 71 32 72 33 73
477     MEMACCESS(0)
478     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
479     MEMACCESS(3)
480     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
481     "subs      %w3, %w3, #12                           \n"
482 
483     // Shuffle the input data around to get align the data
484     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
485     // 00 10 01 11 02 12 03 13
486     // 40 50 41 51 42 52 43 53
487     "trn1      v16.8b, v0.8b, v1.8b                    \n"
488     "trn2      v17.8b, v0.8b, v1.8b                    \n"
489     "trn1      v18.8b, v4.8b, v5.8b                    \n"
490     "trn2      v19.8b, v4.8b, v5.8b                    \n"
491 
492     // 20 30 21 31 22 32 23 33
493     // 60 70 61 71 62 72 63 73
494     "trn1      v0.8b, v2.8b, v3.8b                     \n"
495     "trn2      v1.8b, v2.8b, v3.8b                     \n"
496     "trn1      v4.8b, v6.8b, v7.8b                     \n"
497     "trn2      v5.8b, v6.8b, v7.8b                     \n"
498 
499     // 00+10 01+11 02+12 03+13
500     // 40+50 41+51 42+52 43+53
501     "uaddlp    v16.4h, v16.8b                          \n"
502     "uaddlp    v17.4h, v17.8b                          \n"
503     "uaddlp    v18.4h, v18.8b                          \n"
504     "uaddlp    v19.4h, v19.8b                          \n"
505 
506     // 60+70 61+71 62+72 63+73
507     "uaddlp    v1.4h, v1.8b                            \n"
508     "uaddlp    v5.4h, v5.8b                            \n"
509 
510     // combine source lines
511     "add       v16.4h, v16.4h, v18.4h                  \n"
512     "add       v17.4h, v17.4h, v19.4h                  \n"
513     "add       v2.4h, v1.4h, v5.4h                     \n"
514 
515     // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
516     "uqrshrn   v2.8b, v2.8h, #2                        \n"
517 
518     // Shuffle 2,3 reg around so that 2 can be added to the
519     //  0,1 reg and 3 can be added to the 4,5 reg. This
520     //  requires expanding from u8 to u16 as the 0,1 and 4,5
521     //  registers are already expanded. Then do transposes
522     //  to get aligned.
523     // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
524 
525     // combine source lines
526     "uaddl     v0.8h, v0.8b, v4.8b                     \n"
527 
528     // xx 20 xx 21 xx 22 xx 23
529     // xx 30 xx 31 xx 32 xx 33
530     "trn1      v1.8h, v0.8h, v0.8h                     \n"
531     "trn2      v4.8h, v0.8h, v0.8h                     \n"
532     "xtn       v0.4h, v1.4s                            \n"
533     "xtn       v4.4h, v4.4s                            \n"
534 
535     // 0+1+2, 3+4+5
536     "add       v16.8h, v16.8h, v0.8h                   \n"
537     "add       v17.8h, v17.8h, v4.8h                   \n"
538 
539     // Need to divide, but can't downshift as the the value
540     //  isn't a power of 2. So multiply by 65536 / n
541     //  and take the upper 16 bits.
542     "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
543     "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
544 
545     // Align for table lookup, vtbl requires registers to
546     //  be adjacent
547 
548     "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
549 
550     MEMACCESS(1)
551     "st1       {v3.8b}, [%1], #8                       \n"
552     MEMACCESS(1)
553     "st1       {v3.s}[2], [%1], #4                     \n"
554     "b.gt      1b                                      \n"
555   : "+r"(src_ptr),         // %0
556     "+r"(dst_ptr),         // %1
557     "+r"(tmp_src_stride),  // %2
558     "+r"(dst_width)        // %3
559   : "r"(&kMult38_Div6),    // %4
560     "r"(&kShuf38_2)        // %5
561   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
562     "v18", "v19", "v30", "v31", "memory", "cc"
563   );
564 }
565 
ScaleAddRows_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint16 * dst_ptr,int src_width,int src_height)566 void ScaleAddRows_NEON(const uint8* src_ptr,
567                        ptrdiff_t src_stride,
568                        uint16* dst_ptr,
569                        int src_width,
570                        int src_height) {
571   const uint8* src_tmp;
572   asm volatile (
573   "1:                                          \n"
574     "mov       %0, %1                          \n"
575     "mov       w12, %w5                        \n"
576     "eor       v2.16b, v2.16b, v2.16b          \n"
577     "eor       v3.16b, v3.16b, v3.16b          \n"
578   "2:                                          \n"
579     // load 16 pixels into q0
580     MEMACCESS(0)
581     "ld1       {v0.16b}, [%0], %3              \n"
582     "uaddw2    v3.8h, v3.8h, v0.16b            \n"
583     "uaddw     v2.8h, v2.8h, v0.8b             \n"
584     "subs      w12, w12, #1                    \n"
585     "b.gt      2b                              \n"
586     MEMACCESS(2)
587     "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
588     "add      %1, %1, #16                      \n"
589     "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
590     "b.gt     1b                               \n"
591   : "=&r"(src_tmp),    // %0
592     "+r"(src_ptr),     // %1
593     "+r"(dst_ptr),     // %2
594     "+r"(src_stride),  // %3
595     "+r"(src_width),   // %4
596     "+r"(src_height)   // %5
597   :
598   : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
599   );
600 }
601 
// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Emits the asm that fetches one pair of adjacent source bytes into lane n
// of v4 (first byte) and v5 (second byte). Operand numbers refer to the asm
// statement the macro is expanded in: %1 source base, %3 x, %4 dx, and %5/%6
// are used as scratch registers.
#define LOAD2_DATA8_LANE(n)                                                 \
  "lsr        %5, %3, #16                    \n" /* %5 = x >> 16         */ \
  "add        %6, %1, %5                     \n" /* %6 = source + %5     */ \
  "add        %3, %3, %4                     \n" /* x += dx              */ \
  MEMACCESS(6)                                                              \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n" /* load the byte pair   */
// clang-format on
612 
613 // The NEON version mimics this formula (from row_common.cc):
614 // #define BLENDER(a, b, f) (uint8)((int)(a) +
615 //    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
616 
ScaleFilterCols_NEON(uint8 * dst_ptr,const uint8 * src_ptr,int dst_width,int x,int dx)617 void ScaleFilterCols_NEON(uint8* dst_ptr,
618                           const uint8* src_ptr,
619                           int dst_width,
620                           int x,
621                           int dx) {
622   int dx_offset[4] = {0, 1, 2, 3};
623   int* tmp = dx_offset;
624   const uint8* src_tmp = src_ptr;
625   int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
626   int64 x64 = (int64)x;
627   int64 dx64 = (int64)dx;
628   asm volatile (
629     "dup        v0.4s, %w3                     \n"  // x
630     "dup        v1.4s, %w4                     \n"  // dx
631     "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
632     "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
633     "mul        v1.4s, v1.4s, v2.4s            \n"
634     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
635     "add        v1.4s, v1.4s, v0.4s            \n"
636     // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
637     "add        v2.4s, v1.4s, v3.4s            \n"
638     "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
639   "1:                                          \n"
640     LOAD2_DATA8_LANE(0)
641     LOAD2_DATA8_LANE(1)
642     LOAD2_DATA8_LANE(2)
643     LOAD2_DATA8_LANE(3)
644     LOAD2_DATA8_LANE(4)
645     LOAD2_DATA8_LANE(5)
646     LOAD2_DATA8_LANE(6)
647     LOAD2_DATA8_LANE(7)
648     "mov       v6.16b, v1.16b                  \n"
649     "mov       v7.16b, v2.16b                  \n"
650     "uzp1      v6.8h, v6.8h, v7.8h             \n"
651     "ushll     v4.8h, v4.8b, #0                \n"
652     "ushll     v5.8h, v5.8b, #0                \n"
653     "ssubl     v16.4s, v5.4h, v4.4h            \n"
654     "ssubl2    v17.4s, v5.8h, v4.8h            \n"
655     "ushll     v7.4s, v6.4h, #0                \n"
656     "ushll2    v6.4s, v6.8h, #0                \n"
657     "mul       v16.4s, v16.4s, v7.4s           \n"
658     "mul       v17.4s, v17.4s, v6.4s           \n"
659     "rshrn     v6.4h, v16.4s, #16              \n"
660     "rshrn2    v6.8h, v17.4s, #16              \n"
661     "add       v4.8h, v4.8h, v6.8h             \n"
662     "xtn       v4.8b, v4.8h                    \n"
663 
664     MEMACCESS(0)
665     "st1       {v4.8b}, [%0], #8               \n"  // store pixels
666     "add       v1.4s, v1.4s, v0.4s             \n"
667     "add       v2.4s, v2.4s, v0.4s             \n"
668     "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
669     "b.gt      1b                              \n"
670   : "+r"(dst_ptr),          // %0
671     "+r"(src_ptr),          // %1
672     "+r"(dst_width64),      // %2
673     "+r"(x64),              // %3
674     "+r"(dx64),             // %4
675     "+r"(tmp),              // %5
676     "+r"(src_tmp)           // %6
677   :
678   : "memory", "cc", "v0", "v1", "v2", "v3",
679     "v4", "v5", "v6", "v7", "v16", "v17"
680   );
681 }
682 
683 #undef LOAD2_DATA8_LANE
684 
685 // 16x2 -> 16x1
ScaleFilterRows_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)686 void ScaleFilterRows_NEON(uint8* dst_ptr,
687                           const uint8* src_ptr,
688                           ptrdiff_t src_stride,
689                           int dst_width,
690                           int source_y_fraction) {
691   int y_fraction = 256 - source_y_fraction;
692   asm volatile (
693     "cmp          %w4, #0                      \n"
694     "b.eq         100f                         \n"
695     "add          %2, %2, %1                   \n"
696     "cmp          %w4, #64                     \n"
697     "b.eq         75f                          \n"
698     "cmp          %w4, #128                    \n"
699     "b.eq         50f                          \n"
700     "cmp          %w4, #192                    \n"
701     "b.eq         25f                          \n"
702 
703     "dup          v5.8b, %w4                   \n"
704     "dup          v4.8b, %w5                   \n"
705     // General purpose row blend.
706   "1:                                          \n"
707     MEMACCESS(1)
708     "ld1          {v0.16b}, [%1], #16          \n"
709     MEMACCESS(2)
710     "ld1          {v1.16b}, [%2], #16          \n"
711     "subs         %w3, %w3, #16                \n"
712     "umull        v6.8h, v0.8b, v4.8b          \n"
713     "umull2       v7.8h, v0.16b, v4.16b        \n"
714     "umlal        v6.8h, v1.8b, v5.8b          \n"
715     "umlal2       v7.8h, v1.16b, v5.16b        \n"
716     "rshrn        v0.8b, v6.8h, #8             \n"
717     "rshrn2       v0.16b, v7.8h, #8            \n"
718     MEMACCESS(0)
719     "st1          {v0.16b}, [%0], #16          \n"
720     "b.gt         1b                           \n"
721     "b            99f                          \n"
722 
723     // Blend 25 / 75.
724   "25:                                         \n"
725     MEMACCESS(1)
726     "ld1          {v0.16b}, [%1], #16          \n"
727     MEMACCESS(2)
728     "ld1          {v1.16b}, [%2], #16          \n"
729     "subs         %w3, %w3, #16                \n"
730     "urhadd       v0.16b, v0.16b, v1.16b       \n"
731     "urhadd       v0.16b, v0.16b, v1.16b       \n"
732     MEMACCESS(0)
733     "st1          {v0.16b}, [%0], #16          \n"
734     "b.gt         25b                          \n"
735     "b            99f                          \n"
736 
737     // Blend 50 / 50.
738   "50:                                         \n"
739     MEMACCESS(1)
740     "ld1          {v0.16b}, [%1], #16          \n"
741     MEMACCESS(2)
742     "ld1          {v1.16b}, [%2], #16          \n"
743     "subs         %w3, %w3, #16                \n"
744     "urhadd       v0.16b, v0.16b, v1.16b       \n"
745     MEMACCESS(0)
746     "st1          {v0.16b}, [%0], #16          \n"
747     "b.gt         50b                          \n"
748     "b            99f                          \n"
749 
750     // Blend 75 / 25.
751   "75:                                         \n"
752     MEMACCESS(1)
753     "ld1          {v1.16b}, [%1], #16          \n"
754     MEMACCESS(2)
755     "ld1          {v0.16b}, [%2], #16          \n"
756     "subs         %w3, %w3, #16                \n"
757     "urhadd       v0.16b, v0.16b, v1.16b       \n"
758     "urhadd       v0.16b, v0.16b, v1.16b       \n"
759     MEMACCESS(0)
760     "st1          {v0.16b}, [%0], #16          \n"
761     "b.gt         75b                          \n"
762     "b            99f                          \n"
763 
764     // Blend 100 / 0 - Copy row unchanged.
765   "100:                                        \n"
766     MEMACCESS(1)
767     "ld1          {v0.16b}, [%1], #16          \n"
768     "subs         %w3, %w3, #16                \n"
769     MEMACCESS(0)
770     "st1          {v0.16b}, [%0], #16          \n"
771     "b.gt         100b                         \n"
772 
773   "99:                                         \n"
774     MEMACCESS(0)
775     "st1          {v0.b}[15], [%0]             \n"
776   : "+r"(dst_ptr),          // %0
777     "+r"(src_ptr),          // %1
778     "+r"(src_stride),       // %2
779     "+r"(dst_width),        // %3
780     "+r"(source_y_fraction),// %4
781     "+r"(y_fraction)        // %5
782   :
783   : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
784   );
785 }
786 
ScaleARGBRowDown2_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)787 void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
788                             ptrdiff_t src_stride,
789                             uint8* dst,
790                             int dst_width) {
791   (void)src_stride;
792   asm volatile (
793   "1:                                          \n"
794     // load even pixels into q0, odd into q1
795     MEMACCESS (0)
796     "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
797     MEMACCESS (0)
798     "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
799     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
800     MEMACCESS (1)
801     "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
802     MEMACCESS (1)
803     "st1        {v3.16b}, [%1], #16            \n"
804     "b.gt       1b                             \n"
805   : "+r" (src_ptr),          // %0
806     "+r" (dst),              // %1
807     "+r" (dst_width)         // %2
808   :
809   : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
810   );
811 }
812 
ScaleARGBRowDown2Linear_NEON(const uint8 * src_argb,ptrdiff_t src_stride,uint8 * dst_argb,int dst_width)813 void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
814                                   ptrdiff_t src_stride,
815                                   uint8* dst_argb,
816                                   int dst_width) {
817   (void)src_stride;
818   asm volatile (
819   "1:                                          \n"
820     MEMACCESS (0)
821     // load 8 ARGB pixels.
822     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
823     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
824     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
825     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
826     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
827     "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
828     "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
829     "rshrn      v1.8b, v1.8h, #1               \n"
830     "rshrn      v2.8b, v2.8h, #1               \n"
831     "rshrn      v3.8b, v3.8h, #1               \n"
832     MEMACCESS (1)
833     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
834     "b.gt       1b                             \n"
835   : "+r"(src_argb),         // %0
836     "+r"(dst_argb),         // %1
837     "+r"(dst_width)         // %2
838   :
839   : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
840   );
841 }
842 
ScaleARGBRowDown2Box_NEON(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst,int dst_width)843 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
844                                ptrdiff_t src_stride,
845                                uint8* dst,
846                                int dst_width) {
847   asm volatile (
848     // change the stride to row 2 pointer
849     "add        %1, %1, %0                     \n"
850   "1:                                          \n"
851     MEMACCESS (0)
852     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
853     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
854     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
855     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
856     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
857     "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
858     MEMACCESS (1)
859     "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
860     "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
861     "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
862     "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
863     "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
864     "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
865     "rshrn      v1.8b, v1.8h, #2               \n"
866     "rshrn      v2.8b, v2.8h, #2               \n"
867     "rshrn      v3.8b, v3.8h, #2               \n"
868     MEMACCESS (2)
869     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
870     "b.gt       1b                             \n"
871   : "+r" (src_ptr),          // %0
872     "+r" (src_stride),       // %1
873     "+r" (dst),              // %2
874     "+r" (dst_width)         // %3
875   :
876   : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
877   );
878 }
879 
880 // Reads 4 pixels at a time.
881 // Alignment requirement: src_argb 4 byte aligned.
ScaleARGBRowDownEven_NEON(const uint8 * src_argb,ptrdiff_t src_stride,int src_stepx,uint8 * dst_argb,int dst_width)882 void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
883                                ptrdiff_t src_stride,
884                                int src_stepx,
885                                uint8* dst_argb,
886                                int dst_width) {
887   (void)src_stride;
888   asm volatile (
889   "1:                                          \n"
890     MEMACCESS(0)
891     "ld1        {v0.s}[0], [%0], %3            \n"
892     MEMACCESS(0)
893     "ld1        {v0.s}[1], [%0], %3            \n"
894     MEMACCESS(0)
895     "ld1        {v0.s}[2], [%0], %3            \n"
896     MEMACCESS(0)
897     "ld1        {v0.s}[3], [%0], %3            \n"
898     "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
899     MEMACCESS(1)
900     "st1        {v0.16b}, [%1], #16            \n"
901     "b.gt       1b                             \n"
902   : "+r"(src_argb),    // %0
903     "+r"(dst_argb),    // %1
904     "+r"(dst_width)    // %2
905   : "r"((int64)(src_stepx * 4)) // %3
906   : "memory", "cc", "v0"
907   );
908 }
909 
910 // Reads 4 pixels at a time.
911 // Alignment requirement: src_argb 4 byte aligned.
912 // TODO(Yang Zhang): Might be worth another optimization pass in future.
913 // It could be upgraded to 8 pixels at a time to start with.
ScaleARGBRowDownEvenBox_NEON(const uint8 * src_argb,ptrdiff_t src_stride,int src_stepx,uint8 * dst_argb,int dst_width)914 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
915                                   ptrdiff_t src_stride,
916                                   int src_stepx,
917                                   uint8* dst_argb,
918                                   int dst_width) {
919   asm volatile (
920     "add        %1, %1, %0                     \n"
921   "1:                                          \n"
922     MEMACCESS(0)
923     "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
924     MEMACCESS(1)
925     "ld1        {v1.8b}, [%1], %4              \n"
926     MEMACCESS(0)
927     "ld1        {v2.8b}, [%0], %4              \n"
928     MEMACCESS(1)
929     "ld1        {v3.8b}, [%1], %4              \n"
930     MEMACCESS(0)
931     "ld1        {v4.8b}, [%0], %4              \n"
932     MEMACCESS(1)
933     "ld1        {v5.8b}, [%1], %4              \n"
934     MEMACCESS(0)
935     "ld1        {v6.8b}, [%0], %4              \n"
936     MEMACCESS(1)
937     "ld1        {v7.8b}, [%1], %4              \n"
938     "uaddl      v0.8h, v0.8b, v1.8b            \n"
939     "uaddl      v2.8h, v2.8b, v3.8b            \n"
940     "uaddl      v4.8h, v4.8b, v5.8b            \n"
941     "uaddl      v6.8h, v6.8b, v7.8b            \n"
942     "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
943     "mov        v0.d[1], v2.d[0]               \n"
944     "mov        v2.d[0], v16.d[1]              \n"
945     "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
946     "mov        v4.d[1], v6.d[0]               \n"
947     "mov        v6.d[0], v16.d[1]              \n"
948     "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
949     "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
950     "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
951     "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
952     "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
953     MEMACCESS(2)
954     "st1     {v0.16b}, [%2], #16               \n"
955     "b.gt       1b                             \n"
956   : "+r"(src_argb),    // %0
957     "+r"(src_stride),  // %1
958     "+r"(dst_argb),    // %2
959     "+r"(dst_width)    // %3
960   : "r"((int64)(src_stepx * 4)) // %4
961   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
962   );
963 }
964 
965 // clang-format off
966 // TODO(Yang Zhang): Investigate less load instructions for
967 // the x/dx stepping
968 #define LOAD1_DATA32_LANE(vn, n)                            \
969   "lsr        %5, %3, #16                    \n"            \
970   "add        %6, %1, %5, lsl #2             \n"            \
971   "add        %3, %3, %4                     \n"            \
972   MEMACCESS(6)                                              \
973  "ld1        {" #vn ".s}[" #n "], [%6]       \n"
974 // clang-format on
975 
ScaleARGBCols_NEON(uint8 * dst_argb,const uint8 * src_argb,int dst_width,int x,int dx)976 void ScaleARGBCols_NEON(uint8* dst_argb,
977                         const uint8* src_argb,
978                         int dst_width,
979                         int x,
980                         int dx) {
981   const uint8* src_tmp = src_argb;
982   int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
983   int64 x64 = (int64)x;
984   int64 dx64 = (int64)dx;
985   int64 tmp64;
986   asm volatile (
987   "1:                                          \n"
988     LOAD1_DATA32_LANE(v0, 0)
989     LOAD1_DATA32_LANE(v0, 1)
990     LOAD1_DATA32_LANE(v0, 2)
991     LOAD1_DATA32_LANE(v0, 3)
992     LOAD1_DATA32_LANE(v1, 0)
993     LOAD1_DATA32_LANE(v1, 1)
994     LOAD1_DATA32_LANE(v1, 2)
995     LOAD1_DATA32_LANE(v1, 3)
996 
997     MEMACCESS(0)
998     "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
999     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
1000     "b.gt        1b                            \n"
1001   : "+r"(dst_argb),     // %0
1002     "+r"(src_argb),     // %1
1003     "+r"(dst_width64),  // %2
1004     "+r"(x64),          // %3
1005     "+r"(dx64),         // %4
1006     "=&r"(tmp64),       // %5
1007     "+r"(src_tmp)       // %6
1008   :
1009   : "memory", "cc", "v0", "v1"
1010   );
1011 }
1012 
1013 #undef LOAD1_DATA32_LANE
1014 
1015 // clang-format off
1016 // TODO(Yang Zhang): Investigate less load instructions for
1017 // the x/dx stepping
1018 #define LOAD2_DATA32_LANE(vn1, vn2, n)                             \
1019   "lsr        %5, %3, #16                           \n"            \
1020   "add        %6, %1, %5, lsl #2                    \n"            \
1021   "add        %3, %3, %4                            \n"            \
1022   MEMACCESS(6)                                                     \
1023   "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
1024 // clang-format on
1025 
ScaleARGBFilterCols_NEON(uint8 * dst_argb,const uint8 * src_argb,int dst_width,int x,int dx)1026 void ScaleARGBFilterCols_NEON(uint8* dst_argb,
1027                               const uint8* src_argb,
1028                               int dst_width,
1029                               int x,
1030                               int dx) {
1031   int dx_offset[4] = {0, 1, 2, 3};
1032   int* tmp = dx_offset;
1033   const uint8* src_tmp = src_argb;
1034   int64 dst_width64 = (int64)dst_width;  // Work around ios 64 bit warning.
1035   int64 x64 = (int64)x;
1036   int64 dx64 = (int64)dx;
1037   asm volatile (
1038     "dup        v0.4s, %w3                     \n"  // x
1039     "dup        v1.4s, %w4                     \n"  // dx
1040     "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
1041     "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
1042     "mul        v1.4s, v1.4s, v2.4s            \n"
1043     "movi       v3.16b, #0x7f                  \n"  // 0x7F
1044     "movi       v4.8h, #0x7f                   \n"  // 0x7F
1045     // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
1046     "add        v5.4s, v1.4s, v0.4s            \n"
1047   "1:                                          \n"
1048     // d0, d1: a
1049     // d2, d3: b
1050     LOAD2_DATA32_LANE(v0, v1, 0)
1051     LOAD2_DATA32_LANE(v0, v1, 1)
1052     LOAD2_DATA32_LANE(v0, v1, 2)
1053     LOAD2_DATA32_LANE(v0, v1, 3)
1054     "shrn       v2.4h, v5.4s, #9               \n"
1055     "and        v2.8b, v2.8b, v4.8b            \n"
1056     "dup        v16.8b, v2.b[0]                \n"
1057     "dup        v17.8b, v2.b[2]                \n"
1058     "dup        v18.8b, v2.b[4]                \n"
1059     "dup        v19.8b, v2.b[6]                \n"
1060     "ext        v2.8b, v16.8b, v17.8b, #4      \n"
1061     "ext        v17.8b, v18.8b, v19.8b, #4     \n"
1062     "ins        v2.d[1], v17.d[0]              \n"  // f
1063     "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
1064     "umull      v16.8h, v0.8b, v7.8b           \n"
1065     "umull2     v17.8h, v0.16b, v7.16b         \n"
1066     "umull      v18.8h, v1.8b, v2.8b           \n"
1067     "umull2     v19.8h, v1.16b, v2.16b         \n"
1068     "add        v16.8h, v16.8h, v18.8h         \n"
1069     "add        v17.8h, v17.8h, v19.8h         \n"
1070     "shrn       v0.8b, v16.8h, #7              \n"
1071     "shrn2      v0.16b, v17.8h, #7             \n"
1072 
1073     MEMACCESS(0)
1074     "st1     {v0.4s}, [%0], #16                \n"  // store pixels
1075     "add     v5.4s, v5.4s, v6.4s               \n"
1076     "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
1077     "b.gt    1b                                \n"
1078   : "+r"(dst_argb),         // %0
1079     "+r"(src_argb),         // %1
1080     "+r"(dst_width64),      // %2
1081     "+r"(x64),              // %3
1082     "+r"(dx64),             // %4
1083     "+r"(tmp),              // %5
1084     "+r"(src_tmp)           // %6
1085   :
1086   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
1087     "v6", "v7", "v16", "v17", "v18", "v19"
1088   );
1089 }
1090 
1091 #undef LOAD2_DATA32_LANE
1092 
1093 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1094 
1095 #ifdef __cplusplus
1096 }  // extern "C"
1097 }  // namespace libyuv
1098 #endif
1099