/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"
#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1, throw away the even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even pixels into v0, odd into v1
    MEMACCESS(0)
    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
    MEMACCESS(1)
    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1"              // Clobber List
  );
}
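
// A scalar sketch (illustration only; ScaleRowDown2_C_Sketch is a
// hypothetical name, not part of libyuv) of what the NEON loop above
// computes: keep the odd pixel of each horizontal pair.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src_ptr[2 * i + 1];  // matches the st1 of v1 (odd pixels) above.
  }
}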

// Read 32x1, average down, and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
    "uaddlp     v1.8h, v1.16b                  \n"
    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
    "rshrn2     v0.16b, v1.8h, #1              \n"
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1"              // Clobber List
  );
}

// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load row 1 and post inc
    MEMACCESS(1)
    "ld1        {v2.16b,v3.16b}, [%1], #32     \n"  // load row 2 and post inc
    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
    "uaddlp     v1.8h, v1.16b                  \n"
    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
    "uadalp     v1.8h, v3.16b                  \n"
    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
    "rshrn2     v0.16b, v1.8h, #2              \n"
    MEMACCESS(2)
    "st1        {v0.16b}, [%2], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "v0", "v1", "v2", "v3"  // Clobber List
  );
}
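
// Scalar sketch (illustration only; hypothetical name) of the 2x2 box filter
// above: average four neighbors with rounding, as uaddlp/uadalp + rshrn #2 do.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst,
                                      int dst_width) {
  const uint8* s = src_ptr;               // row 1
  const uint8* t = src_ptr + src_stride;  // row 2
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = (uint8)((s[2 * i] + s[2 * i + 1] +
                      t[2 * i] + t[2 * i + 1] + 2) >> 2);
  }
}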

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1        {v2.8b}, [%1], #8              \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load up 16x4
    MEMACCESS(3)
    "ld1        {v1.16b}, [%2], #16            \n"
    MEMACCESS(4)
    "ld1        {v2.16b}, [%3], #16            \n"
    MEMACCESS(5)
    "ld1        {v3.16b}, [%4], #16            \n"
    "subs       %w5, %w5, #4                   \n"
    "uaddlp     v0.8h, v0.16b                  \n"
    "uadalp     v0.8h, v1.16b                  \n"
    "uadalp     v0.8h, v2.16b                  \n"
    "uadalp     v0.8h, v3.16b                  \n"
    "addp       v0.8h, v0.8h, v0.8h            \n"
    "rshrn      v0.8b, v0.8h, #4               \n"  // divide by 16 w/rounding
    MEMACCESS(1)
    "st1        {v0.s}[0], [%1], #4            \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_ptr1),  // %2
    "+r"(src_ptr2),  // %3
    "+r"(src_ptr3),  // %4
    "+r"(dst_width)  // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
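
// Scalar sketch (illustration only; hypothetical name) of the 4x4 box filter
// above: sum each 4x4 block and divide by 16 with rounding (rshrn #4).
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride, uint8* dst_ptr,
                                      int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    int sum = 0;
    for (int y = 0; y < 4; ++y) {
      for (int x = 0; x < 4; ++x) {
        sum += src_ptr[y * src_stride + 4 * i + x];
      }
    }
    dst_ptr[i] = (uint8)((sum + 8) >> 4);
  }
}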

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
    "subs      %w2, %w2, #24                           \n"
    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}
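
// Scalar sketch (illustration only; hypothetical name): the point-sampled
// 4->3 above keeps pixels 0, 1 and 3 of each group of 4, since the orr
// copies v3 over v2 before the st3.
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int i = 0; i < dst_width / 3; ++i) {
    dst_ptr[3 * i + 0] = src_ptr[4 * i + 0];
    dst_ptr[3 * i + 1] = src_ptr[4 * i + 1];
    dst_ptr[3 * i + 2] = src_ptr[4 * i + 3];
  }
}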

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"
    "add       %3, %3, %0                              \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
    "subs      %w2, %w2, #24                           \n"

    // filter src line 0 with src line 1
    // expand bytes to shorts to make room
    // when adding lines together
    "ushll     v16.8h, v4.8b, #0                       \n"
    "ushll     v17.8h, v5.8b, #0                       \n"
    "ushll     v18.8h, v6.8b, #0                       \n"
    "ushll     v19.8h, v7.8b, #0                       \n"

    // 3 * line_0 + line_1
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "umlal     v17.8h, v1.8b, v20.8b                   \n"
    "umlal     v18.8h, v2.8b, v20.8b                   \n"
    "umlal     v19.8h, v3.8b, v20.8b                   \n"

    // (3 * line_0 + line_1) >> 2
    "uqrshrn   v0.8b, v16.8h, #2                       \n"
    "uqrshrn   v1.8b, v17.8h, #2                       \n"
    "uqrshrn   v2.8b, v18.8h, #2                       \n"
    "uqrshrn   v3.8b, v19.8h, #2                       \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "ushll     v16.8h, v1.8b, #0                       \n"
    "umlal     v16.8h, v0.8b, v20.8b                   \n"
    "uqrshrn   v0.8b, v16.8h, #2                       \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "ushll     v16.8h, v2.8b, #0                       \n"
    "umlal     v16.8h, v3.8b, v20.8b                   \n"
    "uqrshrn   v2.8b, v16.8h, #2                       \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"

    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
    "v20", "memory", "cc"
  );
}

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movi      v20.8b, #3                              \n"
    "add       %3, %3, %0                              \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
    "subs      %w2, %w2, #24                           \n"
    // average src line 0 with src line 1
    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
    "urhadd    v3.8b, v3.8b, v7.8b                     \n"

    // a0 = (src[0] * 3 + src[1] * 1) >> 2
    "ushll     v4.8h, v1.8b, #0                        \n"
    "umlal     v4.8h, v0.8b, v20.8b                    \n"
    "uqrshrn   v0.8b, v4.8h, #2                        \n"

    // a1 = (src[1] * 1 + src[2] * 1) >> 1
    "urhadd    v1.8b, v1.8b, v2.8b                     \n"

    // a2 = (src[2] * 1 + src[3] * 3) >> 2
    "ushll     v4.8h, v2.8b, #0                        \n"
    "umlal     v4.8h, v3.8b, v20.8b                    \n"
    "uqrshrn   v2.8b, v4.8h, #2                        \n"

    MEMACCESS(1)
    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
  );
}
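
// Scalar sketch (illustration only; hypothetical name) of the 4->3 weighting
// shared by the two box variants above, applied to one blended group p[0..3]:
static void ScaleRowDown34_Weights_Sketch(const uint8 p[4], uint8 a[3]) {
  a[0] = (uint8)((p[0] * 3 + p[1] + 2) >> 2);  // uqrshrn #2 rounds by +2.
  a[1] = (uint8)((p[1] + p[2] + 1) >> 1);      // urhadd rounds by +1.
  a[2] = (uint8)((p[2] + p[3] * 3 + 2) >> 2);
}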

static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

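// The constants above feed SQRDMULH, which computes (2 * a * b + 0x8000) >> 16
// with saturation. The doubling is why dividing by 6 uses 65536 / 12 and
// dividing by 9 uses 65536 / 18. A scalar sketch of the trick (illustration
// only; hypothetical name, ignores saturation):
static inline int ScaleRowDown38_Div_Sketch(int sum, int mult) {
  return (2 * sum * mult + 0x8000) >> 16;  // e.g. mult = 65536 / 12 -> sum / 6
}
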
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    MEMACCESS(3)
    "ld1       {v3.16b}, [%3]                          \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
    "subs      %w2, %w2, #12                           \n"
    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
    MEMACCESS(1)
    "st1       {v2.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v2.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile (
    MEMACCESS(5)
    "ld1       {v29.8h}, [%5]                          \n"
    MEMACCESS(6)
    "ld1       {v30.16b}, [%6]                         \n"
    MEMACCESS(7)
    "ld1       {v31.8h}, [%7]                          \n"
    "add       %2, %2, %0                              \n"
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
    MEMACCESS(4)
    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
    "subs      %w4, %w4, #12                           \n"

    // Shuffle the input data around to align it so
    //  adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v20.8b, v0.8b, v1.8b                    \n"
    "trn2      v21.8b, v0.8b, v1.8b                    \n"
    "trn1      v22.8b, v4.8b, v5.8b                    \n"
    "trn2      v23.8b, v4.8b, v5.8b                    \n"
    "trn1      v24.8b, v16.8b, v17.8b                  \n"
    "trn2      v25.8b, v16.8b, v17.8b                  \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"
    "trn1      v16.8b, v18.8b, v19.8b                  \n"
    "trn2      v17.8b, v18.8b, v19.8b                  \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v20.4h, v20.8b                          \n"
    "uaddlp    v21.4h, v21.8b                          \n"
    "uaddlp    v22.4h, v22.8b                          \n"
    "uaddlp    v23.4h, v23.8b                          \n"
    "uaddlp    v24.4h, v24.8b                          \n"
    "uaddlp    v25.4h, v25.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"
    "uaddlp    v17.4h, v17.8b                          \n"

    // combine source lines
    "add       v20.4h, v20.4h, v22.4h                  \n"
    "add       v21.4h, v21.4h, v23.4h                  \n"
    "add       v20.4h, v20.4h, v24.4h                  \n"
    "add       v21.4h, v21.4h, v25.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"
    "add       v2.4h, v2.4h, v17.4h                    \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
    "xtn       v2.8b, v2.8h                            \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "ushll     v16.8h, v16.8b, #0                      \n"
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // combine source lines
    "add       v0.8h, v0.8h, v16.8h                    \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v20.8h, v20.8h, v0.8h                   \n"
    "add       v21.8h, v21.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(tmp_src_stride),   // %2
    "+r"(src_ptr1),         // %3
    "+r"(dst_width)         // %4
  : "r"(&kMult38_Div6),     // %5
    "r"(&kShuf38_2),        // %6
    "r"(&kMult38_Div9)      // %7
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
    "v30", "v31", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile (
    MEMACCESS(4)
    "ld1       {v30.8h}, [%4]                          \n"
    MEMACCESS(5)
    "ld1       {v31.16b}, [%5]                         \n"
    "add       %2, %2, %0                              \n"
  "1:                                                  \n"

    // 00 40 01 41 02 42 03 43
    // 10 50 11 51 12 52 13 53
    // 20 60 21 61 22 62 23 63
    // 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
    MEMACCESS(3)
    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
    "subs      %w3, %w3, #12                           \n"

    // Shuffle the input data around to align it so
    //  adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // 00 10 01 11 02 12 03 13
    // 40 50 41 51 42 52 43 53
    "trn1      v16.8b, v0.8b, v1.8b                    \n"
    "trn2      v17.8b, v0.8b, v1.8b                    \n"
    "trn1      v18.8b, v4.8b, v5.8b                    \n"
    "trn2      v19.8b, v4.8b, v5.8b                    \n"

    // 20 30 21 31 22 32 23 33
    // 60 70 61 71 62 72 63 73
    "trn1      v0.8b, v2.8b, v3.8b                     \n"
    "trn2      v1.8b, v2.8b, v3.8b                     \n"
    "trn1      v4.8b, v6.8b, v7.8b                     \n"
    "trn2      v5.8b, v6.8b, v7.8b                     \n"

    // 00+10 01+11 02+12 03+13
    // 40+50 41+51 42+52 43+53
    "uaddlp    v16.4h, v16.8b                          \n"
    "uaddlp    v17.4h, v17.8b                          \n"
    "uaddlp    v18.4h, v18.8b                          \n"
    "uaddlp    v19.4h, v19.8b                          \n"

    // 60+70 61+71 62+72 63+73
    "uaddlp    v1.4h, v1.8b                            \n"
    "uaddlp    v5.4h, v5.8b                            \n"

    // combine source lines
    "add       v16.4h, v16.4h, v18.4h                  \n"
    "add       v17.4h, v17.4h, v19.4h                  \n"
    "add       v2.4h, v1.4h, v5.4h                     \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "uqrshrn   v2.8b, v2.8h, #2                        \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

    // combine source lines
    "uaddl     v0.8h, v0.8b, v4.8b                     \n"

    // xx 20 xx 21 xx 22 xx 23
    // xx 30 xx 31 xx 32 xx 33
    "trn1      v1.8h, v0.8h, v0.8h                     \n"
    "trn2      v4.8h, v0.8h, v0.8h                     \n"
    "xtn       v0.4h, v1.4s                            \n"
    "xtn       v4.4h, v4.4s                            \n"

    // 0+1+2, 3+4+5
    "add       v16.8h, v16.8h, v0.8h                   \n"
    "add       v17.8h, v17.8h, v4.8h                   \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

    MEMACCESS(1)
    "st1       {v3.8b}, [%1], #8                       \n"
    MEMACCESS(1)
    "st1       {v3.s}[2], [%1], #4                     \n"
    "b.gt      1b                                      \n"
  : "+r"(src_ptr),         // %0
    "+r"(dst_ptr),         // %1
    "+r"(tmp_src_stride),  // %2
    "+r"(dst_width)        // %3
  : "r"(&kMult38_Div6),    // %4
    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v30", "v31", "memory", "cc"
  );
}

void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  const uint8* src_tmp = NULL;
  asm volatile (
  "1:                                          \n"
    "mov       %0, %1                          \n"
    "mov       w12, %w5                        \n"
    "eor       v2.16b, v2.16b, v2.16b          \n"
    "eor       v3.16b, v3.16b, v3.16b          \n"
  "2:                                          \n"
    // load 16 pixels into v0
    MEMACCESS(0)
    "ld1       {v0.16b}, [%0], %3              \n"
    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
    "uaddw     v2.8h, v2.8h, v0.8b             \n"
    "subs      w12, w12, #1                    \n"
    "b.gt      2b                              \n"
    MEMACCESS(2)
    "st1       {v2.8h, v3.8h}, [%2], #32       \n"  // store pixels
    "add       %1, %1, #16                     \n"
    "subs      %w4, %w4, #16                   \n"  // 16 processed per loop
    "b.gt      1b                              \n"
  : "+r"(src_tmp),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_ptr),          // %2
    "+r"(src_stride),       // %3
    "+r"(src_width),        // %4
    "+r"(src_height)        // %5
  :
  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
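
// Scalar sketch (illustration only; hypothetical name): accumulate
// src_height rows of bytes into 16-bit column sums, as the uaddw/uaddw2
// inner loop above does.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = sum;
  }
}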

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping.
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5                     \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "ld2        {v4.b, v5.b}["#n"], [%6]       \n"

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov       v6.16b, v1.16b                  \n"
    "mov       v7.16b, v2.16b                  \n"
    "uzp1      v6.8h, v6.8h, v7.8h             \n"
    "ushll     v4.8h, v4.8b, #0                \n"
    "ushll     v5.8h, v5.8b, #0                \n"
    "ssubl     v16.4s, v5.4h, v4.4h            \n"
    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
    "ushll     v7.4s, v6.4h, #0                \n"
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"
    "mul       v17.4s, v17.4s, v6.4s           \n"
    "shrn      v6.4h, v16.4s, #16              \n"
    "shrn2     v6.8h, v17.4s, #16              \n"
    "add       v4.8h, v4.8h, v6.8h             \n"
    "xtn       v4.8b, v4.8h                    \n"

    MEMACCESS(0)
    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
    "add       v1.4s, v1.4s, v0.4s             \n"
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
    "b.gt      1b                              \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}
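
// Scalar sketch (illustration only; hypothetical name) of the 16.16
// fixed-point stepping above: the high 16 bits of x index the source and
// the low 16 bits are the interpolation fraction.
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[i] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
    x += dx;
  }
}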

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile (
    "cmp          %w4, #0                      \n"
    "b.eq         100f                         \n"
    "add          %2, %2, %1                   \n"
    "cmp          %w4, #64                     \n"
    "b.eq         75f                          \n"
    "cmp          %w4, #128                    \n"
    "b.eq         50f                          \n"
    "cmp          %w4, #192                    \n"
    "b.eq         25f                          \n"

    "dup          v5.8b, %w4                   \n"
    "dup          v4.8b, %w5                   \n"
    // General purpose row blend.
  "1:                                          \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "umull        v6.8h, v0.8b, v4.8b          \n"
    "umull2       v7.8h, v0.16b, v4.16b        \n"
    "umlal        v6.8h, v1.8b, v5.8b          \n"
    "umlal2       v7.8h, v1.16b, v5.16b        \n"
    "rshrn        v0.8b, v6.8h, #8             \n"
    "rshrn2       v0.16b, v7.8h, #8            \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v1.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "ld1          {v1.16b}, [%1], #16          \n"
    MEMACCESS(2)
    "ld1          {v0.16b}, [%2], #16          \n"
    "subs         %w3, %w3, #16                \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    "urhadd       v0.16b, v0.16b, v1.16b       \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
    "ld1          {v0.16b}, [%1], #16          \n"
    "subs         %w3, %w3, #16                \n"
    MEMACCESS(0)
    "st1          {v0.16b}, [%0], #16          \n"
    "b.gt         100b                         \n"

  "99:                                         \n"
    MEMACCESS(0)
    "st1          {v0.b}[15], [%0]             \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction),// %4
    "+r"(y_fraction)        // %5
  :
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
  );
}
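
// Scalar sketch (illustration only; hypothetical name) of the general blend
// path above: dst = (row0 * (256 - f) + row1 * f + 128) >> 8. The f = 0, 64,
// 128 and 192 special cases above are faster routes to (approximately) the
// same result.
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  int f = source_y_fraction;
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
  }
}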

void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
  "1:                                          \n"
    // load even ARGB pixels into v0/v2, odd into v1/v3
    MEMACCESS(0)
    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
    MEMACCESS(0)
    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
    MEMACCESS(1)
    "st1        {v3.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    // load 8 ARGB pixels.
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #1               \n"
    "rshrn      v2.8b, v2.8h, #1               \n"
    "rshrn      v3.8b, v3.8h, #1               \n"
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 8 ARGB pixels.
    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
    "rshrn      v1.8b, v1.8h, #2               \n"
    "rshrn      v2.8b, v2.8h, #2               \n"
    "rshrn      v3.8b, v3.8h, #2               \n"
    MEMACCESS(2)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
    "b.gt       1b                             \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.s}[0], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[1], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[2], [%0], %3            \n"
    MEMACCESS(0)
    "ld1        {v0.s}[3], [%0], %3            \n"
    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
  : "r"((int64)(src_stepx * 4)) // %3
  : "memory", "cc", "v0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in the future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "ld1        {v1.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v2.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v3.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v4.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v5.8b}, [%1], %4              \n"
    MEMACCESS(0)
    "ld1        {v6.8b}, [%0], %4              \n"
    MEMACCESS(1)
    "ld1        {v7.8b}, [%1], %4              \n"
    "uaddl      v0.8h, v0.8b, v1.8b            \n"
    "uaddl      v2.8h, v2.8b, v3.8b            \n"
    "uaddl      v4.8h, v4.8b, v5.8b            \n"
    "uaddl      v6.8h, v6.8b, v7.8b            \n"
    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
    "mov        v0.d[1], v2.d[0]               \n"
    "mov        v2.d[0], v16.d[1]              \n"
    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
    "mov        v4.d[1], v6.d[0]               \n"
    "mov        v6.d[0], v16.d[1]              \n"
    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "st1        {v0.16b}, [%2], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
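
// Scalar sketch (illustration only; hypothetical name): average each 2x2
// block of ARGB pixels per channel, stepping src_stepx source pixels between
// blocks, as the load/shuffle sequence above does.
static void ScaleARGBRowDownEvenBox_C_Sketch(const uint8* src_argb,
                                             ptrdiff_t src_stride,
                                             int src_stepx, uint8* dst_argb,
                                             int dst_width) {
  for (int i = 0; i < dst_width; ++i) {
    const uint8* s = src_argb + i * src_stepx * 4;  // top-left pixel of block
    const uint8* t = s + src_stride;                // pixel below it
    for (int c = 0; c < 4; ++c) {
      dst_argb[4 * i + c] =
          (uint8)((s[c] + s[4 + c] + t[c] + t[4 + c] + 2) >> 2);
    }
  }
}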

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping.
#define LOAD1_DATA32_LANE(vn, n)                               \
    "lsr        %5, %3, #16                    \n"             \
    "add        %6, %1, %5, lsl #2             \n"             \
    "add        %3, %3, %4                     \n"             \
    MEMACCESS(6)                                               \
    "ld1        {"#vn".s}["#n"], [%6]          \n"

void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  int64 tmp64 = 0;
  asm volatile (
  "1:                                          \n"
    LOAD1_DATA32_LANE(v0, 0)
    LOAD1_DATA32_LANE(v0, 1)
    LOAD1_DATA32_LANE(v0, 2)
    LOAD1_DATA32_LANE(v0, 3)
    LOAD1_DATA32_LANE(v1, 0)
    LOAD1_DATA32_LANE(v1, 1)
    LOAD1_DATA32_LANE(v1, 2)
    LOAD1_DATA32_LANE(v1, 3)

    MEMACCESS(0)
    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp64),            // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1"
  );
}
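
// Scalar sketch (illustration only; hypothetical name): point-sample whole
// ARGB pixels at 16.16 fixed-point positions, as the lane loads above do.
static void ScaleARGBCols_C_Sketch(uint8* dst_argb, const uint8* src_argb,
                                   int dst_width, int x, int dx) {
  uint32* dst = (uint32*)(dst_argb);
  const uint32* src = (const uint32*)(src_argb);
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // copy one whole ARGB pixel.
    x += dx;
  }
}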

#undef LOAD1_DATA32_LANE

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
    "lsr        %5, %3, #16                           \n"      \
    "add        %6, %1, %5, lsl #2                    \n"      \
    "add        %3, %3, %4                            \n"      \
    MEMACCESS(6)                                               \
    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"

void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
  int64 x64 = (int64) x;
  int64 dx64 = (int64) dx;
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn       v2.4h, v5.4s, #9               \n"
    "and        v2.8b, v2.8b, v4.8b            \n"
    "dup        v16.8b, v2.b[0]                \n"
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"
    "shrn2      v0.16b, v17.8h, #7             \n"

    MEMACCESS(0)
    "st1        {v0.4s}, [%0], #16             \n"  // store pixels
    "add        v5.4s, v5.4s, v6.4s            \n"
    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width64),      // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
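
// Scalar sketch (illustration only; hypothetical name) of the 7-bit blend
// above: f is the top 7 bits of the 16-bit fraction (shrn #9, then & 0x7f),
// and each channel blends as (a * (127 - f) + b * f) >> 7.
static void ScaleARGBFilterCols_C_Sketch(uint8* dst_argb,
                                         const uint8* src_argb, int dst_width,
                                         int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;
    const uint8* a = src_argb + xi * 4;
    const uint8* b = a + 4;
    for (int c = 0; c < 4; ++c) {
      dst_argb[4 * i + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}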

#undef LOAD2_DATA32_LANE

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif