• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale.h"
13 #include "libyuv/scale_row.h"
14 
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19 
20 // This module is for GCC Neon armv8 64 bit.
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22 
// Read 32x1 throw away even pixels, and write 16x1.
// Point-samples: keeps the odd-indexed pixel of each horizontal pair.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;  // unused: point sampling reads a single row.
  asm volatile(
      "1:                                        \n"
      // load even pixels into v0, odd into v1
      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
      "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      // "memory": st1 writes through %1; "cc": subs sets, b.gt reads flags.
      : "memory", "cc", "v0", "v1"  // Clobber List
  );
}
43 
// Read 32x1 average down and write 16x1.
// Each output pixel is the rounded average of a horizontal pair.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;  // unused: linear filter reads a single row.
  asm volatile(
      "1:                                        \n"
      // load even pixels into v0, odd into v1
      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
      "st1        {v0.16b}, [%1], #16            \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      // "memory": st1 writes through %1; "cc": subs sets, b.gt reads flags.
      : "memory", "cc", "v0", "v1"  // Clobber List
  );
}
65 
// Read 32x2 average down and write 16x1.
// Each output pixel is the rounded average of a 2x2 box of input pixels.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                     \n"
      "1:                                        \n"
      "ld1        {v0.16b, v1.16b}, [%0], #32    \n"  // load row 1 and post inc
      "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
      "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
      "uaddlp     v1.8h, v1.16b                  \n"
      "uadalp     v0.8h, v2.16b                  \n"  // += row 2 add adjacent
      "uadalp     v1.8h, v3.16b                  \n"
      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
      "rshrn2     v0.16b, v1.8h, #2              \n"
      "st1        {v0.16b}, [%2], #16            \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      // "memory": loads/stores touch memory; "cc": subs/b.gt use flags.
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
94 
// Point-sample 1/4 width: reads 32 pixels, writes 8 (keeps index 2 of
// every group of 4, via the third lane of the ld4 de-interleave).
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;  // unused: point sampling reads a single row.
  asm volatile(
      "1:                                        \n"
      "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32  \n"  // src line 0
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "st1     {v2.8b}, [%1], #8                 \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
112 
// Box-filter 1/4 scale: each output pixel is the rounded average of a
// 4x4 block (16 bytes summed into shorts, then rshrn #4 divides by 16).
void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:                                        \n"
      "ld1     {v0.16b}, [%0], #16               \n"  // load up 16x4
      "ld1     {v1.16b}, [%2], #16               \n"
      "ld1     {v2.16b}, [%3], #16               \n"
      "ld1     {v3.16b}, [%4], #16               \n"
      "subs    %w5, %w5, #4                      \n"
      "uaddlp  v0.8h, v0.16b                     \n"
      "uadalp  v0.8h, v1.16b                     \n"
      "uadalp  v0.8h, v2.16b                     \n"
      "uadalp  v0.8h, v3.16b                     \n"
      "addp    v0.8h, v0.8h, v0.8h               \n"
      "rshrn   v0.8b, v0.8h, #4                  \n"  // divide by 16 w/rounding
      "st1    {v0.s}[0], [%1], #4                \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
144 
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;  // unused: point sampling reads a single row.
  asm volatile(
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "subs      %w2, %w2, #24                           \n"
      "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0,v1,v2
      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
166 
// 3/4 box scale, phase 0: vertically blends (3 * row0 + row1) / 4 before
// the horizontal 4->3 filter below. Reads 32x2, writes 24x1.
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi      v20.8b, #3                              \n"
      "add       %3, %3, %0                              \n"
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
      "subs         %w2, %w2, #24                        \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll     v16.8h, v4.8b, #0                       \n"
      "ushll     v17.8h, v5.8b, #0                       \n"
      "ushll     v18.8h, v6.8b, #0                       \n"
      "ushll     v19.8h, v7.8b, #0                       \n"

      // 3 * line_0 + line_1
      "umlal     v16.8h, v0.8b, v20.8b                   \n"
      "umlal     v17.8h, v1.8b, v20.8b                   \n"
      "umlal     v18.8h, v2.8b, v20.8b                   \n"
      "umlal     v19.8h, v3.8b, v20.8b                   \n"

      // (3 * line_0 + line_1) >> 2
      "uqrshrn   v0.8b, v16.8h, #2                       \n"
      "uqrshrn   v1.8b, v17.8h, #2                       \n"
      "uqrshrn   v2.8b, v18.8h, #2                       \n"
      "uqrshrn   v3.8b, v19.8h, #2                       \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll     v16.8h, v1.8b, #0                       \n"
      "umlal     v16.8h, v0.8b, v20.8b                   \n"
      "uqrshrn   v0.8b, v16.8h, #2                       \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd    v1.8b, v1.8b, v2.8b                     \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll     v16.8h, v2.8b, #0                       \n"
      "umlal     v16.8h, v3.8b, v20.8b                   \n"
      "uqrshrn   v2.8b, v16.8h, #2                       \n"

      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"

      "b.gt      1b                                      \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}
223 
// 3/4 box scale, phase 1: vertically averages the two rows (equal weight)
// before the horizontal 4->3 filter below. Reads 32x2, writes 24x1.
void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi      v20.8b, #3                              \n"
      "add       %3, %3, %0                              \n"
      "1:                                                \n"
      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
      "subs         %w2, %w2, #24                        \n"
      // average src line 0 with src line 1
      "urhadd    v0.8b, v0.8b, v4.8b                     \n"
      "urhadd    v1.8b, v1.8b, v5.8b                     \n"
      "urhadd    v2.8b, v2.8b, v6.8b                     \n"
      "urhadd    v3.8b, v3.8b, v7.8b                     \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll     v4.8h, v1.8b, #0                        \n"
      "umlal     v4.8h, v0.8b, v20.8b                    \n"
      "uqrshrn   v0.8b, v4.8h, #2                        \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd    v1.8b, v1.8b, v2.8b                     \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll     v4.8h, v2.8b, #0                        \n"
      "umlal     v4.8h, v3.8b, v20.8b                    \n"
      "uqrshrn   v2.8b, v4.8h, #2                        \n"

      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
      "b.gt      1b                                      \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}
263 
264 static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
265                               22, 24, 27, 30, 0,  0,  0,  0};
266 static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
267                                 34, 6,  22, 35, 0,  0,  0, 0};
268 static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
269                                    65536 / 12, 65536 / 12, 65536 / 12,
270                                    65536 / 12, 65536 / 12};
271 static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
272                                    65536 / 18, 65536 / 18, 65536 / 18,
273                                    65536 / 18, 65536 / 18};
274 
275 // 32 -> 12
ScaleRowDown38_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)276 void ScaleRowDown38_NEON(const uint8_t* src_ptr,
277                          ptrdiff_t src_stride,
278                          uint8_t* dst_ptr,
279                          int dst_width) {
280   (void)src_stride;
281   asm volatile(
282       "ld1       {v3.16b}, [%3]                          \n"
283       "1:                                                \n"
284       "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
285       "subs      %w2, %w2, #12                           \n"
286       "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
287       "st1       {v2.8b}, [%1], #8                       \n"
288       "st1       {v2.s}[2], [%1], #4                     \n"
289       "b.gt      1b                                      \n"
290       : "+r"(src_ptr),   // %0
291         "+r"(dst_ptr),   // %1
292         "+r"(dst_width)  // %2
293       : "r"(&kShuf38)    // %3
294       : "v0", "v1", "v2", "v3", "memory", "cc");
295 }
296 
297 // 32x3 -> 12x1
ScaleRowDown38_3_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)298 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
299                                       ptrdiff_t src_stride,
300                                       uint8_t* dst_ptr,
301                                       int dst_width) {
302   const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
303   ptrdiff_t tmp_src_stride = src_stride;
304 
305   asm volatile(
306       "ld1       {v29.8h}, [%5]                          \n"
307       "ld1       {v30.16b}, [%6]                         \n"
308       "ld1       {v31.8h}, [%7]                          \n"
309       "add       %2, %2, %0                              \n"
310       "1:                                                \n"
311 
312       // 00 40 01 41 02 42 03 43
313       // 10 50 11 51 12 52 13 53
314       // 20 60 21 61 22 62 23 63
315       // 30 70 31 71 32 72 33 73
316       "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
317       "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
318       "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32  \n"
319       "subs      %w4, %w4, #12                           \n"
320 
321       // Shuffle the input data around to get align the data
322       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
323       // 00 10 01 11 02 12 03 13
324       // 40 50 41 51 42 52 43 53
325       "trn1      v20.8b, v0.8b, v1.8b                    \n"
326       "trn2      v21.8b, v0.8b, v1.8b                    \n"
327       "trn1      v22.8b, v4.8b, v5.8b                    \n"
328       "trn2      v23.8b, v4.8b, v5.8b                    \n"
329       "trn1      v24.8b, v16.8b, v17.8b                  \n"
330       "trn2      v25.8b, v16.8b, v17.8b                  \n"
331 
332       // 20 30 21 31 22 32 23 33
333       // 60 70 61 71 62 72 63 73
334       "trn1      v0.8b, v2.8b, v3.8b                     \n"
335       "trn2      v1.8b, v2.8b, v3.8b                     \n"
336       "trn1      v4.8b, v6.8b, v7.8b                     \n"
337       "trn2      v5.8b, v6.8b, v7.8b                     \n"
338       "trn1      v16.8b, v18.8b, v19.8b                  \n"
339       "trn2      v17.8b, v18.8b, v19.8b                  \n"
340 
341       // 00+10 01+11 02+12 03+13
342       // 40+50 41+51 42+52 43+53
343       "uaddlp    v20.4h, v20.8b                          \n"
344       "uaddlp    v21.4h, v21.8b                          \n"
345       "uaddlp    v22.4h, v22.8b                          \n"
346       "uaddlp    v23.4h, v23.8b                          \n"
347       "uaddlp    v24.4h, v24.8b                          \n"
348       "uaddlp    v25.4h, v25.8b                          \n"
349 
350       // 60+70 61+71 62+72 63+73
351       "uaddlp    v1.4h, v1.8b                            \n"
352       "uaddlp    v5.4h, v5.8b                            \n"
353       "uaddlp    v17.4h, v17.8b                          \n"
354 
355       // combine source lines
356       "add       v20.4h, v20.4h, v22.4h                  \n"
357       "add       v21.4h, v21.4h, v23.4h                  \n"
358       "add       v20.4h, v20.4h, v24.4h                  \n"
359       "add       v21.4h, v21.4h, v25.4h                  \n"
360       "add       v2.4h, v1.4h, v5.4h                     \n"
361       "add       v2.4h, v2.4h, v17.4h                    \n"
362 
363       // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
364       //             + s[6 + st * 1] + s[7 + st * 1]
365       //             + s[6 + st * 2] + s[7 + st * 2]) / 6
366       "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
367       "xtn       v2.8b,  v2.8h                           \n"
368 
369       // Shuffle 2,3 reg around so that 2 can be added to the
370       //  0,1 reg and 3 can be added to the 4,5 reg. This
371       //  requires expanding from u8 to u16 as the 0,1 and 4,5
372       //  registers are already expanded. Then do transposes
373       //  to get aligned.
374       // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
375       "ushll     v16.8h, v16.8b, #0                      \n"
376       "uaddl     v0.8h, v0.8b, v4.8b                     \n"
377 
378       // combine source lines
379       "add       v0.8h, v0.8h, v16.8h                    \n"
380 
381       // xx 20 xx 21 xx 22 xx 23
382       // xx 30 xx 31 xx 32 xx 33
383       "trn1      v1.8h, v0.8h, v0.8h                     \n"
384       "trn2      v4.8h, v0.8h, v0.8h                     \n"
385       "xtn       v0.4h, v1.4s                            \n"
386       "xtn       v4.4h, v4.4s                            \n"
387 
388       // 0+1+2, 3+4+5
389       "add       v20.8h, v20.8h, v0.8h                   \n"
390       "add       v21.8h, v21.8h, v4.8h                   \n"
391 
392       // Need to divide, but can't downshift as the the value
393       //  isn't a power of 2. So multiply by 65536 / n
394       //  and take the upper 16 bits.
395       "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
396       "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
397 
398       // Align for table lookup, vtbl requires registers to be adjacent
399       "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
400 
401       "st1       {v3.8b}, [%1], #8                       \n"
402       "st1       {v3.s}[2], [%1], #4                     \n"
403       "b.gt      1b                                      \n"
404       : "+r"(src_ptr),         // %0
405         "+r"(dst_ptr),         // %1
406         "+r"(tmp_src_stride),  // %2
407         "+r"(src_ptr1),        // %3
408         "+r"(dst_width)        // %4
409       : "r"(&kMult38_Div6),    // %5
410         "r"(&kShuf38_2),       // %6
411         "r"(&kMult38_Div9)     // %7
412       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
413         "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
414         "memory", "cc");
415 }
416 
417 // 32x2 -> 12x1
ScaleRowDown38_2_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)418 void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
419                                ptrdiff_t src_stride,
420                                uint8_t* dst_ptr,
421                                int dst_width) {
422   // TODO(fbarchard): use src_stride directly for clang 3.5+.
423   ptrdiff_t tmp_src_stride = src_stride;
424   asm volatile(
425       "ld1       {v30.8h}, [%4]                          \n"
426       "ld1       {v31.16b}, [%5]                         \n"
427       "add       %2, %2, %0                              \n"
428       "1:                                                \n"
429 
430       // 00 40 01 41 02 42 03 43
431       // 10 50 11 51 12 52 13 53
432       // 20 60 21 61 22 62 23 63
433       // 30 70 31 71 32 72 33 73
434       "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
435       "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
436       "subs      %w3, %w3, #12                           \n"
437 
438       // Shuffle the input data around to get align the data
439       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
440       // 00 10 01 11 02 12 03 13
441       // 40 50 41 51 42 52 43 53
442       "trn1      v16.8b, v0.8b, v1.8b                    \n"
443       "trn2      v17.8b, v0.8b, v1.8b                    \n"
444       "trn1      v18.8b, v4.8b, v5.8b                    \n"
445       "trn2      v19.8b, v4.8b, v5.8b                    \n"
446 
447       // 20 30 21 31 22 32 23 33
448       // 60 70 61 71 62 72 63 73
449       "trn1      v0.8b, v2.8b, v3.8b                     \n"
450       "trn2      v1.8b, v2.8b, v3.8b                     \n"
451       "trn1      v4.8b, v6.8b, v7.8b                     \n"
452       "trn2      v5.8b, v6.8b, v7.8b                     \n"
453 
454       // 00+10 01+11 02+12 03+13
455       // 40+50 41+51 42+52 43+53
456       "uaddlp    v16.4h, v16.8b                          \n"
457       "uaddlp    v17.4h, v17.8b                          \n"
458       "uaddlp    v18.4h, v18.8b                          \n"
459       "uaddlp    v19.4h, v19.8b                          \n"
460 
461       // 60+70 61+71 62+72 63+73
462       "uaddlp    v1.4h, v1.8b                            \n"
463       "uaddlp    v5.4h, v5.8b                            \n"
464 
465       // combine source lines
466       "add       v16.4h, v16.4h, v18.4h                  \n"
467       "add       v17.4h, v17.4h, v19.4h                  \n"
468       "add       v2.4h, v1.4h, v5.4h                     \n"
469 
470       // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
471       "uqrshrn   v2.8b, v2.8h, #2                        \n"
472 
473       // Shuffle 2,3 reg around so that 2 can be added to the
474       //  0,1 reg and 3 can be added to the 4,5 reg. This
475       //  requires expanding from u8 to u16 as the 0,1 and 4,5
476       //  registers are already expanded. Then do transposes
477       //  to get aligned.
478       // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
479 
480       // combine source lines
481       "uaddl     v0.8h, v0.8b, v4.8b                     \n"
482 
483       // xx 20 xx 21 xx 22 xx 23
484       // xx 30 xx 31 xx 32 xx 33
485       "trn1      v1.8h, v0.8h, v0.8h                     \n"
486       "trn2      v4.8h, v0.8h, v0.8h                     \n"
487       "xtn       v0.4h, v1.4s                            \n"
488       "xtn       v4.4h, v4.4s                            \n"
489 
490       // 0+1+2, 3+4+5
491       "add       v16.8h, v16.8h, v0.8h                   \n"
492       "add       v17.8h, v17.8h, v4.8h                   \n"
493 
494       // Need to divide, but can't downshift as the the value
495       //  isn't a power of 2. So multiply by 65536 / n
496       //  and take the upper 16 bits.
497       "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
498       "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
499 
500       // Align for table lookup, vtbl requires registers to
501       //  be adjacent
502 
503       "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
504 
505       "st1       {v3.8b}, [%1], #8                       \n"
506       "st1       {v3.s}[2], [%1], #4                     \n"
507       "b.gt      1b                                      \n"
508       : "+r"(src_ptr),         // %0
509         "+r"(dst_ptr),         // %1
510         "+r"(tmp_src_stride),  // %2
511         "+r"(dst_width)        // %3
512       : "r"(&kMult38_Div6),    // %4
513         "r"(&kShuf38_2)        // %5
514       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
515         "v19", "v30", "v31", "memory", "cc");
516 }
517 
// Add a row of bytes to a row of shorts.  Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "ld1      {v1.8h, v2.8h}, [%1]             \n"  // load accumulator
      "ld1      {v0.16b}, [%0], #16              \n"  // load 16 bytes
      "uaddw2   v2.8h, v2.8h, v0.16b             \n"  // add
      "uaddw    v1.8h, v1.8h, v0.8b              \n"
      "st1      {v1.8h, v2.8h}, [%1], #32        \n"  // store accumulator
      "subs     %w2, %w2, #16                    \n"  // 16 processed per loop
      "b.gt     1b                               \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
  );
}
539 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

// Horizontal bilinear filter: for each of 8 output pixels per iteration,
// gathers the source pixel pair at x>>16 and blends by the 16.16
// fixed-point fraction, stepping x by dx.
void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  // Widen to 64 bits so x/dx can live in full registers for the asm.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov       v6.16b, v1.16b                  \n"
    "mov       v7.16b, v2.16b                  \n"
    "uzp1      v6.8h, v6.8h, v7.8h             \n"
    "ushll     v4.8h, v4.8b, #0                \n"
    "ushll     v5.8h, v5.8b, #0                \n"
    "ssubl     v16.4s, v5.4h, v4.4h            \n"
    "ssubl2    v17.4s, v5.8h, v4.8h            \n"
    "ushll     v7.4s, v6.4h, #0                \n"
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"
    "mul       v17.4s, v17.4s, v6.4s           \n"
    "rshrn     v6.4h, v16.4s, #16              \n"
    "rshrn2    v6.8h, v17.4s, #16              \n"
    "add       v4.8h, v4.8h, v6.8h             \n"
    "xtn       v4.8b, v4.8h                    \n"

    "st1       {v4.8b}, [%0], #8               \n"  // store pixels
    "add       v1.4s, v1.4s, v0.4s             \n"
    "add       v2.4s, v2.4s, v0.4s             \n"
    "subs      %w2, %w2, #8                    \n"  // 8 processed per loop
    "b.gt      1b                              \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
617 
618 // 16x2 -> 16x1
// Blends two adjacent source rows vertically into one destination row,
// 16 pixels per loop.  source_y_fraction (0..256) is the weight of the
// second row (src_ptr + src_stride); 0 selects the first row unchanged.
// Fractions 64/128/192 take fast rounding-halving-add special cases.
// One extra trailing byte (a copy of the last blended pixel) is stored
// because filtering callers consume dst_width + 1 pixels.
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;  // weight of the first row
  asm volatile(
      "cmp          %w4, #0                      \n"
      "b.eq         100f                         \n"  // 0: copy first row
      "add          %2, %2, %1                   \n"  // %2 = second row ptr
      "cmp          %w4, #64                     \n"
      "b.eq         75f                          \n"  // 75% row0 + 25% row1
      "cmp          %w4, #128                    \n"
      "b.eq         50f                          \n"  // 50% / 50%
      "cmp          %w4, #192                    \n"
      "b.eq         25f                          \n"  // 25% row0 + 75% row1

      // The weights must fill all 16 lanes: umull2/umlal2 below read the
      // high halves of v4/v5.  A dup to .8b zeroes bytes 8-15 (AArch64
      // partial-vector writes clear the upper half), which made pixels
      // 8..15 of each group come out as zero.
      "dup          v5.16b, %w4                  \n"  // weight of row 1
      "dup          v4.16b, %w5                  \n"  // weight of row 0
      // General purpose row blend.
      "1:                                        \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "umull        v6.8h, v0.8b, v4.8b          \n"
      "umull2       v7.8h, v0.16b, v4.16b        \n"
      "umlal        v6.8h, v1.8b, v5.8b          \n"
      "umlal2       v7.8h, v1.16b, v5.16b        \n"
      "rshrn        v0.8b, v6.8h, #8             \n"  // round((a*w0+b*w1)/256)
      "rshrn2       v0.16b, v7.8h, #8            \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         1b                           \n"
      "b            99f                          \n"

      // Blend 25 / 75.
      "25:                                       \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(a, b)
      "urhadd       v0.16b, v0.16b, v1.16b       \n"  // avg(avg, b) = a/4 + 3b/4
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         25b                          \n"
      "b            99f                          \n"

      // Blend 50 / 50.
      "50:                                       \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "ld1          {v1.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         50b                          \n"
      "b            99f                          \n"

      // Blend 75 / 25.
      "75:                                       \n"
      "ld1          {v1.16b}, [%1], #16          \n"  // note rows swapped vs 25
      "ld1          {v0.16b}, [%2], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "urhadd       v0.16b, v0.16b, v1.16b       \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         75b                          \n"
      "b            99f                          \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "ld1          {v0.16b}, [%1], #16          \n"
      "subs         %w3, %w3, #16                \n"
      "st1          {v0.16b}, [%0], #16          \n"
      "b.gt         100b                         \n"

      "99:                                       \n"
      "st1          {v0.b}[15], [%0]             \n"  // store one extra pixel
      : "+r"(dst_ptr),            // %0
        "+r"(src_ptr),            // %1
        "+r"(src_stride),         // %2
        "+r"(dst_width),          // %3
        "+r"(source_y_fraction),  // %4
        "+r"(y_fraction)          // %5
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}
703 
// Halves a row of ARGB pixels by point sampling: keeps every odd pixel.
// Reads 16 ARGB pixels (64 bytes) and writes 8 per loop iteration.
void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;  // single-row filter; the second row is not used
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "mov        v2.16b, v3.16b                 \n"  // pair v1/v2 for the st2
      "st2        {v1.4s,v2.4s}, [%1], #32       \n"  // store 8 odd pixels
      "b.gt       1b                             \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
724 
// Halves a row of ARGB pixels by averaging each horizontal pair
// (rounding-halving add).  Reads 16 ARGB pixels, writes 8 per loop.
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;  // single-row filter; the second row is not used
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop

      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
      "urhadd     v1.16b, v2.16b, v3.16b         \n"
      "st2        {v0.4s,v1.4s}, [%1], #32       \n"  // store 8 pixels
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
747 
// Halves ARGB both directions with a 2x2 box filter: averages each 2x2
// block across two source rows.  Reads 16 ARGB pixels from each row and
// writes 8 averaged pixels per loop.
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %1, %0                     \n"
      "1:                                        \n"
      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
      "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
      "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
      "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
      "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
      "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load row 2
      "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
      "rshrn      v1.8b, v1.8h, #2               \n"
      "rshrn      v2.8b, v2.8h, #2               \n"
      "rshrn      v3.8b, v3.8h, #2               \n"
      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1  becomes the second-row pointer
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
780 
781 // Reads 4 pixels at a time.
782 // Alignment requirement: src_argb 4 byte aligned.
// Point-samples a row of ARGB pixels, taking every src_stepx-th pixel.
// %3 holds the byte step between samples (src_stepx * 4 since each ARGB
// pixel is 4 bytes).  Processes 4 output pixels per loop.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;  // single-row filter; the second row is not used
  asm volatile(
      "1:                                        \n"
      "ld1        {v0.s}[0], [%0], %3            \n"  // one 32-bit pixel per lane
      "ld1        {v0.s}[1], [%0], %3            \n"
      "ld1        {v0.s}[2], [%0], %3            \n"
      "ld1        {v0.s}[3], [%0], %3            \n"
      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
      "st1        {v0.16b}, [%1], #16            \n"
      "b.gt       1b                             \n"
      : "+r"(src_argb),                // %0
        "+r"(dst_argb),                // %1
        "+r"(dst_width)                // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}
804 
805 // Reads 4 pixels at a time.
806 // Alignment requirement: src_argb 4 byte aligned.
807 // TODO(Yang Zhang): Might be worth another optimization pass in future.
808 // It could be upgraded to 8 pixels at a time to start with.
// Box-filters 4 2x2 ARGB blocks per loop, stepping src_stepx source
// pixels between blocks.  Each ld1 of 8 bytes fetches the two adjacent
// pixels of a block's top (from %0) or bottom (from %1) row; the blocks
// are summed, averaged (rounded) and written as 4 output pixels.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add        %1, %1, %0                     \n"  // %1 = second row ptr
      "1:                                        \n"
      "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 -> 2x1
      "ld1        {v1.8b}, [%1], %4              \n"
      "ld1        {v2.8b}, [%0], %4              \n"
      "ld1        {v3.8b}, [%1], %4              \n"
      "ld1        {v4.8b}, [%0], %4              \n"
      "ld1        {v5.8b}, [%1], %4              \n"
      "ld1        {v6.8b}, [%0], %4              \n"
      "ld1        {v7.8b}, [%1], %4              \n"
      "uaddl      v0.8h, v0.8b, v1.8b            \n"  // vertical sums per block
      "uaddl      v2.8h, v2.8b, v3.8b            \n"
      "uaddl      v4.8h, v4.8b, v5.8b            \n"
      "uaddl      v6.8h, v6.8b, v7.8b            \n"
      "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
      "mov        v0.d[1], v2.d[0]               \n"
      "mov        v2.d[0], v16.d[1]              \n"
      "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
      "mov        v4.d[1], v6.d[0]               \n"
      "mov        v6.d[0], v16.d[1]              \n"
      "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
      "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
      "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
      "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
      "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
      "st1     {v0.16b}, [%2], #16               \n"
      "b.gt       1b                             \n"
      : "+r"(src_argb),                // %0
        "+r"(src_stride),              // %1  becomes the second-row pointer
        "+r"(dst_argb),                // %2
        "+r"(dst_width)                // %3
      : "r"((int64_t)(src_stepx * 4))  // %4  byte step between blocks
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
849 
850 // TODO(Yang Zhang): Investigate less load instructions for
851 // the x/dx stepping
// Loads a single 32-bit (ARGB) pixel into lane n of register vn: the
// integer part of the 16.16 fixed-point x (%3 >> 16) indexes 4-byte
// pixels from base pointer %1, then x is advanced by dx (%4).
// %5 is a scratch register; %6 receives the computed source address.
#define LOAD1_DATA32_LANE(vn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
857 
// Point-sample (nearest) horizontal ARGB scaler: gathers 8 pixels per
// loop, one at a time via LOAD1_DATA32_LANE, stepping the 16.16
// fixed-point x by dx for each output pixel.
void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;  // scratch: per-pixel load address (%6)
  // x/dx widened to 64 bits so the asm can use them in address math.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;  // scratch: integer part of x (%5)
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      // clang-format on
      "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
      "b.gt       1b                             \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}
892 
893 #undef LOAD1_DATA32_LANE
894 
895 // TODO(Yang Zhang): Investigate less load instructions for
896 // the x/dx stepping
// Loads a pair of adjacent 32-bit ARGB pixels into lane n of vn1 (pixel
// at x) and vn2 (pixel at x + 1) for bilinear filtering, then advances
// the 16.16 fixed-point x (%3) by dx (%4).  %5 is a scratch register;
// %6 receives the computed source address.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
  "lsr        %5, %3, #16                           \n" \
  "add        %6, %1, %5, lsl #2                    \n" \
  "add        %3, %3, %4                            \n" \
  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
902 
// Bilinear horizontal ARGB scaler: 4 output pixels per loop.  For each
// pixel at 16.16 fixed-point position x, loads the pixel pair (a, b) at
// floor(x) and floor(x)+1, derives a 7-bit blend fraction f from x and
// computes (a * (0x7f ^ f) + b * f) >> 7.  v5 tracks x for the 4 lanes
// and is advanced by 4*dx (v6) each iteration.
void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // per-lane multiples of dx
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;  // scratch: per-pixel load address (%6)
  // x/dx widened to 64 bits for the asm's address arithmetic.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v6.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    "movi       v3.16b, #0x7f                  \n"  // 0x7F
    "movi       v4.8h, #0x7f                   \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v5.4s, v1.4s, v0.4s            \n"
  "1:                                          \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn       v2.4h, v5.4s, #9               \n"  // frac = (x >> 9) ...
    "and        v2.8b, v2.8b, v4.8b            \n"  // ... & 0x7f per lane
    "dup        v16.8b, v2.b[0]                \n"  // replicate each lane's
    "dup        v17.8b, v2.b[2]                \n"  // fraction across its
    "dup        v18.8b, v2.b[4]                \n"  // 4 ARGB bytes
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (0x7f ^ f)
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"  // >> 7 and narrow
    "shrn2      v0.16b, v17.8h, #7             \n"

    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"  // advance 4 lanes of x
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt    1b                                \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
965 
966 #undef LOAD2_DATA32_LANE
967 
968 // Read 16x2 average down and write 8x1.
// Averages 16x2 16-bit source pixels down to 8x1 with a 2x2 box filter.
// src_stride is in uint16_t units; the second-row pointer is formed as
// src_ptr + src_stride * 2 bytes.  Processes 8 output pixels per loop.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add        %1, %0, %1, lsl #1             \n"  // ptr + stride * 2
      "1:                                        \n"
      "ld1        {v0.8h, v1.8h}, [%0], #32      \n"  // load row 1 and post inc
      "ld1        {v2.8h, v3.8h}, [%1], #32      \n"  // load row 2 and post inc
      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
      "uaddlp     v0.4s, v0.8h                   \n"  // row 1 add adjacent
      "uaddlp     v1.4s, v1.8h                   \n"
      "uadalp     v0.4s, v2.8h                   \n"  // +row 2 add adjacent
      "uadalp     v1.4s, v3.8h                   \n"
      "rshrn      v0.4h, v0.4s, #2               \n"  // round and pack
      "rshrn2     v0.8h, v1.4s, #2               \n"
      "st1        {v0.8h}, [%2], #16             \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1  becomes the second-row pointer
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      // "memory" and "cc" were missing: the st1 writes memory and subs
      // sets the condition flags, so the compiler must not cache memory
      // or flags across this asm (matches the rest of this file).
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
996 
997 // Read 8x2 upsample with filtering and write 16x1.
998 // Actually reads an extra pixel, so 9x2.
// Upsamples 8x2 16-bit source pixels to 16x1 using a 2x2 bilinear
// (9:3:3:1) filter; reads one extra pixel per row (9x2 total).
// Literal operand %4 (2 bytes) re-reads each row offset by one pixel
// for the TR/BR taps; %5 (14 bytes) then advances to the next group of
// 8 source pixels.  Results are rounded, saturated and shifted by 4
// (sum of weights = 16) and interleaved even/odd via st2.
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
      "add        %1, %0, %1, lsl #1             \n"  // ptr + stride * 2
      "movi       v0.8h, #9                      \n"  // constants
      "movi       v1.4s, #3                      \n"

      "1:                                        \n"
      "ld1        {v3.8h}, [%0], %4              \n"  // TL read first 8
      "ld1        {v4.8h}, [%0], %5              \n"  // TR read 8 offset by 1
      "ld1        {v5.8h}, [%1], %4              \n"  // BL read 8 from next row
      "ld1        {v6.8h}, [%1], %5              \n"  // BR offset by 1
      "subs       %w3, %w3, #16                  \n"  // 16 dst pixels per loop
      "umull      v16.4s, v3.4h, v0.4h           \n"  // 9 * nearest tap
      "umull2     v7.4s, v3.8h, v0.8h            \n"
      "umull      v18.4s, v4.4h, v0.4h           \n"
      "umull2     v17.4s, v4.8h, v0.8h           \n"
      "uaddw      v16.4s, v16.4s, v6.4h          \n"  // + 1 * diagonal tap
      "uaddl2     v19.4s, v6.8h, v3.8h           \n"
      "uaddl      v3.4s, v6.4h, v3.4h            \n"
      "uaddw2     v6.4s, v7.4s, v6.8h            \n"
      "uaddl2     v7.4s, v5.8h, v4.8h            \n"
      "uaddl      v4.4s, v5.4h, v4.4h            \n"
      "uaddw      v18.4s, v18.4s, v5.4h          \n"
      "mla        v16.4s, v4.4s, v1.4s           \n"  // + 3 * adjacent taps
      "mla        v18.4s, v3.4s, v1.4s           \n"
      "mla        v6.4s, v7.4s, v1.4s            \n"
      "uaddw2     v4.4s, v17.4s, v5.8h           \n"
      "uqrshrn    v16.4h,  v16.4s, #4            \n"  // (sum + 8) >> 4, saturate
      "mla        v4.4s, v19.4s, v1.4s           \n"
      "uqrshrn2   v16.8h, v6.4s, #4              \n"
      "uqrshrn    v17.4h, v18.4s, #4             \n"
      "uqrshrn2   v17.8h, v4.4s, #4              \n"
      "st2        {v16.8h-v17.8h}, [%2], #32     \n"  // interleave even/odd
      "b.gt       1b                             \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1  becomes the second-row pointer
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      : "r"(2LL),          // %4  advance one source pixel (bytes)
        "r"(14LL)          // %5  advance seven source pixels (bytes)
      // "memory" and "cc" were missing: the st2 writes memory and subs
      // sets the condition flags, so the compiler must not cache memory
      // or flags across this asm (matches the rest of this file).
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v16", "v17", "v18", "v19"  // Clobber List
  );
}
1046 
1047 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1048 
1049 #ifdef __cplusplus
1050 }  // extern "C"
1051 }  // namespace libyuv
1052 #endif
1053