// (removed: HTML code-viewer navigation chrome — Home / Line# / Scopes# /
//  Navigate / Raw / Download — accidentally captured with this source file)
/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
14 
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19 
20 // This module is for GCC Neon armv8 64 bit.
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22 
// Read 32x1 throw away even pixels, and write 16x1.
// Point-samples a row down by 2 by keeping the odd pixels.
// src_stride is unused: this variant reads a single source row.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load even pixels into v0, odd into v1
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v1.16b}, [%1], #16           \n"  // store odd pixels
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      // "memory" because the asm stores through dst, and "cc" because subs
      // updates the condition flags; matches the clobber lists of the other
      // scale functions in this file.
      : "v0", "v1", "memory", "cc"  // Clobber List
  );
}
44 
// Read 32x1 average down and write 16x1.
// Horizontal 2:1 downscale with rounding: dst[i] = (src[2i] + src[2i+1] + 1) / 2.
// src_stride is unused: this variant reads a single source row.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load even pixels into v0, odd into v1
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v0.16b}, [%1], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      // "memory" because the asm stores through dst, and "cc" because subs
      // updates the condition flags; matches the clobber lists of the other
      // scale functions in this file.
      : "v0", "v1", "memory", "cc"  // Clobber List
  );
}
67 
// Read 32x2 average down and write 16x1.
// 2x2 box filter with rounding: each output pixel is the average of a 2x2
// block spanning src_ptr and the row src_stride bytes below it.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld1         {v0.16b, v1.16b}, [%0], #32   \n"  // load row 1 and post inc
      "ld1         {v2.16b, v3.16b}, [%1], #32   \n"  // load row 2 and post inc
      "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
      "uaddlp      v0.8h, v0.16b                 \n"  // row 1 add adjacent
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uaddlp      v1.8h, v1.16b                 \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "uadalp      v0.8h, v2.16b                 \n"  // += row 2 add adjacent
      "uadalp      v1.8h, v3.16b                 \n"
      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
      "rshrn2      v0.16b, v1.8h, #2             \n"
      "st1         {v0.16b}, [%2], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      // "memory" because the asm stores through dst, and "cc" because subs
      // updates the condition flags; matches the clobber lists of the other
      // scale functions in this file.
      : "v0", "v1", "v2", "v3", "memory", "cc"  // Clobber List
  );
}
98 
// Point-sample down by 4: read 32x1, keep one pixel of every group of 4
// (element index 2 of each ld4 group), and write 8x1.
// src_stride is unused: this variant reads a single source row.
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v2.8b}, [%1], #8             \n"  // keep pixel 2 of each 4
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
117 
// 4x4 box filter: read 16x4 source pixels, average each 4x4 block with
// rounding (sum of 16 bytes >> 4), and write 4x1 output pixels.
void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  // Rows 1..3 below src_ptr; row 0 is src_ptr itself.
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.16b}, [%0], #16           \n"  // load up 16x4
      "ld1         {v1.16b}, [%2], #16           \n"
      "ld1         {v2.16b}, [%3], #16           \n"
      "ld1         {v3.16b}, [%4], #16           \n"
      "subs        %w5, %w5, #4                  \n"  // 4 output pixels / loop
      "uaddlp      v0.8h, v0.16b                 \n"  // row 0: add adjacent
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uadalp      v0.8h, v1.16b                 \n"  // accumulate rows 1..3
      "prfm        pldl1keep, [%2, 448]          \n"
      "uadalp      v0.8h, v2.16b                 \n"
      "prfm        pldl1keep, [%3, 448]          \n"
      "uadalp      v0.8h, v3.16b                 \n"
      "prfm        pldl1keep, [%4, 448]          \n"
      "addp        v0.8h, v0.8h, v0.8h           \n"  // pair-sum across columns
      "rshrn       v0.8b, v0.8h, #4              \n"  // divide by 16 w/rounding
      "st1         {v0.s}[0], [%1], #4           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
153 
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels.
// src_stride is unused: this variant reads a single source row.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs        %w2, %w2, #24                 \n"
      "orr         v2.16b, v3.16b, v3.16b        \n"  // order v0,v1,v2
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
176 
// 4:3 box downscale, weighting the two source rows 3:1 (row 0 dominant).
// Reads 32 pixels from each of two rows, blends them as (3*row0 + row1) >> 2,
// then filters groups of 4 horizontal pixels down to 3 and writes 24 pixels.
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi        v20.8b, #3                    \n"  // multiplier for umlal
      "add         %3, %3, %0                    \n"  // %3 = row 1 pointer
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs        %w2, %w2, #24                 \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll       v16.8h, v4.8b, #0             \n"
      "ushll       v17.8h, v5.8b, #0             \n"
      "ushll       v18.8h, v6.8b, #0             \n"
      "ushll       v19.8h, v7.8b, #0             \n"

      // 3 * line_0 + line_1
      "umlal       v16.8h, v0.8b, v20.8b         \n"
      "umlal       v17.8h, v1.8b, v20.8b         \n"
      "umlal       v18.8h, v2.8b, v20.8b         \n"
      "umlal       v19.8h, v3.8b, v20.8b         \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      // (3 * line_0 + line_1) >> 2
      "uqrshrn     v0.8b, v16.8h, #2             \n"
      "uqrshrn     v1.8b, v17.8h, #2             \n"
      "uqrshrn     v2.8b, v18.8h, #2             \n"
      "uqrshrn     v3.8b, v19.8h, #2             \n"
      "prfm        pldl1keep, [%3, 448]          \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll       v16.8h, v1.8b, #0             \n"
      "umlal       v16.8h, v0.8b, v20.8b         \n"
      "uqrshrn     v0.8b, v16.8h, #2             \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd      v1.8b, v1.8b, v2.8b           \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll       v16.8h, v2.8b, #0             \n"
      "umlal       v16.8h, v3.8b, v20.8b         \n"
      "uqrshrn     v2.8b, v16.8h, #2             \n"

      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}
235 
// 4:3 box downscale, weighting the two source rows equally (1:1 average).
// Reads 32 pixels from each of two rows, averages them with rounding, then
// filters groups of 4 horizontal pixels down to 3 and writes 24 pixels.
void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi        v20.8b, #3                    \n"  // multiplier for umlal
      "add         %3, %3, %0                    \n"  // %3 = row 1 pointer
      "1:                                        \n"
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs        %w2, %w2, #24                 \n"
      // average src line 0 with src line 1
      "urhadd      v0.8b, v0.8b, v4.8b           \n"
      "urhadd      v1.8b, v1.8b, v5.8b           \n"
      "urhadd      v2.8b, v2.8b, v6.8b           \n"
      "urhadd      v3.8b, v3.8b, v7.8b           \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll       v4.8h, v1.8b, #0              \n"
      "umlal       v4.8h, v0.8b, v20.8b          \n"
      "uqrshrn     v0.8b, v4.8h, #2              \n"
      "prfm        pldl1keep, [%3, 448]          \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd      v1.8b, v1.8b, v2.8b           \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll       v4.8h, v2.8b, #0              \n"
      "umlal       v4.8h, v3.8b, v20.8b          \n"
      "uqrshrn     v2.8b, v4.8h, #2              \n"

      "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}
277 
// Table-lookup index vector: picks 12 of 32 input bytes for the 3/8
// point-sample (ScaleRowDown38_NEON); trailing zeros are unused lanes.
static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
                              22, 24, 27, 30, 0,  0,  0,  0};
// Index vector for tbl across three adjacent registers (v0,v1,v2): values
// 0-15 select from the first register, 16-31 the second, 32+ the third.
static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
                                34, 6,  22, 35, 0,  0,  0, 0};
// Fixed-point reciprocal for dividing a 6-pixel sum by 6: sqrdmulh doubles
// the product before taking the high 16 bits, so 65536/12 yields sum/6.
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
// Fixed-point reciprocal for dividing a 9-pixel sum by 9 (same doubling:
// 65536/18 yields sum/9 through sqrdmulh).
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
288 
// 32 -> 12
// 3/8 point sample: selects 12 of every 32 pixels via a tbl shuffle driven
// by kShuf38. src_stride is unused: single source row.
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "ld1         {v3.16b}, [%3]                \n"  // load kShuf38 indices
      "1:                                        \n"
      "ld1         {v0.16b,v1.16b}, [%0], #32    \n"
      "subs        %w2, %w2, #12                 \n"
      "tbl         v2.16b, {v0.16b,v1.16b}, v3.16b \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v2.8b}, [%1], #8             \n"  // 8 + 4 = 12 bytes out
      "st1         {v2.s}[2], [%1], #4           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
311 
// 32x3 -> 12x1
// 3/8 box downscale over three source rows. Sums of 9 (or 6, for the tail
// column pair) pixels are divided via the fixed-point reciprocals loaded
// into v31 (kMult38_Div9) and v29 (kMult38_Div6), then the 12 results are
// gathered with a tbl shuffle (kShuf38_2 in v30).
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;  // row 2
  ptrdiff_t tmp_src_stride = src_stride;  // becomes the row-1 pointer in asm

  asm volatile(
      "ld1         {v29.8h}, [%5]                \n"  // kMult38_Div6
      "ld1         {v30.16b}, [%6]               \n"  // kShuf38_2
      "ld1         {v31.8h}, [%7]                \n"  // kMult38_Div9
      "add         %2, %2, %0                    \n"  // %2 = row 1 pointer
      "1:                                        \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
      "subs        %w4, %w4, #12                 \n"

      // Shuffle the input data around to get align the data
      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1        v20.8b, v0.8b, v1.8b          \n"
      "trn2        v21.8b, v0.8b, v1.8b          \n"
      "trn1        v22.8b, v4.8b, v5.8b          \n"
      "trn2        v23.8b, v4.8b, v5.8b          \n"
      "trn1        v24.8b, v16.8b, v17.8b        \n"
      "trn2        v25.8b, v16.8b, v17.8b        \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1        v0.8b, v2.8b, v3.8b           \n"
      "trn2        v1.8b, v2.8b, v3.8b           \n"
      "trn1        v4.8b, v6.8b, v7.8b           \n"
      "trn2        v5.8b, v6.8b, v7.8b           \n"
      "trn1        v16.8b, v18.8b, v19.8b        \n"
      "trn2        v17.8b, v18.8b, v19.8b        \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp      v20.4h, v20.8b                \n"
      "uaddlp      v21.4h, v21.8b                \n"
      "uaddlp      v22.4h, v22.8b                \n"
      "uaddlp      v23.4h, v23.8b                \n"
      "uaddlp      v24.4h, v24.8b                \n"
      "uaddlp      v25.4h, v25.8b                \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp      v1.4h, v1.8b                  \n"
      "uaddlp      v5.4h, v5.8b                  \n"
      "uaddlp      v17.4h, v17.8b                \n"

      // combine source lines
      "add         v20.4h, v20.4h, v22.4h        \n"
      "add         v21.4h, v21.4h, v23.4h        \n"
      "add         v20.4h, v20.4h, v24.4h        \n"
      "add         v21.4h, v21.4h, v25.4h        \n"
      "add         v2.4h, v1.4h, v5.4h           \n"
      "add         v2.4h, v2.4h, v17.4h          \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "sqrdmulh    v2.8h, v2.8h, v29.8h          \n"
      "xtn         v2.8b,  v2.8h                 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      //  0,1 reg and 3 can be added to the 4,5 reg. This
      //  requires expanding from u8 to u16 as the 0,1 and 4,5
      //  registers are already expanded. Then do transposes
      //  to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "ushll       v16.8h, v16.8b, #0            \n"
      "uaddl       v0.8h, v0.8b, v4.8b           \n"

      // combine source lines
      "add         v0.8h, v0.8h, v16.8h          \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1        v1.8h, v0.8h, v0.8h           \n"
      "trn2        v4.8h, v0.8h, v0.8h           \n"
      "xtn         v0.4h, v1.4s                  \n"
      "xtn         v4.4h, v4.4s                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      // 0+1+2, 3+4+5
      "add         v20.8h, v20.8h, v0.8h         \n"
      "add         v21.8h, v21.8h, v4.8h         \n"
      "prfm        pldl1keep, [%2, 448]          \n"

      // Need to divide, but can't downshift as the the value
      //  isn't a power of 2. So multiply by 65536 / n
      //  and take the upper 16 bits.
      "sqrdmulh    v0.8h, v20.8h, v31.8h         \n"
      "sqrdmulh    v1.8h, v21.8h, v31.8h         \n"
      "prfm        pldl1keep, [%3, 448]          \n"

      // Align for table lookup, vtbl requires registers to be adjacent
      "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

      "st1         {v3.8b}, [%1], #8             \n"
      "st1         {v3.s}[2], [%1], #4           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(src_ptr1),        // %3
        "+r"(dst_width)        // %4
      : "r"(&kMult38_Div6),    // %5
        "r"(&kShuf38_2),       // %6
        "r"(&kMult38_Div9)     // %7
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
        "memory", "cc");
}
434 
// 32x2 -> 12x1
// 3/8 box downscale over two source rows. Sums of 6 pixels are divided via
// the fixed-point reciprocal kMult38_Div6 (v30); the tail column pair (a
// 4-pixel sum) uses a plain rounding shift. Results are gathered with a tbl
// shuffle (kShuf38_2 in v31).
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;  // becomes the row-1 pointer in asm
  asm volatile(
      "ld1         {v30.8h}, [%4]                \n"  // kMult38_Div6
      "ld1         {v31.16b}, [%5]               \n"  // kShuf38_2
      "add         %2, %2, %0                    \n"  // %2 = row 1 pointer
      "1:                                        \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "subs        %w3, %w3, #12                 \n"

      // Shuffle the input data around to get align the data
      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1        v16.8b, v0.8b, v1.8b          \n"
      "trn2        v17.8b, v0.8b, v1.8b          \n"
      "trn1        v18.8b, v4.8b, v5.8b          \n"
      "trn2        v19.8b, v4.8b, v5.8b          \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1        v0.8b, v2.8b, v3.8b           \n"
      "trn2        v1.8b, v2.8b, v3.8b           \n"
      "trn1        v4.8b, v6.8b, v7.8b           \n"
      "trn2        v5.8b, v6.8b, v7.8b           \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp      v16.4h, v16.8b                \n"
      "uaddlp      v17.4h, v17.8b                \n"
      "uaddlp      v18.4h, v18.8b                \n"
      "uaddlp      v19.4h, v19.8b                \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp      v1.4h, v1.8b                  \n"
      "uaddlp      v5.4h, v5.8b                  \n"

      // combine source lines
      "add         v16.4h, v16.4h, v18.4h        \n"
      "add         v17.4h, v17.4h, v19.4h        \n"
      "add         v2.4h, v1.4h, v5.4h           \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "uqrshrn     v2.8b, v2.8h, #2              \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      //  0,1 reg and 3 can be added to the 4,5 reg. This
      //  requires expanding from u8 to u16 as the 0,1 and 4,5
      //  registers are already expanded. Then do transposes
      //  to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

      // combine source lines
      "uaddl       v0.8h, v0.8b, v4.8b           \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1        v1.8h, v0.8h, v0.8h           \n"
      "trn2        v4.8h, v0.8h, v0.8h           \n"
      "xtn         v0.4h, v1.4s                  \n"
      "xtn         v4.4h, v4.4s                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      // 0+1+2, 3+4+5
      "add         v16.8h, v16.8h, v0.8h         \n"
      "add         v17.8h, v17.8h, v4.8h         \n"
      "prfm        pldl1keep, [%2, 448]          \n"

      // Need to divide, but can't downshift as the the value
      //  isn't a power of 2. So multiply by 65536 / n
      //  and take the upper 16 bits.
      "sqrdmulh    v0.8h, v16.8h, v30.8h         \n"
      "sqrdmulh    v1.8h, v17.8h, v30.8h         \n"

      // Align for table lookup, vtbl requires registers to
      //  be adjacent

      "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

      "st1         {v3.8b}, [%1], #8             \n"
      "st1         {v3.s}[2], [%1], #4           \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(dst_width)        // %3
      : "r"(&kMult38_Div6),    // %4
        "r"(&kShuf38_2)        // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v30", "v31", "memory", "cc");
}
537 
// Add a row of bytes to a row of shorts.  Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
// dst_ptr is read-modify-write: existing sums are loaded, incremented by the
// widened source bytes, and stored back.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v1.8h, v2.8h}, [%1]          \n"  // load accumulator
      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 bytes
      "uaddw2      v2.8h, v2.8h, v0.16b          \n"  // add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uaddw       v1.8h, v1.8h, v0.8b           \n"
      "st1         {v1.8h, v2.8h}, [%1], #32     \n"  // store accumulator
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
  );
}
560 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads one pixel pair src[x>>16], src[(x>>16)+1] into lane n of v4/v5 and
// advances the 16.16 fixed-point position x by dx.  %3 = x, %4 = dx,
// %5 = scratch, %6 = lane address.
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

// Horizontal bilinear filter: for 8 output pixels per iteration, gathers
// pixel pairs at 16.16 fixed-point positions x, x+dx, ... and blends each
// pair by the fractional part of its position.
void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane multipliers for x + n*dx
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  // Widened to 64 bits so they can live in the X registers the asm uses.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup         v0.4s, %w3                    \n"  // x
      "dup         v1.4s, %w4                    \n"  // dx
      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
      "shl         v3.4s, v1.4s, #2              \n"  // 4 * dx
      "mul         v1.4s, v1.4s, v2.4s           \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add         v1.4s, v1.4s, v0.4s           \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "add         v2.4s, v1.4s, v3.4s           \n"
      "shl         v0.4s, v3.4s, #1              \n"  // 8 * dx
      "1:                                        \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
      "mov         v6.16b, v1.16b                \n"
      "mov         v7.16b, v2.16b                \n"
      "uzp1        v6.8h, v6.8h, v7.8h           \n"  // fractions of 8 x's
      "ushll       v4.8h, v4.8b, #0              \n"
      "ushll       v5.8h, v5.8b, #0              \n"
      "ssubl       v16.4s, v5.4h, v4.4h          \n"  // b - a
      "ssubl2      v17.4s, v5.8h, v4.8h          \n"
      "ushll       v7.4s, v6.4h, #0              \n"
      "ushll2      v6.4s, v6.8h, #0              \n"
      "mul         v16.4s, v16.4s, v7.4s         \n"  // f * (b - a)
      "mul         v17.4s, v17.4s, v6.4s         \n"
      "rshrn       v6.4h, v16.4s, #16            \n"  // >> 16 with rounding
      "rshrn2      v6.8h, v17.4s, #16            \n"
      "add         v4.8h, v4.8h, v6.8h           \n"  // a + blended delta
      "xtn         v4.8b, v4.8h                  \n"

      "st1         {v4.8b}, [%0], #8             \n"  // store pixels
      "add         v1.4s, v1.4s, v0.4s           \n"  // advance x's by 8 * dx
      "add         v2.4s, v2.4s, v0.4s           \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "b.gt        1b                            \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
638 
639 // 16x2 -> 16x1
ScaleFilterRows_NEON(uint8_t * dst_ptr,const uint8_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)640 void ScaleFilterRows_NEON(uint8_t* dst_ptr,
641                           const uint8_t* src_ptr,
642                           ptrdiff_t src_stride,
643                           int dst_width,
644                           int source_y_fraction) {
645   int y_fraction = 256 - source_y_fraction;
646   asm volatile(
647       "cmp         %w4, #0                       \n"
648       "b.eq        100f                          \n"
649       "add         %2, %2, %1                    \n"
650       "cmp         %w4, #64                      \n"
651       "b.eq        75f                           \n"
652       "cmp         %w4, #128                     \n"
653       "b.eq        50f                           \n"
654       "cmp         %w4, #192                     \n"
655       "b.eq        25f                           \n"
656 
657       "dup         v5.8b, %w4                    \n"
658       "dup         v4.8b, %w5                    \n"
659       // General purpose row blend.
660       "1:                                        \n"
661       "ld1         {v0.16b}, [%1], #16           \n"
662       "ld1         {v1.16b}, [%2], #16           \n"
663       "subs        %w3, %w3, #16                 \n"
664       "umull       v6.8h, v0.8b, v4.8b           \n"
665       "umull2      v7.8h, v0.16b, v4.16b         \n"
666       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
667       "umlal       v6.8h, v1.8b, v5.8b           \n"
668       "umlal2      v7.8h, v1.16b, v5.16b         \n"
669       "prfm        pldl1keep, [%2, 448]          \n"
670       "rshrn       v0.8b, v6.8h, #8              \n"
671       "rshrn2      v0.16b, v7.8h, #8             \n"
672       "st1         {v0.16b}, [%0], #16           \n"
673       "b.gt        1b                            \n"
674       "b           99f                           \n"
675 
676       // Blend 25 / 75.
677       "25:                                       \n"
678       "ld1         {v0.16b}, [%1], #16           \n"
679       "ld1         {v1.16b}, [%2], #16           \n"
680       "subs        %w3, %w3, #16                 \n"
681       "urhadd      v0.16b, v0.16b, v1.16b        \n"
682       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
683       "urhadd      v0.16b, v0.16b, v1.16b        \n"
684       "prfm        pldl1keep, [%2, 448]          \n"
685       "st1         {v0.16b}, [%0], #16           \n"
686       "b.gt        25b                           \n"
687       "b           99f                           \n"
688 
689       // Blend 50 / 50.
690       "50:                                       \n"
691       "ld1         {v0.16b}, [%1], #16           \n"
692       "ld1         {v1.16b}, [%2], #16           \n"
693       "subs        %w3, %w3, #16                 \n"
694       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
695       "urhadd      v0.16b, v0.16b, v1.16b        \n"
696       "prfm        pldl1keep, [%2, 448]          \n"
697       "st1         {v0.16b}, [%0], #16           \n"
698       "b.gt        50b                           \n"
699       "b           99f                           \n"
700 
701       // Blend 75 / 25.
702       "75:                                       \n"
703       "ld1         {v1.16b}, [%1], #16           \n"
704       "ld1         {v0.16b}, [%2], #16           \n"
705       "subs        %w3, %w3, #16                 \n"
706       "urhadd      v0.16b, v0.16b, v1.16b        \n"
707       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
708       "urhadd      v0.16b, v0.16b, v1.16b        \n"
709       "prfm        pldl1keep, [%2, 448]          \n"
710       "st1         {v0.16b}, [%0], #16           \n"
711       "b.gt        75b                           \n"
712       "b           99f                           \n"
713 
714       // Blend 100 / 0 - Copy row unchanged.
715       "100:                                      \n"
716       "ld1         {v0.16b}, [%1], #16           \n"
717       "subs        %w3, %w3, #16                 \n"
718       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
719       "st1         {v0.16b}, [%0], #16           \n"
720       "b.gt        100b                          \n"
721 
722       "99:                                       \n"
723       "st1         {v0.b}[15], [%0]              \n"
724       : "+r"(dst_ptr),            // %0
725         "+r"(src_ptr),            // %1
726         "+r"(src_stride),         // %2
727         "+r"(dst_width),          // %3
728         "+r"(source_y_fraction),  // %4
729         "+r"(y_fraction)          // %5
730       :
731       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
732 }
733 
// ARGB 2x1 -> 1x1 point sample: keeps the odd pixels, discards the even
// ones. Processes 8 output pixels (16 input pixels) per loop; dst_width is
// assumed to be a multiple of 8. src_stride is unused (single-row kernel).
void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "mov         v2.16b, v3.16b                \n"  // pair odd pixels for st2
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st2         {v1.4s,v2.4s}, [%1], #32      \n"  // store 8 odd pixels
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
755 
// ARGB 2x1 -> 1x1 horizontal average: each output pixel is the rounding
// average of two adjacent input pixels. Processes 8 output pixels per loop;
// dst_width is assumed to be a multiple of 8. src_stride is unused.
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop

      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "urhadd      v1.16b, v2.16b, v3.16b        \n"
      "st2         {v0.4s,v1.4s}, [%1], #32      \n"  // store 8 pixels
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
779 
// ARGB 2x2 -> 1x1 box filter: each output pixel is the rounded average of a
// 2x2 input block, computed per channel. ld4 deinterleaves into per-channel
// vectors (v0=B, v1=G, v2=R, v3=A for 16 pixels); uaddlp sums horizontal
// pairs and uadalp accumulates the second row. Processes 8 output pixels
// per loop; dst_width is assumed to be a multiple of 8.
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uaddlp      v3.8h, v3.16b                 \n"  // A 16 bytes -> 8 shorts.
      "ld4         {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
      "uadalp      v0.8h, v16.16b                \n"  // B 16 bytes -> 8 shorts.
      "uadalp      v1.8h, v17.16b                \n"  // G 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uadalp      v2.8h, v18.16b                \n"  // R 16 bytes -> 8 shorts.
      "uadalp      v3.8h, v19.16b                \n"  // A 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
      "rshrn       v1.8b, v1.8h, #2              \n"
      "rshrn       v2.8b, v2.8h, #2              \n"
      "rshrn       v3.8b, v3.8h, #2              \n"
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
814 
815 // Reads 4 pixels at a time.
816 // Alignment requirement: src_argb 4 byte aligned.
// Point-samples one ARGB pixel every src_stepx pixels using post-indexed
// lane loads. Processes 4 output pixels per loop; dst_width is assumed to
// be a multiple of 4. src_stride is unused (single-row kernel).
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.s}[0], [%0], %3           \n"  // %3 = step in bytes
      "ld1         {v0.s}[1], [%0], %3           \n"
      "ld1         {v0.s}[2], [%0], %3           \n"
      "ld1         {v0.s}[3], [%0], %3           \n"
      "subs        %w2, %w2, #4                  \n"  // 4 pixels per loop.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v0.16b}, [%1], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_argb),                // %0
        "+r"(dst_argb),                // %1
        "+r"(dst_width)                // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}
839 
840 // Reads 4 pixels at a time.
841 // Alignment requirement: src_argb 4 byte aligned.
842 // TODO(Yang Zhang): Might be worth another optimization pass in future.
843 // It could be upgraded to 8 pixels at a time to start with.
// ARGB 2x2 box filter with a horizontal step: averages each 2x2 block of
// pixels sampled every src_stepx pixels. Each ld1 {vN.8b} reads a
// horizontal pair of ARGB pixels; pairs from the two rows are summed with
// uaddl, reshuffled via d-lane moves so horizontally adjacent sums line up,
// then rounded down by 2 bits. Processes 4 output pixels per loop;
// dst_width is assumed to be a multiple of 4.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add         %1, %1, %0                    \n"  // %1 = row 2 pointer
      "1:                                        \n"
      "ld1         {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
      "ld1         {v1.8b}, [%1], %4             \n"
      "ld1         {v2.8b}, [%0], %4             \n"
      "ld1         {v3.8b}, [%1], %4             \n"
      "ld1         {v4.8b}, [%0], %4             \n"
      "ld1         {v5.8b}, [%1], %4             \n"
      "ld1         {v6.8b}, [%0], %4             \n"
      "ld1         {v7.8b}, [%1], %4             \n"
      "uaddl       v0.8h, v0.8b, v1.8b           \n"  // vertical sums per pair
      "uaddl       v2.8h, v2.8b, v3.8b           \n"
      "uaddl       v4.8h, v4.8b, v5.8b           \n"
      "uaddl       v6.8h, v6.8b, v7.8b           \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "mov         v16.d[1], v0.d[1]             \n"  // ab_cd -> ac_bd
      "mov         v0.d[1], v2.d[0]              \n"
      "mov         v2.d[0], v16.d[1]             \n"
      "mov         v16.d[1], v4.d[1]             \n"  // ef_gh -> eg_fh
      "mov         v4.d[1], v6.d[0]              \n"
      "mov         v6.d[0], v16.d[1]             \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "add         v0.8h, v0.8h, v2.8h           \n"  // (a+b)_(c+d)
      "add         v4.8h, v4.8h, v6.8h           \n"  // (e+f)_(g+h)
      "rshrn       v0.8b, v0.8h, #2              \n"  // first 2 pixels.
      "rshrn2      v0.16b, v4.8h, #2             \n"  // next 2 pixels.
      "subs        %w3, %w3, #4                  \n"  // 4 pixels per loop.
      "st1         {v0.16b}, [%2], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_argb),                // %0
        "+r"(src_stride),              // %1
        "+r"(dst_argb),                // %2
        "+r"(dst_width)                // %3
      : "r"((int64_t)(src_stepx * 4))  // %4
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
886 
887 // TODO(Yang Zhang): Investigate less load instructions for
888 // the x/dx stepping
// Loads one 32-bit ARGB pixel into lane `n` of vector `vn` from source
// index x>>16, then advances the 16.16 fixed-point x (%3) by dx (%4).
// Operands: %1 = src base, %5 = scratch index, %6 = scratch address.
#define LOAD1_DATA32_LANE(vn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
894 
// Point-samples ARGB columns using 16.16 fixed-point stepping: output pixel
// i is src_argb[(x + i * dx) >> 16]. x and dx are widened to 64-bit so the
// asm can use them directly for addressing. Processes 8 output pixels per
// loop; dst_width is assumed to be a multiple of 8.
void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;
  asm volatile(
      "1:                                        \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
      // clang-format on
      "st1         {v0.4s, v1.4s}, [%0], #32     \n"  // store pixels
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "b.gt        1b                            \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}
930 
931 #undef LOAD1_DATA32_LANE
932 
933 // TODO(Yang Zhang): Investigate less load instructions for
934 // the x/dx stepping
// Loads two adjacent 32-bit ARGB pixels (the a/b pair for the horizontal
// filter) into lane `n` of vectors `vn1`/`vn2` from source index x>>16,
// then advances the 16.16 fixed-point x (%3) by dx (%4).
// Operands: %1 = src base, %5 = scratch index, %6 = scratch address.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
  "lsr        %5, %3, #16                           \n" \
  "add        %6, %1, %5, lsl #2                    \n" \
  "add        %3, %3, %4                            \n" \
  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
940 
// Bilinear horizontal ARGB scale: for each output pixel, loads the pixel
// pair at x>>16 and blends them with a 7-bit fraction taken from bits
// [15:9] of x (shrn #9, masked with 0x7f). Result per byte is
// (a * (0x7f ^ f) + b * f) >> 7. Processes 4 output pixels per loop;
// dst_width is assumed to be a multiple of 4. v5 tracks the four x values
// for the fraction math; the actual addressing x lives in %3.
void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup         v0.4s, %w3                    \n"  // x
      "dup         v1.4s, %w4                    \n"  // dx
      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
      "shl         v6.4s, v1.4s, #2              \n"  // 4 * dx
      "mul         v1.4s, v1.4s, v2.4s           \n"
      "movi        v3.16b, #0x7f                 \n"  // 0x7F
      "movi        v4.8h, #0x7f                  \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add         v5.4s, v1.4s, v0.4s           \n"
      "1:                                        \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    "shrn       v2.4h, v5.4s, #9               \n"  // fraction bits of x
    "and        v2.8b, v2.8b, v4.8b            \n"  // keep low 7 bits
    "dup        v16.8b, v2.b[0]                \n"  // splat per-pixel fraction
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    "umull      v16.8h, v0.8b, v7.8b           \n"  // a * (0x7f ^ f)
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"  // b * f
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "prfm       pldl1keep, [%1, 448]           \n"  // prefetch 7 lines ahead
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"  // scale back to 8 bits
    "shrn2      v0.16b, v17.8h, #7             \n"
    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"  // advance x vector by 4*dx
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
1003 
1004 #undef LOAD2_DATA32_LANE
1005 
1006 // Read 16x2 average down and write 8x1.
ScaleRowDown2Box_16_NEON(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)1007 void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
1008                               ptrdiff_t src_stride,
1009                               uint16_t* dst,
1010                               int dst_width) {
1011   asm volatile(
1012       // change the stride to row 2 pointer
1013       "add         %1, %0, %1, lsl #1            \n"  // ptr + stide * 2
1014       "1:                                        \n"
1015       "ld1         {v0.8h, v1.8h}, [%0], #32     \n"  // load row 1 and post inc
1016       "ld1         {v2.8h, v3.8h}, [%1], #32     \n"  // load row 2 and post inc
1017       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop
1018       "uaddlp      v0.4s, v0.8h                  \n"  // row 1 add adjacent
1019       "uaddlp      v1.4s, v1.8h                  \n"
1020       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
1021       "uadalp      v0.4s, v2.8h                  \n"  // +row 2 add adjacent
1022       "uadalp      v1.4s, v3.8h                  \n"
1023       "prfm        pldl1keep, [%1, 448]          \n"
1024       "rshrn       v0.4h, v0.4s, #2              \n"  // round and pack
1025       "rshrn2      v0.8h, v1.4s, #2              \n"
1026       "st1         {v0.8h}, [%2], #16            \n"
1027       "b.gt        1b                            \n"
1028       : "+r"(src_ptr),     // %0
1029         "+r"(src_stride),  // %1
1030         "+r"(dst),         // %2
1031         "+r"(dst_width)    // %3
1032       :
1033       : "v0", "v1", "v2", "v3"  // Clobber List
1034   );
1035 }
1036 
1037 // Read 8x2 upsample with filtering and write 16x1.
1038 // Actually reads an extra pixel, so 9x2.
// 2x upsample of a uint16 row with a 9:3:3:1 bilinear kernel (umull by 9,
// mla by 3, plus the opposite corner, rounded-scaled by uqrshrn #4).
// %4 (2 bytes) re-reads the row offset by one pixel; %5 (14 bytes) then
// advances to the next group of 8. src_stride is in uint16 units.
// Produces 16 interleaved output pixels per loop via st2.
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
      "add         %1, %0, %1, lsl #1            \n"  // ptr + stride * 2
      "movi        v0.8h, #9                     \n"  // constants
      "movi        v1.4s, #3                     \n"

      "1:                                        \n"
      "ld1         {v3.8h}, [%0], %4             \n"  // TL read first 8
      "ld1         {v4.8h}, [%0], %5             \n"  // TR read 8 offset by 1
      "ld1         {v5.8h}, [%1], %4             \n"  // BL read 8 from next row
      "ld1         {v6.8h}, [%1], %5             \n"  // BR offset by 1
      "subs        %w3, %w3, #16                 \n"  // 16 dst pixels per loop
      "umull       v16.4s, v3.4h, v0.4h          \n"  // 9 * nearest corner
      "umull2      v7.4s, v3.8h, v0.8h           \n"
      "umull       v18.4s, v4.4h, v0.4h          \n"
      "umull2      v17.4s, v4.8h, v0.8h          \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uaddw       v16.4s, v16.4s, v6.4h         \n"  // + far corner
      "uaddl2      v19.4s, v6.8h, v3.8h          \n"
      "uaddl       v3.4s, v6.4h, v3.4h           \n"
      "uaddw2      v6.4s, v7.4s, v6.8h           \n"
      "uaddl2      v7.4s, v5.8h, v4.8h           \n"
      "uaddl       v4.4s, v5.4h, v4.4h           \n"
      "uaddw       v18.4s, v18.4s, v5.4h         \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "mla         v16.4s, v4.4s, v1.4s          \n"  // + 3 * adjacent pair
      "mla         v18.4s, v3.4s, v1.4s          \n"
      "mla         v6.4s, v7.4s, v1.4s           \n"
      "uaddw2      v4.4s, v17.4s, v5.8h          \n"
      "uqrshrn     v16.4h,  v16.4s, #4           \n"  // /16 with rounding
      "mla         v4.4s, v19.4s, v1.4s          \n"
      "uqrshrn2    v16.8h, v6.4s, #4             \n"
      "uqrshrn     v17.4h, v18.4s, #4            \n"
      "uqrshrn2    v17.8h, v4.4s, #4             \n"
      "st2         {v16.8h-v17.8h}, [%2], #32    \n"  // interleave even/odd out
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      : "r"(2LL),          // %4
        "r"(14LL)          // %5
      // NOTE(review): stores through %2 but no "memory" clobber is declared
      // here, unlike sibling functions — confirm intent upstream.
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19"  // Clobber List
  );
}
1088 
// Interleaved UV 2x2 -> 1x1 box filter: averages 2x2 blocks of UV pixels
// with rounding, per channel. Processes 8 output UV pairs (16 input pairs
// across 2 rows) per loop; dst_width is assumed to be a multiple of 8.
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 UV
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // U 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // V 16 bytes -> 8 shorts.
      "ld2         {v16.16b,v17.16b}, [%1], #32  \n"  // load 16
      "uadalp      v0.8h, v16.16b                \n"  // U 16 bytes -> 8 shorts.
      "uadalp      v1.8h, v17.16b                \n"  // V 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
      "prfm        pldl1keep, [%1, 448]          \n"
      "rshrn       v1.8b, v1.8h, #2              \n"
      "st2         {v0.8b,v1.8b}, [%2], #16      \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v16", "v17");
}
1117 
1118 // Reads 4 pixels at a time.
// Point-samples one UV pair (2 bytes) every src_stepx pixels. Four
// pre-offset source pointers walk 4 samples in parallel, each advancing by
// 4 * src_stepx pixels (8 bytes per UV pair step) per iteration; st4
// recombines them in order. dst_width is assumed to be a multiple of 4.
// src_stride is unused (single-row kernel).
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int src_stepx,  // pixel step
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1        {v0.h}[0], [%0], %6            \n"
      "ld1        {v1.h}[0], [%1], %6            \n"
      "ld1        {v2.h}[0], [%2], %6            \n"
      "ld1        {v3.h}[0], [%3], %6            \n"
      "subs       %w5, %w5, #4                   \n"  // 4 pixels per loop.
      "st4        {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
      "b.gt       1b                             \n"
      : "+r"(src_ptr),                 // %0
        "+r"(src1_ptr),                // %1
        "+r"(src2_ptr),                // %2
        "+r"(src3_ptr),                // %3
        "+r"(dst_ptr),                 // %4
        "+r"(dst_width)                // %5
      : "r"((int64_t)(src_stepx * 8))  // %6
      : "memory", "cc", "v0", "v1", "v2", "v3");
}
1146 
1147 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1148 
1149 #ifdef __cplusplus
1150 }  // extern "C"
1151 }  // namespace libyuv
1152 #endif
1153