• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/scale.h"
13 #include "libyuv/scale_row.h"
14 
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19 
20 // This module is for GCC Neon armv8 64 bit.
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
22 
23 // Read 32x1 throw away even pixels, and write 16x1.
ScaleRowDown2_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)24 void ScaleRowDown2_NEON(const uint8_t* src_ptr,
25                         ptrdiff_t src_stride,
26                         uint8_t* dst,
27                         int dst_width) {
28   (void)src_stride;
29   asm volatile(
30       "1:                                        \n"
31       // load even pixels into v0, odd into v1
32       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
33       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
34       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
35       "st1         {v1.16b}, [%1], #16           \n"  // store odd pixels
36       "b.gt        1b                            \n"
37       : "+r"(src_ptr),   // %0
38         "+r"(dst),       // %1
39         "+r"(dst_width)  // %2
40       :
41       : "v0", "v1"  // Clobber List
42   );
43 }
44 
45 // Read 32x1 average down and write 16x1.
ScaleRowDown2Linear_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)46 void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
47                               ptrdiff_t src_stride,
48                               uint8_t* dst,
49                               int dst_width) {
50   (void)src_stride;
51   asm volatile(
52       "1:                                        \n"
53       // load even pixels into v0, odd into v1
54       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"
55       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
56       "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
57       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
58       "st1         {v0.16b}, [%1], #16           \n"
59       "b.gt        1b                            \n"
60       : "+r"(src_ptr),   // %0
61         "+r"(dst),       // %1
62         "+r"(dst_width)  // %2
63       :
64       : "v0", "v1"  // Clobber List
65   );
66 }
67 
68 // Read 32x2 average down and write 16x1.
ScaleRowDown2Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst,int dst_width)69 void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
70                            ptrdiff_t src_stride,
71                            uint8_t* dst,
72                            int dst_width) {
73   asm volatile(
74       // change the stride to row 2 pointer
75       "add         %1, %1, %0                    \n"
76       "1:                                        \n"
77       "ld1         {v0.16b, v1.16b}, [%0], #32   \n"  // load row 1 and post inc
78       "ld1         {v2.16b, v3.16b}, [%1], #32   \n"  // load row 2 and post inc
79       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
80       "uaddlp      v0.8h, v0.16b                 \n"  // row 1 add adjacent
81       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
82       "uaddlp      v1.8h, v1.16b                 \n"
83       "prfm        pldl1keep, [%1, 448]          \n"
84       "uadalp      v0.8h, v2.16b                 \n"  // += row 2 add adjacent
85       "uadalp      v1.8h, v3.16b                 \n"
86       "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
87       "rshrn2      v0.16b, v1.8h, #2             \n"
88       "st1         {v0.16b}, [%2], #16           \n"
89       "b.gt        1b                            \n"
90       : "+r"(src_ptr),     // %0
91         "+r"(src_stride),  // %1
92         "+r"(dst),         // %2
93         "+r"(dst_width)    // %3
94       :
95       : "v0", "v1", "v2", "v3"  // Clobber List
96   );
97 }
98 
ScaleRowDown4_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)99 void ScaleRowDown4_NEON(const uint8_t* src_ptr,
100                         ptrdiff_t src_stride,
101                         uint8_t* dst_ptr,
102                         int dst_width) {
103   (void)src_stride;
104   asm volatile(
105       "1:                                        \n"
106       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
107       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
108       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
109       "st1         {v2.8b}, [%1], #8             \n"
110       "b.gt        1b                            \n"
111       : "+r"(src_ptr),   // %0
112         "+r"(dst_ptr),   // %1
113         "+r"(dst_width)  // %2
114       :
115       : "v0", "v1", "v2", "v3", "memory", "cc");
116 }
117 
ScaleRowDown4Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)118 void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
119                            ptrdiff_t src_stride,
120                            uint8_t* dst_ptr,
121                            int dst_width) {
122   const uint8_t* src_ptr1 = src_ptr + src_stride;
123   const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
124   const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
125   asm volatile(
126       "1:                                        \n"
127       "ld1         {v0.16b}, [%0], #16           \n"  // load up 16x4
128       "ld1         {v1.16b}, [%2], #16           \n"
129       "ld1         {v2.16b}, [%3], #16           \n"
130       "ld1         {v3.16b}, [%4], #16           \n"
131       "subs        %w5, %w5, #4                  \n"
132       "uaddlp      v0.8h, v0.16b                 \n"
133       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
134       "uadalp      v0.8h, v1.16b                 \n"
135       "prfm        pldl1keep, [%2, 448]          \n"
136       "uadalp      v0.8h, v2.16b                 \n"
137       "prfm        pldl1keep, [%3, 448]          \n"
138       "uadalp      v0.8h, v3.16b                 \n"
139       "prfm        pldl1keep, [%4, 448]          \n"
140       "addp        v0.8h, v0.8h, v0.8h           \n"
141       "rshrn       v0.8b, v0.8h, #4              \n"  // divide by 16 w/rounding
142       "st1         {v0.s}[0], [%1], #4           \n"
143       "b.gt        1b                            \n"
144       : "+r"(src_ptr),   // %0
145         "+r"(dst_ptr),   // %1
146         "+r"(src_ptr1),  // %2
147         "+r"(src_ptr2),  // %3
148         "+r"(src_ptr3),  // %4
149         "+r"(dst_width)  // %5
150       :
151       : "v0", "v1", "v2", "v3", "memory", "cc");
152 }
153 
154 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
155 // to load up the every 4th pixel into a 4 different registers.
156 // Point samples 32 pixels to 24 pixels.
ScaleRowDown34_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)157 void ScaleRowDown34_NEON(const uint8_t* src_ptr,
158                          ptrdiff_t src_stride,
159                          uint8_t* dst_ptr,
160                          int dst_width) {
161   (void)src_stride;
162   asm volatile(
163       "1:                                        \n"
164       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
165       "subs        %w2, %w2, #24                 \n"
166       "orr         v2.16b, v3.16b, v3.16b        \n"  // order v0,v1,v2
167       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
168       "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
169       "b.gt        1b                            \n"
170       : "+r"(src_ptr),   // %0
171         "+r"(dst_ptr),   // %1
172         "+r"(dst_width)  // %2
173       :
174       : "v0", "v1", "v2", "v3", "memory", "cc");
175 }
176 
ScaleRowDown34_0_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)177 void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
178                                ptrdiff_t src_stride,
179                                uint8_t* dst_ptr,
180                                int dst_width) {
181   asm volatile(
182       "movi        v20.8b, #3                    \n"
183       "add         %3, %3, %0                    \n"
184       "1:                                        \n"
185       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
186       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
187       "subs        %w2, %w2, #24                 \n"
188 
189       // filter src line 0 with src line 1
190       // expand chars to shorts to allow for room
191       // when adding lines together
192       "ushll       v16.8h, v4.8b, #0             \n"
193       "ushll       v17.8h, v5.8b, #0             \n"
194       "ushll       v18.8h, v6.8b, #0             \n"
195       "ushll       v19.8h, v7.8b, #0             \n"
196 
197       // 3 * line_0 + line_1
198       "umlal       v16.8h, v0.8b, v20.8b         \n"
199       "umlal       v17.8h, v1.8b, v20.8b         \n"
200       "umlal       v18.8h, v2.8b, v20.8b         \n"
201       "umlal       v19.8h, v3.8b, v20.8b         \n"
202       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
203 
204       // (3 * line_0 + line_1 + 2) >> 2
205       "uqrshrn     v0.8b, v16.8h, #2             \n"
206       "uqrshrn     v1.8b, v17.8h, #2             \n"
207       "uqrshrn     v2.8b, v18.8h, #2             \n"
208       "uqrshrn     v3.8b, v19.8h, #2             \n"
209       "prfm        pldl1keep, [%3, 448]          \n"
210 
211       // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
212       "ushll       v16.8h, v1.8b, #0             \n"
213       "umlal       v16.8h, v0.8b, v20.8b         \n"
214       "uqrshrn     v0.8b, v16.8h, #2             \n"
215 
216       // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
217       "urhadd      v1.8b, v1.8b, v2.8b           \n"
218 
219       // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
220       "ushll       v16.8h, v2.8b, #0             \n"
221       "umlal       v16.8h, v3.8b, v20.8b         \n"
222       "uqrshrn     v2.8b, v16.8h, #2             \n"
223 
224       "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
225 
226       "b.gt        1b                            \n"
227       : "+r"(src_ptr),    // %0
228         "+r"(dst_ptr),    // %1
229         "+r"(dst_width),  // %2
230         "+r"(src_stride)  // %3
231       :
232       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
233         "v19", "v20", "memory", "cc");
234 }
235 
ScaleRowDown34_1_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)236 void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
237                                ptrdiff_t src_stride,
238                                uint8_t* dst_ptr,
239                                int dst_width) {
240   asm volatile(
241       "movi        v20.8b, #3                    \n"
242       "add         %3, %3, %0                    \n"
243       "1:                                        \n"
244       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
245       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
246       "subs        %w2, %w2, #24                 \n"
247       // average src line 0 with src line 1
248       "urhadd      v0.8b, v0.8b, v4.8b           \n"
249       "urhadd      v1.8b, v1.8b, v5.8b           \n"
250       "urhadd      v2.8b, v2.8b, v6.8b           \n"
251       "urhadd      v3.8b, v3.8b, v7.8b           \n"
252       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
253 
254       // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
255       "ushll       v4.8h, v1.8b, #0              \n"
256       "umlal       v4.8h, v0.8b, v20.8b          \n"
257       "uqrshrn     v0.8b, v4.8h, #2              \n"
258       "prfm        pldl1keep, [%3, 448]          \n"
259 
260       // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
261       "urhadd      v1.8b, v1.8b, v2.8b           \n"
262 
263       // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
264       "ushll       v4.8h, v2.8b, #0              \n"
265       "umlal       v4.8h, v3.8b, v20.8b          \n"
266       "uqrshrn     v2.8b, v4.8h, #2              \n"
267 
268       "st3         {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
269       "b.gt        1b                            \n"
270       : "+r"(src_ptr),    // %0
271         "+r"(dst_ptr),    // %1
272         "+r"(dst_width),  // %2
273         "+r"(src_stride)  // %3
274       :
275       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
276 }
277 
278 static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
279                               22, 24, 27, 30, 0,  0,  0,  0};
280 static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
281                                 34, 6,  22, 35, 0,  0,  0, 0};
282 static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
283                                    65536 / 12, 65536 / 12, 65536 / 12,
284                                    65536 / 12, 65536 / 12};
285 static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
286                                    65536 / 18, 65536 / 18, 65536 / 18,
287                                    65536 / 18, 65536 / 18};
288 
289 // 32 -> 12
ScaleRowDown38_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)290 void ScaleRowDown38_NEON(const uint8_t* src_ptr,
291                          ptrdiff_t src_stride,
292                          uint8_t* dst_ptr,
293                          int dst_width) {
294   (void)src_stride;
295   asm volatile(
296       "ld1         {v3.16b}, [%3]                \n"
297       "1:                                        \n"
298       "ld1         {v0.16b,v1.16b}, [%0], #32    \n"
299       "subs        %w2, %w2, #12                 \n"
300       "tbl         v2.16b, {v0.16b,v1.16b}, v3.16b \n"
301       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
302       "st1         {v2.8b}, [%1], #8             \n"
303       "st1         {v2.s}[2], [%1], #4           \n"
304       "b.gt        1b                            \n"
305       : "+r"(src_ptr),   // %0
306         "+r"(dst_ptr),   // %1
307         "+r"(dst_width)  // %2
308       : "r"(&kShuf38)    // %3
309       : "v0", "v1", "v2", "v3", "memory", "cc");
310 }
311 
312 // 32x3 -> 12x1
ScaleRowDown38_3_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)313 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
314                                       ptrdiff_t src_stride,
315                                       uint8_t* dst_ptr,
316                                       int dst_width) {
317   const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
318   ptrdiff_t tmp_src_stride = src_stride;
319 
320   asm volatile(
321       "ld1         {v29.8h}, [%5]                \n"
322       "ld1         {v30.16b}, [%6]               \n"
323       "ld1         {v31.8h}, [%7]                \n"
324       "add         %2, %2, %0                    \n"
325       "1:                                        \n"
326 
327       // 00 40 01 41 02 42 03 43
328       // 10 50 11 51 12 52 13 53
329       // 20 60 21 61 22 62 23 63
330       // 30 70 31 71 32 72 33 73
331       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
332       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
333       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
334       "subs        %w4, %w4, #12                 \n"
335 
336       // Shuffle the input data around to get align the data
337       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
338       // 00 10 01 11 02 12 03 13
339       // 40 50 41 51 42 52 43 53
340       "trn1        v20.8b, v0.8b, v1.8b          \n"
341       "trn2        v21.8b, v0.8b, v1.8b          \n"
342       "trn1        v22.8b, v4.8b, v5.8b          \n"
343       "trn2        v23.8b, v4.8b, v5.8b          \n"
344       "trn1        v24.8b, v16.8b, v17.8b        \n"
345       "trn2        v25.8b, v16.8b, v17.8b        \n"
346 
347       // 20 30 21 31 22 32 23 33
348       // 60 70 61 71 62 72 63 73
349       "trn1        v0.8b, v2.8b, v3.8b           \n"
350       "trn2        v1.8b, v2.8b, v3.8b           \n"
351       "trn1        v4.8b, v6.8b, v7.8b           \n"
352       "trn2        v5.8b, v6.8b, v7.8b           \n"
353       "trn1        v16.8b, v18.8b, v19.8b        \n"
354       "trn2        v17.8b, v18.8b, v19.8b        \n"
355 
356       // 00+10 01+11 02+12 03+13
357       // 40+50 41+51 42+52 43+53
358       "uaddlp      v20.4h, v20.8b                \n"
359       "uaddlp      v21.4h, v21.8b                \n"
360       "uaddlp      v22.4h, v22.8b                \n"
361       "uaddlp      v23.4h, v23.8b                \n"
362       "uaddlp      v24.4h, v24.8b                \n"
363       "uaddlp      v25.4h, v25.8b                \n"
364 
365       // 60+70 61+71 62+72 63+73
366       "uaddlp      v1.4h, v1.8b                  \n"
367       "uaddlp      v5.4h, v5.8b                  \n"
368       "uaddlp      v17.4h, v17.8b                \n"
369 
370       // combine source lines
371       "add         v20.4h, v20.4h, v22.4h        \n"
372       "add         v21.4h, v21.4h, v23.4h        \n"
373       "add         v20.4h, v20.4h, v24.4h        \n"
374       "add         v21.4h, v21.4h, v25.4h        \n"
375       "add         v2.4h, v1.4h, v5.4h           \n"
376       "add         v2.4h, v2.4h, v17.4h          \n"
377 
378       // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
379       //             + s[6 + st * 1] + s[7 + st * 1]
380       //             + s[6 + st * 2] + s[7 + st * 2]) / 6
381       "sqrdmulh    v2.8h, v2.8h, v29.8h          \n"
382       "xtn         v2.8b,  v2.8h                 \n"
383 
384       // Shuffle 2,3 reg around so that 2 can be added to the
385       //  0,1 reg and 3 can be added to the 4,5 reg. This
386       //  requires expanding from u8 to u16 as the 0,1 and 4,5
387       //  registers are already expanded. Then do transposes
388       //  to get aligned.
389       // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
390       "ushll       v16.8h, v16.8b, #0            \n"
391       "uaddl       v0.8h, v0.8b, v4.8b           \n"
392 
393       // combine source lines
394       "add         v0.8h, v0.8h, v16.8h          \n"
395 
396       // xx 20 xx 21 xx 22 xx 23
397       // xx 30 xx 31 xx 32 xx 33
398       "trn1        v1.8h, v0.8h, v0.8h           \n"
399       "trn2        v4.8h, v0.8h, v0.8h           \n"
400       "xtn         v0.4h, v1.4s                  \n"
401       "xtn         v4.4h, v4.4s                  \n"
402       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
403 
404       // 0+1+2, 3+4+5
405       "add         v20.8h, v20.8h, v0.8h         \n"
406       "add         v21.8h, v21.8h, v4.8h         \n"
407       "prfm        pldl1keep, [%2, 448]          \n"
408 
409       // Need to divide, but can't downshift as the the value
410       //  isn't a power of 2. So multiply by 65536 / n
411       //  and take the upper 16 bits.
412       "sqrdmulh    v0.8h, v20.8h, v31.8h         \n"
413       "sqrdmulh    v1.8h, v21.8h, v31.8h         \n"
414       "prfm        pldl1keep, [%3, 448]          \n"
415 
416       // Align for table lookup, vtbl requires registers to be adjacent
417       "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
418 
419       "st1         {v3.8b}, [%1], #8             \n"
420       "st1         {v3.s}[2], [%1], #4           \n"
421       "b.gt        1b                            \n"
422       : "+r"(src_ptr),         // %0
423         "+r"(dst_ptr),         // %1
424         "+r"(tmp_src_stride),  // %2
425         "+r"(src_ptr1),        // %3
426         "+r"(dst_width)        // %4
427       : "r"(&kMult38_Div6),    // %5
428         "r"(&kShuf38_2),       // %6
429         "r"(&kMult38_Div9)     // %7
430       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
431         "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
432         "memory", "cc");
433 }
434 
435 // 32x2 -> 12x1
ScaleRowDown38_2_Box_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,int dst_width)436 void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
437                                ptrdiff_t src_stride,
438                                uint8_t* dst_ptr,
439                                int dst_width) {
440   // TODO(fbarchard): use src_stride directly for clang 3.5+.
441   ptrdiff_t tmp_src_stride = src_stride;
442   asm volatile(
443       "ld1         {v30.8h}, [%4]                \n"
444       "ld1         {v31.16b}, [%5]               \n"
445       "add         %2, %2, %0                    \n"
446       "1:                                        \n"
447 
448       // 00 40 01 41 02 42 03 43
449       // 10 50 11 51 12 52 13 53
450       // 20 60 21 61 22 62 23 63
451       // 30 70 31 71 32 72 33 73
452       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
453       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
454       "subs        %w3, %w3, #12                 \n"
455 
456       // Shuffle the input data around to get align the data
457       //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
458       // 00 10 01 11 02 12 03 13
459       // 40 50 41 51 42 52 43 53
460       "trn1        v16.8b, v0.8b, v1.8b          \n"
461       "trn2        v17.8b, v0.8b, v1.8b          \n"
462       "trn1        v18.8b, v4.8b, v5.8b          \n"
463       "trn2        v19.8b, v4.8b, v5.8b          \n"
464 
465       // 20 30 21 31 22 32 23 33
466       // 60 70 61 71 62 72 63 73
467       "trn1        v0.8b, v2.8b, v3.8b           \n"
468       "trn2        v1.8b, v2.8b, v3.8b           \n"
469       "trn1        v4.8b, v6.8b, v7.8b           \n"
470       "trn2        v5.8b, v6.8b, v7.8b           \n"
471 
472       // 00+10 01+11 02+12 03+13
473       // 40+50 41+51 42+52 43+53
474       "uaddlp      v16.4h, v16.8b                \n"
475       "uaddlp      v17.4h, v17.8b                \n"
476       "uaddlp      v18.4h, v18.8b                \n"
477       "uaddlp      v19.4h, v19.8b                \n"
478 
479       // 60+70 61+71 62+72 63+73
480       "uaddlp      v1.4h, v1.8b                  \n"
481       "uaddlp      v5.4h, v5.8b                  \n"
482 
483       // combine source lines
484       "add         v16.4h, v16.4h, v18.4h        \n"
485       "add         v17.4h, v17.4h, v19.4h        \n"
486       "add         v2.4h, v1.4h, v5.4h           \n"
487 
488       // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
489       "uqrshrn     v2.8b, v2.8h, #2              \n"
490 
491       // Shuffle 2,3 reg around so that 2 can be added to the
492       //  0,1 reg and 3 can be added to the 4,5 reg. This
493       //  requires expanding from u8 to u16 as the 0,1 and 4,5
494       //  registers are already expanded. Then do transposes
495       //  to get aligned.
496       // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
497 
498       // combine source lines
499       "uaddl       v0.8h, v0.8b, v4.8b           \n"
500 
501       // xx 20 xx 21 xx 22 xx 23
502       // xx 30 xx 31 xx 32 xx 33
503       "trn1        v1.8h, v0.8h, v0.8h           \n"
504       "trn2        v4.8h, v0.8h, v0.8h           \n"
505       "xtn         v0.4h, v1.4s                  \n"
506       "xtn         v4.4h, v4.4s                  \n"
507       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
508 
509       // 0+1+2, 3+4+5
510       "add         v16.8h, v16.8h, v0.8h         \n"
511       "add         v17.8h, v17.8h, v4.8h         \n"
512       "prfm        pldl1keep, [%2, 448]          \n"
513 
514       // Need to divide, but can't downshift as the the value
515       //  isn't a power of 2. So multiply by 65536 / n
516       //  and take the upper 16 bits.
517       "sqrdmulh    v0.8h, v16.8h, v30.8h         \n"
518       "sqrdmulh    v1.8h, v17.8h, v30.8h         \n"
519 
520       // Align for table lookup, vtbl requires registers to
521       //  be adjacent
522 
523       "tbl         v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
524 
525       "st1         {v3.8b}, [%1], #8             \n"
526       "st1         {v3.s}[2], [%1], #4           \n"
527       "b.gt        1b                            \n"
528       : "+r"(src_ptr),         // %0
529         "+r"(dst_ptr),         // %1
530         "+r"(tmp_src_stride),  // %2
531         "+r"(dst_width)        // %3
532       : "r"(&kMult38_Div6),    // %4
533         "r"(&kShuf38_2)        // %5
534       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
535         "v19", "v30", "v31", "memory", "cc");
536 }
537 
ScaleRowUp2_Linear_NEON(const uint8_t * src_ptr,uint8_t * dst_ptr,int dst_width)538 void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
539                              uint8_t* dst_ptr,
540                              int dst_width) {
541   const uint8_t* src_temp = src_ptr + 1;
542   asm volatile(
543       "movi        v31.8b, #3                    \n"
544 
545       "1:                                        \n"
546       "ldr         d0, [%0], #8                  \n"  // 01234567
547       "ldr         d1, [%1], #8                  \n"  // 12345678
548       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
549 
550       "ushll       v2.8h, v0.8b, #0              \n"  // 01234567 (16b)
551       "ushll       v3.8h, v1.8b, #0              \n"  // 12345678 (16b)
552 
553       "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (odd)
554       "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (even)
555 
556       "rshrn       v2.8b, v2.8h, #2              \n"  // 3/4*near+1/4*far (odd)
557       "rshrn       v1.8b, v3.8h, #2              \n"  // 3/4*near+1/4*far (even)
558 
559       "st2         {v1.8b, v2.8b}, [%2], #16     \n"  // store
560       "subs        %w3, %w3, #16                 \n"  // 8 sample -> 16 sample
561       "b.gt        1b                            \n"
562       : "+r"(src_ptr),   // %0
563         "+r"(src_temp),  // %1
564         "+r"(dst_ptr),   // %2
565         "+r"(dst_width)  // %3
566       :
567       : "memory", "cc", "v0", "v1", "v2", "v3", "v31"  // Clobber List
568   );
569 }
570 
ScaleRowUp2_Bilinear_NEON(const uint8_t * src_ptr,ptrdiff_t src_stride,uint8_t * dst_ptr,ptrdiff_t dst_stride,int dst_width)571 void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
572                                ptrdiff_t src_stride,
573                                uint8_t* dst_ptr,
574                                ptrdiff_t dst_stride,
575                                int dst_width) {
576   const uint8_t* src_ptr1 = src_ptr + src_stride;
577   uint8_t* dst_ptr1 = dst_ptr + dst_stride;
578   const uint8_t* src_temp = src_ptr + 1;
579   const uint8_t* src_temp1 = src_ptr1 + 1;
580 
581   asm volatile(
582       "movi        v31.8b, #3                    \n"
583       "movi        v30.8h, #3                    \n"
584 
585       "1:                                        \n"
586       "ldr         d0, [%0], #8                  \n"  // 01234567
587       "ldr         d1, [%2], #8                  \n"  // 12345678
588       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
589 
590       "ushll       v2.8h, v0.8b, #0              \n"  // 01234567 (16b)
591       "ushll       v3.8h, v1.8b, #0              \n"  // 12345678 (16b)
592       "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (1, odd)
593       "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (1, even)
594 
595       "ldr         d0, [%1], #8                  \n"
596       "ldr         d1, [%3], #8                  \n"
597       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
598 
599       "ushll       v4.8h, v0.8b, #0              \n"  // 01234567 (16b)
600       "ushll       v5.8h, v1.8b, #0              \n"  // 12345678 (16b)
601       "umlal       v4.8h, v1.8b, v31.8b          \n"  // 3*near+far (2, odd)
602       "umlal       v5.8h, v0.8b, v31.8b          \n"  // 3*near+far (2, even)
603 
604       "mov         v0.16b, v4.16b                \n"
605       "mov         v1.16b, v5.16b                \n"
606       "mla         v4.8h, v2.8h, v30.8h          \n"  // 9 3 3 1 (1, odd)
607       "mla         v5.8h, v3.8h, v30.8h          \n"  // 9 3 3 1 (1, even)
608       "mla         v2.8h, v0.8h, v30.8h          \n"  // 9 3 3 1 (2, odd)
609       "mla         v3.8h, v1.8h, v30.8h          \n"  // 9 3 3 1 (2, even)
610 
611       "rshrn       v2.8b, v2.8h, #4              \n"  // 2, odd
612       "rshrn       v1.8b, v3.8h, #4              \n"  // 2, even
613       "rshrn       v4.8b, v4.8h, #4              \n"  // 1, odd
614       "rshrn       v3.8b, v5.8h, #4              \n"  // 1, even
615 
616       "st2         {v1.8b, v2.8b}, [%5], #16     \n"  // store 1
617       "st2         {v3.8b, v4.8b}, [%4], #16     \n"  // store 2
618       "subs        %w6, %w6, #16                 \n"  // 8 sample -> 16 sample
619       "b.gt        1b                            \n"
620       : "+r"(src_ptr),    // %0
621         "+r"(src_ptr1),   // %1
622         "+r"(src_temp),   // %2
623         "+r"(src_temp1),  // %3
624         "+r"(dst_ptr),    // %4
625         "+r"(dst_ptr1),   // %5
626         "+r"(dst_width)   // %6
627       :
628       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
629         "v31"  // Clobber List
630   );
631 }
632 
ScaleRowUp2_Linear_12_NEON(const uint16_t * src_ptr,uint16_t * dst_ptr,int dst_width)633 void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
634                                 uint16_t* dst_ptr,
635                                 int dst_width) {
636   const uint16_t* src_temp = src_ptr + 1;
637   asm volatile(
638       "movi        v31.8h, #3                    \n"
639 
640       "1:                                        \n"
641       "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
642       "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
643       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
644 
645       "mov         v2.16b, v0.16b                \n"
646       "mla         v0.8h, v1.8h, v31.8h          \n"  // 3*near+far (odd)
647       "mla         v1.8h, v2.8h, v31.8h          \n"  // 3*near+far (even)
648 
649       "urshr       v2.8h, v0.8h, #2              \n"  // 3/4*near+1/4*far (odd)
650       "urshr       v1.8h, v1.8h, #2              \n"  // 3/4*near+1/4*far (even)
651 
652       "st2         {v1.8h, v2.8h}, [%2], #32     \n"  // store
653       "subs        %w3, %w3, #16                 \n"  // 8 sample -> 16 sample
654       "b.gt        1b                            \n"
655       : "+r"(src_ptr),   // %0
656         "+r"(src_temp),  // %1
657         "+r"(dst_ptr),   // %2
658         "+r"(dst_width)  // %3
659       :
660       : "memory", "cc", "v0", "v1", "v2", "v31"  // Clobber List
661   );
662 }
663 
// 2x bilinear upsample (both directions) for 12-bit samples in uint16_t.
// Reads two adjacent source rows and writes two destination rows using the
// classic 9:3:3:1 kernel, built as two 3:1 horizontal passes combined by a
// 3:1 vertical pass, then rounded by urshr #4 (divide by 16 with rounding).
// 12-bit inputs keep 9*4095 + 3*4095 + 3*4095 + 4095 = 16*4095 within
// 16 bits, so no widening is required.
// Consumes 8 source samples per row and emits 16 outputs per row each
// iteration; dst_width is expected to be a multiple of 16 — TODO confirm.
void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 1;           // row 0, one sample ahead
  const uint16_t* src_temp1 = src_ptr1 + 1;         // row 1, one sample ahead

  asm volatile(
      "movi        v31.8h, #3                    \n"

      "1:                                        \n"
      "ld1         {v2.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v3.8h}, [%2], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      "mov         v0.16b, v2.16b                \n"
      "mla         v2.8h, v3.8h, v31.8h          \n"  // 3*near+far (odd)
      "mla         v3.8h, v0.8h, v31.8h          \n"  // 3*near+far (even)

      "ld1         {v4.8h}, [%1], #16            \n"  // 01234567 (16b)
      "ld1         {v5.8h}, [%3], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead

      "mov         v0.16b, v4.16b                \n"
      "mla         v4.8h, v5.8h, v31.8h          \n"  // 3*near+far (odd)
      "mla         v5.8h, v0.8h, v31.8h          \n"  // 3*near+far (even)

      "mov         v0.16b, v4.16b                \n"
      "mov         v1.16b, v5.16b                \n"
      "mla         v4.8h, v2.8h, v31.8h          \n"  // 9 3 3 1 (1, odd)
      "mla         v5.8h, v3.8h, v31.8h          \n"  // 9 3 3 1 (1, even)
      "mla         v2.8h, v0.8h, v31.8h          \n"  // 9 3 3 1 (2, odd)
      "mla         v3.8h, v1.8h, v31.8h          \n"  // 9 3 3 1 (2, even)

      "urshr       v2.8h, v2.8h, #4              \n"  // 2, odd
      "urshr       v1.8h, v3.8h, #4              \n"  // 2, even
      "urshr       v4.8h, v4.8h, #4              \n"  // 1, odd
      "urshr       v3.8h, v5.8h, #4              \n"  // 1, even

      "st2         {v3.8h, v4.8h}, [%4], #32     \n"  // store 1
      "st2         {v1.8h, v2.8h}, [%5], #32     \n"  // store 2

      "subs        %w6, %w6, #16                 \n"  // 8 sample -> 16 sample
      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(src_temp),   // %2
        "+r"(src_temp1),  // %3
        "+r"(dst_ptr),    // %4
        "+r"(dst_ptr1),   // %5
        "+r"(dst_width)   // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
        "v31"  // Clobber List
  );
}
723 
ScaleRowUp2_Linear_16_NEON(const uint16_t * src_ptr,uint16_t * dst_ptr,int dst_width)724 void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
725                                 uint16_t* dst_ptr,
726                                 int dst_width) {
727   const uint16_t* src_temp = src_ptr + 1;
728   asm volatile(
729       "movi        v31.8h, #3                    \n"
730 
731       "1:                                        \n"
732       "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
733       "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
734       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
735 
736       "ushll       v2.4s, v0.4h, #0              \n"  // 0123 (32b)
737       "ushll2      v3.4s, v0.8h, #0              \n"  // 4567 (32b)
738       "ushll       v4.4s, v1.4h, #0              \n"  // 1234 (32b)
739       "ushll2      v5.4s, v1.8h, #0              \n"  // 5678 (32b)
740 
741       "umlal       v2.4s, v1.4h, v31.4h          \n"  // 3*near+far (1, odd)
742       "umlal2      v3.4s, v1.8h, v31.8h          \n"  // 3*near+far (2, odd)
743       "umlal       v4.4s, v0.4h, v31.4h          \n"  // 3*near+far (1, even)
744       "umlal2      v5.4s, v0.8h, v31.8h          \n"  // 3*near+far (2, even)
745 
746       "rshrn       v0.4h, v4.4s, #2              \n"  // 3/4*near+1/4*far
747       "rshrn2      v0.8h, v5.4s, #2              \n"  // 3/4*near+1/4*far (even)
748       "rshrn       v1.4h, v2.4s, #2              \n"  // 3/4*near+1/4*far
749       "rshrn2      v1.8h, v3.4s, #2              \n"  // 3/4*near+1/4*far (odd)
750 
751       "st2         {v0.8h, v1.8h}, [%2], #32     \n"  // store
752       "subs        %w3, %w3, #16                 \n"  // 8 sample -> 16 sample
753       "b.gt        1b                            \n"
754       : "+r"(src_ptr),   // %0
755         "+r"(src_temp),  // %1
756         "+r"(dst_ptr),   // %2
757         "+r"(dst_width)  // %3
758       :
759       : "memory", "cc", "v0", "v1", "v2", "v31"  // Clobber List
760   );
761 }
762 
// 2x bilinear upsample (both directions) for full-range 16-bit samples.
// Two source rows -> two destination rows with the 9:3:3:1 kernel. Because
// full 16-bit inputs would overflow 16-bit accumulators, the horizontal
// 3:1 pass widens to 32 bits (ushll/umlal), the vertical 3:1 pass runs in
// 32 bits (mla .4s), and rshrn #4 narrows back with rounding.
// Consumes 4 source samples per row and emits 8 outputs per row each
// iteration; dst_width is expected to be a multiple of 8 — TODO confirm.
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint16_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 1;           // row 0, one sample ahead
  const uint16_t* src_temp1 = src_ptr1 + 1;         // row 1, one sample ahead

  asm volatile(
      "movi        v31.4h, #3                    \n"  // 16-bit coefficient
      "movi        v30.4s, #3                    \n"  // 32-bit coefficient

      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"  // 0123 (16b)
      "ldr         d1, [%2], #8                  \n"  // 1234 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "ushll       v2.4s, v0.4h, #0              \n"  // 0123 (32b)
      "ushll       v3.4s, v1.4h, #0              \n"  // 1234 (32b)
      "umlal       v2.4s, v1.4h, v31.4h          \n"  // 3*near+far (1, odd)
      "umlal       v3.4s, v0.4h, v31.4h          \n"  // 3*near+far (1, even)

      "ldr         d0, [%1], #8                  \n"  // 0123 (16b)
      "ldr         d1, [%3], #8                  \n"  // 1234 (16b)
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
      "ushll       v4.4s, v0.4h, #0              \n"  // 0123 (32b)
      "ushll       v5.4s, v1.4h, #0              \n"  // 1234 (32b)
      "umlal       v4.4s, v1.4h, v31.4h          \n"  // 3*near+far (2, odd)
      "umlal       v5.4s, v0.4h, v31.4h          \n"  // 3*near+far (2, even)

      "mov         v0.16b, v4.16b                \n"
      "mov         v1.16b, v5.16b                \n"
      "mla         v4.4s, v2.4s, v30.4s          \n"  // 9 3 3 1 (1, odd)
      "mla         v5.4s, v3.4s, v30.4s          \n"  // 9 3 3 1 (1, even)
      "mla         v2.4s, v0.4s, v30.4s          \n"  // 9 3 3 1 (2, odd)
      "mla         v3.4s, v1.4s, v30.4s          \n"  // 9 3 3 1 (2, even)

      "rshrn       v1.4h, v4.4s, #4              \n"  // 1, odd
      "rshrn       v0.4h, v5.4s, #4              \n"  // 1, even
      "rshrn       v5.4h, v2.4s, #4              \n"  // 2, odd
      "rshrn       v4.4h, v3.4s, #4              \n"  // 2, even

      "st2         {v0.4h, v1.4h}, [%4], #16     \n"  // store 1
      "st2         {v4.4h, v5.4h}, [%5], #16     \n"  // store 2

      "subs        %w6, %w6, #8                  \n"  // 4 sample -> 8 sample
      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(src_temp),   // %2
        "+r"(src_temp1),  // %3
        "+r"(dst_ptr),    // %4
        "+r"(dst_ptr1),   // %5
        "+r"(dst_width)   // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
        "v31"  // Clobber List
  );
}
823 
// 2x horizontal upsample for interleaved 8-bit UV pairs.
// Same 3:1 / 1:3 filter as the planar version, but samples are UV pairs so
// the "far" neighbor is 2 bytes ahead (src_ptr + 2). The blend widens to
// 16 bits (umlal) and narrows back with rounding (rshrn #2).
// Each iteration consumes 4 source UV pairs and emits 8 — the st2 of two
// .4h lanes re-interleaves even/odd pairs on store.
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               int dst_width) {
  const uint8_t* src_temp = src_ptr + 2;  // next UV pair ("far" neighbor)
  asm volatile(
      "movi        v31.8b, #3                    \n"  // filter coefficient

      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"  // 00112233 (1u1v)
      "ldr         d1, [%1], #8                  \n"  // 11223344 (1u1v)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      "ushll       v2.8h, v0.8b, #0              \n"  // 00112233 (1u1v, 16b)
      "ushll       v3.8h, v1.8b, #0              \n"  // 11223344 (1u1v, 16b)

      "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (odd)
      "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (even)

      "rshrn       v2.8b, v2.8h, #2              \n"  // 3/4*near+1/4*far (odd)
      "rshrn       v1.8b, v3.8h, #2              \n"  // 3/4*near+1/4*far (even)

      "st2         {v1.4h, v2.4h}, [%2], #16     \n"  // store
      "subs        %w3, %w3, #8                  \n"  // 4 uv -> 8 uv
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(src_temp),  // %1
        "+r"(dst_ptr),   // %2
        "+r"(dst_width)  // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v31"  // Clobber List
  );
}
856 
// 2x bilinear upsample (both directions) for interleaved 8-bit UV pairs.
// Two source rows -> two destination rows using the 9:3:3:1 kernel:
// horizontal 3:1 passes widen to 16 bits (umlal), the vertical 3:1 pass
// runs in 16 bits (mla), and rshrn #4 narrows back to bytes with rounding.
// The "far" horizontal neighbor is 2 bytes ahead since samples are UV pairs.
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint8_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint8_t* src_temp = src_ptr + 2;           // row 0, next UV pair
  const uint8_t* src_temp1 = src_ptr1 + 2;         // row 1, next UV pair

  asm volatile(
      "movi        v31.8b, #3                    \n"  // 8-bit coefficient
      "movi        v30.8h, #3                    \n"  // 16-bit coefficient

      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"
      "ldr         d1, [%2], #8                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      "ushll       v2.8h, v0.8b, #0              \n"
      "ushll       v3.8h, v1.8b, #0              \n"
      "umlal       v2.8h, v1.8b, v31.8b          \n"  // 3*near+far (1, odd)
      "umlal       v3.8h, v0.8b, v31.8b          \n"  // 3*near+far (1, even)

      "ldr         d0, [%1], #8                  \n"
      "ldr         d1, [%3], #8                  \n"
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead

      "ushll       v4.8h, v0.8b, #0              \n"
      "ushll       v5.8h, v1.8b, #0              \n"
      "umlal       v4.8h, v1.8b, v31.8b          \n"  // 3*near+far (2, odd)
      "umlal       v5.8h, v0.8b, v31.8b          \n"  // 3*near+far (2, even)

      "mov         v0.16b, v4.16b                \n"
      "mov         v1.16b, v5.16b                \n"
      "mla         v4.8h, v2.8h, v30.8h          \n"  // 9 3 3 1 (1, odd)
      "mla         v5.8h, v3.8h, v30.8h          \n"  // 9 3 3 1 (1, even)
      "mla         v2.8h, v0.8h, v30.8h          \n"  // 9 3 3 1 (2, odd)
      "mla         v3.8h, v1.8h, v30.8h          \n"  // 9 3 3 1 (2, even)

      "rshrn       v2.8b, v2.8h, #4              \n"  // 2, odd
      "rshrn       v1.8b, v3.8h, #4              \n"  // 2, even
      "rshrn       v4.8b, v4.8h, #4              \n"  // 1, odd
      "rshrn       v3.8b, v5.8h, #4              \n"  // 1, even

      "st2         {v1.4h, v2.4h}, [%5], #16     \n"  // store 2
      "st2         {v3.4h, v4.4h}, [%4], #16     \n"  // store 1
      "subs        %w6, %w6, #8                  \n"  // 4 uv -> 8 uv
      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(src_temp),   // %2
        "+r"(src_temp1),  // %3
        "+r"(dst_ptr),    // %4
        "+r"(dst_ptr1),   // %5
        "+r"(dst_width)   // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
        "v31"  // Clobber List
  );
}
918 
// 2x horizontal upsample for interleaved 16-bit UV pairs.
// Same 3:1 / 1:3 filter as the 8-bit UV version, widened to 32-bit
// accumulation (ushll/umlal) because full 16-bit inputs would overflow
// 16-bit sums; rshrn #2 narrows back with rounding. The "far" neighbor is
// one UV pair (2 uint16_t elements) ahead. Two st2 stores per iteration
// re-interleave even/odd results.
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;  // next UV pair ("far" neighbor)
  asm volatile(
      "movi        v31.8h, #3                    \n"  // filter coefficient

      "1:                                        \n"
      "ld1         {v0.8h}, [%0], #16            \n"  // 01234567 (16b)
      "ld1         {v1.8h}, [%1], #16            \n"  // 12345678 (16b)
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead

      "ushll       v2.4s, v0.4h, #0              \n"  // 0011 (1u1v, 32b)
      "ushll       v3.4s, v1.4h, #0              \n"  // 1122 (1u1v, 32b)
      "ushll2      v4.4s, v0.8h, #0              \n"  // 2233 (1u1v, 32b)
      "ushll2      v5.4s, v1.8h, #0              \n"  // 3344 (1u1v, 32b)

      "umlal       v2.4s, v1.4h, v31.4h          \n"  // 3*near+far (odd)
      "umlal       v3.4s, v0.4h, v31.4h          \n"  // 3*near+far (even)
      "umlal2      v4.4s, v1.8h, v31.8h          \n"  // 3*near+far (odd)
      "umlal2      v5.4s, v0.8h, v31.8h          \n"  // 3*near+far (even)

      "rshrn       v2.4h, v2.4s, #2              \n"  // 3/4*near+1/4*far (odd)
      "rshrn       v1.4h, v3.4s, #2              \n"  // 3/4*near+1/4*far (even)
      "rshrn       v4.4h, v4.4s, #2              \n"  // 3/4*near+1/4*far (odd)
      "rshrn       v3.4h, v5.4s, #2              \n"  // 3/4*near+1/4*far (even)

      "st2         {v1.2s, v2.2s}, [%2], #16     \n"  // store
      "st2         {v3.2s, v4.2s}, [%2], #16     \n"  // store
      "subs        %w3, %w3, #8                  \n"  // 4 uv -> 8 uv
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(src_temp),  // %1
        "+r"(dst_ptr),   // %2
        "+r"(dst_width)  // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
        "v31"  // Clobber List
  );
}
959 
// 2x bilinear upsample (both directions) for interleaved 16-bit UV pairs.
// Two source rows -> two destination rows with the 9:3:3:1 kernel. All
// accumulation is widened to 32 bits (ushll/umlal for the horizontal 3:1
// pass, mla .4s for the vertical 3:1 pass) to avoid overflow with full
// 16-bit inputs; rshrn #4 narrows back with rounding.
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;  // second source row
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;        // second destination row
  const uint16_t* src_temp = src_ptr + 2;           // row 0, next UV pair
  const uint16_t* src_temp1 = src_ptr1 + 2;         // row 1, next UV pair

  asm volatile(
      "movi        v31.4h, #3                    \n"  // 16-bit coefficient
      "movi        v30.4s, #3                    \n"  // 32-bit coefficient

      "1:                                        \n"
      "ldr         d0, [%0], #8                  \n"
      "ldr         d1, [%2], #8                  \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "ushll       v2.4s, v0.4h, #0              \n"  // 0011 (1u1v, 32b)
      "ushll       v3.4s, v1.4h, #0              \n"  // 1122 (1u1v, 32b)
      "umlal       v2.4s, v1.4h, v31.4h          \n"  // 3*near+far (1, odd)
      "umlal       v3.4s, v0.4h, v31.4h          \n"  // 3*near+far (1, even)

      "ldr         d0, [%1], #8                  \n"
      "ldr         d1, [%3], #8                  \n"
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
      "ushll       v4.4s, v0.4h, #0              \n"  // 0011 (1u1v, 32b)
      "ushll       v5.4s, v1.4h, #0              \n"  // 1122 (1u1v, 32b)
      "umlal       v4.4s, v1.4h, v31.4h          \n"  // 3*near+far (2, odd)
      "umlal       v5.4s, v0.4h, v31.4h          \n"  // 3*near+far (2, even)

      "mov         v0.16b, v4.16b                \n"
      "mov         v1.16b, v5.16b                \n"
      "mla         v4.4s, v2.4s, v30.4s          \n"  // 9 3 3 1 (1, odd)
      "mla         v5.4s, v3.4s, v30.4s          \n"  // 9 3 3 1 (1, even)
      "mla         v2.4s, v0.4s, v30.4s          \n"  // 9 3 3 1 (2, odd)
      "mla         v3.4s, v1.4s, v30.4s          \n"  // 9 3 3 1 (2, even)

      "rshrn       v1.4h, v2.4s, #4              \n"  // 2, odd
      "rshrn       v0.4h, v3.4s, #4              \n"  // 2, even
      "rshrn       v3.4h, v4.4s, #4              \n"  // 1, odd
      "rshrn       v2.4h, v5.4s, #4              \n"  // 1, even

      "st2         {v0.2s, v1.2s}, [%5], #16     \n"  // store 2
      "st2         {v2.2s, v3.2s}, [%4], #16     \n"  // store 1
      "subs        %w6, %w6, #4                  \n"  // 2 uv -> 4 uv
      "b.gt        1b                            \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(src_temp),   // %2
        "+r"(src_temp1),  // %3
        "+r"(dst_ptr),    // %4
        "+r"(dst_ptr1),   // %5
        "+r"(dst_width)   // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
        "v31"  // Clobber List
  );
}
1019 
// Add a row of bytes to a row of shorts.  Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
// dst_ptr is read-modify-write: the 16 shorts are loaded, widened bytes are
// added (uaddw/uaddw2), and the sums are stored back. src_width is expected
// to be a multiple of 16 — TODO confirm with callers.
void ScaleAddRow_NEON(const uint8_t* src_ptr,
                      uint16_t* dst_ptr,
                      int src_width) {
  asm volatile(
      "1:                                        \n"
      "ld1         {v1.8h, v2.8h}, [%1]          \n"  // load accumulator
      "ld1         {v0.16b}, [%0], #16           \n"  // load 16 bytes
      "uaddw2      v2.8h, v2.8h, v0.16b          \n"  // add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uaddw       v1.8h, v1.8h, v0.8b           \n"
      "st1         {v1.8h, v2.8h}, [%1], #32     \n"  // store accumulator
      "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2"  // Clobber List
  );
}
1042 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
// Loads one pair of adjacent source bytes into lane n of v4/v5 using the
// current 16.16 fixed-point x (%3): integer part indexes src_ptr (%1),
// then x is advanced by dx (%4). %5/%6 are scratch registers.
#define LOAD2_DATA8_LANE(n)                      \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5                     \n" \
  "add        %3, %3, %4                     \n" \
  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

// Horizontal scale with linear filtering: for each of 8 destination pixels,
// gathers src[x>>16] and src[(x>>16)+1] and blends by the fractional part of
// x (BLENDER above), stepping x by dx in 16.16 fixed point.
// x/dx are widened to int64_t so they can live in 64-bit address-arithmetic
// registers inside the asm. dst_width is expected to be a multiple of 8 —
// TODO confirm with callers.
void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane offsets for vectorizing x stepping
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup         v0.4s, %w3                    \n"  // x
      "dup         v1.4s, %w4                    \n"  // dx
      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
      "shl         v3.4s, v1.4s, #2              \n"  // 4 * dx
      "mul         v1.4s, v1.4s, v2.4s           \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add         v1.4s, v1.4s, v0.4s           \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "add         v2.4s, v1.4s, v3.4s           \n"
      "shl         v0.4s, v3.4s, #1              \n"  // 8 * dx
      "1:                                        \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
      "mov         v6.16b, v1.16b                \n"
      "mov         v7.16b, v2.16b                \n"
      "uzp1        v6.8h, v6.8h, v7.8h           \n"  // low halves = fractions
      "ushll       v4.8h, v4.8b, #0              \n"
      "ushll       v5.8h, v5.8b, #0              \n"
      "ssubl       v16.4s, v5.4h, v4.4h          \n"  // b - a
      "ssubl2      v17.4s, v5.8h, v4.8h          \n"
      "ushll       v7.4s, v6.4h, #0              \n"
      "ushll2      v6.4s, v6.8h, #0              \n"
      "mul         v16.4s, v16.4s, v7.4s         \n"  // f * (b - a)
      "mul         v17.4s, v17.4s, v6.4s         \n"
      "rshrn       v6.4h, v16.4s, #16            \n"  // (+0x8000) >> 16
      "rshrn2      v6.8h, v17.4s, #16            \n"
      "add         v4.8h, v4.8h, v6.8h           \n"  // a + blended delta
      "xtn         v4.8b, v4.8h                  \n"

      "st1         {v4.8b}, [%0], #8             \n"  // store pixels
      "add         v1.4s, v1.4s, v0.4s           \n"  // advance x by 8 * dx
      "add         v2.4s, v2.4s, v0.4s           \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "b.gt        1b                            \n"
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
1120 
// 1/2 ARGB downsample by point sampling: reads 16 ARGB pixels, keeps the
// odd ones, writes 8. ld4 with .4s lanes round-robins whole pixels across
// v0..v3, so v1/v3 hold the odd pixels; st2 re-interleaves them.
// src_stride is unused (single row). dst_width is in pixels and expected to
// be a multiple of 8 — TODO confirm with callers.
void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "mov         v2.16b, v3.16b                \n"  // gather odds in v1/v2
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st2         {v1.4s,v2.4s}, [%1], #32      \n"  // store 8 odd pixels
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
1142 
// 1/2 ARGB downsample with horizontal filtering: each output pixel is the
// rounding average (urhadd) of two adjacent source pixels. ld4 with .4s
// lanes round-robins pixels across v0..v3, so v0+v1 averages pairs (0,1),
// (4,5), ... and v2+v3 averages pairs (2,3), (6,7), ...; st2 re-interleaves.
// src_stride is unused (single row).
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4         {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop

      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "urhadd      v1.16b, v2.16b, v3.16b        \n"
      "st2         {v0.4s,v1.4s}, [%1], #32      \n"  // store 8 pixels
      "b.gt        1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
1166 
// 1/2 ARGB downsample with a 2x2 box filter: averages 4 source pixels
// (2 horizontal from each of 2 rows) per output pixel. ld4 with .16b lanes
// de-interleaves into per-channel B/G/R/A vectors; uaddlp sums horizontal
// pairs, uadalp accumulates the second row, and rshrn #2 divides by 4 with
// rounding. Reads 16 pixels from each row, writes 8.
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
      "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
      "uaddlp      v3.8h, v3.16b                 \n"  // A 16 bytes -> 8 shorts.
      "ld4         {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
      "uadalp      v0.8h, v16.16b                \n"  // B 16 bytes -> 8 shorts.
      "uadalp      v1.8h, v17.16b                \n"  // G 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "uadalp      v2.8h, v18.16b                \n"  // R 16 bytes -> 8 shorts.
      "uadalp      v3.8h, v19.16b                \n"  // A 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%1, 448]          \n"
      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
      "rshrn       v1.8b, v1.8h, #2              \n"
      "rshrn       v2.8b, v2.8h, #2              \n"
      "rshrn       v3.8b, v3.8h, #2              \n"
      "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
1201 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// Point-samples every src_stepx-th ARGB pixel: loads one 32-bit pixel into
// each lane of v0, advancing the source pointer by src_stepx * 4 bytes per
// lane. src_stride is unused (single row).
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.s}[0], [%0], %3           \n"
      "ld1         {v0.s}[1], [%0], %3           \n"
      "ld1         {v0.s}[2], [%0], %3           \n"
      "ld1         {v0.s}[3], [%0], %3           \n"
      "subs        %w2, %w2, #4                  \n"  // 4 pixels per loop.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v0.16b}, [%1], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_argb),                // %0
        "+r"(dst_argb),                // %1
        "+r"(dst_width)                // %2
      : "r"((int64_t)(src_stepx * 4))  // %3 byte step between sampled pixels
      : "memory", "cc", "v0");
}
1226 
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
// Box-filtered stepped downsample: each output pixel averages a 2x2 block
// (two horizontally adjacent pixels from each of two rows) starting at
// every src_stepx-th source pixel. Loads pixel pairs as 8-byte d-registers,
// widens and sums row pairs (uaddl), shuffles d-lanes so horizontal
// neighbors line up, then rounds with rshrn #2 (divide by 4).
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add         %1, %1, %0                    \n"  // %1 = row 2 pointer
      "1:                                        \n"
      "ld1         {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
      "ld1         {v1.8b}, [%1], %4             \n"
      "ld1         {v2.8b}, [%0], %4             \n"
      "ld1         {v3.8b}, [%1], %4             \n"
      "ld1         {v4.8b}, [%0], %4             \n"
      "ld1         {v5.8b}, [%1], %4             \n"
      "ld1         {v6.8b}, [%0], %4             \n"
      "ld1         {v7.8b}, [%1], %4             \n"
      "uaddl       v0.8h, v0.8b, v1.8b           \n"  // sum rows vertically
      "uaddl       v2.8h, v2.8b, v3.8b           \n"
      "uaddl       v4.8h, v4.8b, v5.8b           \n"
      "uaddl       v6.8h, v6.8b, v7.8b           \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "mov         v16.d[1], v0.d[1]             \n"  // ab_cd -> ac_bd
      "mov         v0.d[1], v2.d[0]              \n"
      "mov         v2.d[0], v16.d[1]             \n"
      "mov         v16.d[1], v4.d[1]             \n"  // ef_gh -> eg_fh
      "mov         v4.d[1], v6.d[0]              \n"
      "mov         v6.d[0], v16.d[1]             \n"
      "prfm        pldl1keep, [%1, 448]          \n"
      "add         v0.8h, v0.8h, v2.8h           \n"  // (a+b)_(c+d)
      "add         v4.8h, v4.8h, v6.8h           \n"  // (e+f)_(g+h)
      "rshrn       v0.8b, v0.8h, #2              \n"  // first 2 pixels.
      "rshrn2      v0.16b, v4.8h, #2             \n"  // next 2 pixels.
      "subs        %w3, %w3, #4                  \n"  // 4 pixels per loop.
      "st1         {v0.16b}, [%2], #16           \n"
      "b.gt        1b                            \n"
      : "+r"(src_argb),                // %0
        "+r"(src_stride),              // %1
        "+r"(dst_argb),                // %2
        "+r"(dst_width)                // %3
      : "r"((int64_t)(src_stepx * 4))  // %4 byte step between sampled pixels
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
1273 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads one 32-bit (ARGB) pixel into lane |n| of vector |vn|.
// The integer part of the 16.16 fixed-point x (%3) indexes the source
// row (%1, 4 bytes per pixel); x is then advanced by dx (%4).
// %5 is a scratch register for the pixel index; %6 receives the
// computed source address.
// NOTE: comments cannot be placed inside the macro body because '//'
// would swallow the line-continuation backslashes.
#define LOAD1_DATA32_LANE(vn, n)                 \
  "lsr        %5, %3, #16                    \n" \
  "add        %6, %1, %5, lsl #2             \n" \
  "add        %3, %3, %4                     \n" \
  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
// Nearest-neighbor horizontal scaling of one row of ARGB pixels.
// x and dx are 16.16 fixed-point: each output pixel is gathered from
// src_argb + (x >> 16) * 4 and x advances by dx.  The loop produces
// 8 pixels per iteration, so dst_width is expected to be a multiple
// of 8 (libyuv convention — callers pad rows accordingly).
void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;  // scratch address register (%6 in the macro)
  // x/dx are widened to 64 bits so the macro can use them directly in
  // 64-bit address arithmetic.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;               // scratch for the integer part of x (%5)
  asm volatile(
      "1:                                        \n"
      // clang-format off
      // Gather 8 pixels, one lane at a time, into v0 and v1.
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      "prfm        pldl1keep, [%1, 448]          \n"  // prefetch 7 lines ahead
      // clang-format on
      "st1         {v0.4s, v1.4s}, [%0], #32     \n"  // store pixels
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
      "b.gt        1b                            \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}
1317 
1318 #undef LOAD1_DATA32_LANE
1319 
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
//
// Loads two adjacent 32-bit (ARGB) pixels into lane |n| of vectors
// |vn1| (left pixel) and |vn2| (right pixel); ld2 de-interleaves the
// 8-byte pair.  The integer part of the 16.16 fixed-point x (%3)
// indexes the source row (%1, 4 bytes per pixel); x then advances by
// dx (%4).  %5 is a scratch register; %6 receives the address.
// NOTE: comments cannot be placed inside the macro body because '//'
// would swallow the line-continuation backslashes.
#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
  "lsr        %5, %3, #16                           \n" \
  "add        %6, %1, %5, lsl #2                    \n" \
  "add        %3, %3, %4                            \n" \
  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
// Bilinearly filtered horizontal scaling of one row of ARGB pixels.
// x and dx are 16.16 fixed-point.  For each output pixel the two
// adjacent source pixels a and b are blended with a 7-bit fraction
// f = (x >> 9) & 0x7f (see shrn #9 / and 0x7f below):
//   out = (a * (127 - f) + b * f) >> 7
// 4 pixels are produced per iteration.
void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};  // lane multipliers for per-lane x values
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;  // scratch address register (%6 in the macro)
  // Widened so the macro can use them in 64-bit address arithmetic.
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile (
      "dup         v0.4s, %w3                    \n"  // x
      "dup         v1.4s, %w4                    \n"  // dx
      "ld1         {v2.4s}, [%5]                 \n"  // 0 1 2 3
      "shl         v6.4s, v1.4s, #2              \n"  // 4 * dx
      "mul         v1.4s, v1.4s, v2.4s           \n"
      "movi        v3.16b, #0x7f                 \n"  // 0x7F
      "movi        v4.8h, #0x7f                  \n"  // 0x7F
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add         v5.4s, v1.4s, v0.4s           \n"
      "1:                                        \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(v0, v1, 0)
    LOAD2_DATA32_LANE(v0, v1, 1)
    LOAD2_DATA32_LANE(v0, v1, 2)
    LOAD2_DATA32_LANE(v0, v1, 3)
    // Extract the 7-bit blend fraction per pixel and broadcast it to
    // all 4 bytes of that pixel's lane.
    "shrn       v2.4h, v5.4s, #9               \n"
    "and        v2.8b, v2.8b, v4.8b            \n"
    "dup        v16.8b, v2.b[0]                \n"
    "dup        v17.8b, v2.b[2]                \n"
    "dup        v18.8b, v2.b[4]                \n"
    "dup        v19.8b, v2.b[6]                \n"
    "ext        v2.8b, v16.8b, v17.8b, #4      \n"
    "ext        v17.8b, v18.8b, v19.8b, #4     \n"
    "ins        v2.d[1], v17.d[0]              \n"  // f
    "eor        v7.16b, v2.16b, v3.16b         \n"  // 0x7f ^ f
    // a * (127 - f) + b * f, then >> 7.
    "umull      v16.8h, v0.8b, v7.8b           \n"
    "umull2     v17.8h, v0.16b, v7.16b         \n"
    "umull      v18.8h, v1.8b, v2.8b           \n"
    "umull2     v19.8h, v1.16b, v2.16b         \n"
    "prfm       pldl1keep, [%1, 448]           \n"  // prefetch 7 lines ahead
    "add        v16.8h, v16.8h, v18.8h         \n"
    "add        v17.8h, v17.8h, v19.8h         \n"
    "shrn       v0.8b, v16.8h, #7              \n"
    "shrn2      v0.16b, v17.8h, #7             \n"
    "st1     {v0.4s}, [%0], #16                \n"  // store pixels
    "add     v5.4s, v5.4s, v6.4s               \n"
    "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_argb),         // %0
    "+r"(src_argb),         // %1
    "+r"(dst_width),        // %2
    "+r"(x64),              // %3
    "+r"(dx64),             // %4
    "+r"(tmp),              // %5
    "+r"(src_tmp)           // %6
  :
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
    "v6", "v7", "v16", "v17", "v18", "v19"
  );
}
1390 
1391 #undef LOAD2_DATA32_LANE
1392 
1393 // Read 16x2 average down and write 8x1.
ScaleRowDown2Box_16_NEON(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)1394 void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
1395                               ptrdiff_t src_stride,
1396                               uint16_t* dst,
1397                               int dst_width) {
1398   asm volatile(
1399       // change the stride to row 2 pointer
1400       "add         %1, %0, %1, lsl #1            \n"  // ptr + stide * 2
1401       "1:                                        \n"
1402       "ld1         {v0.8h, v1.8h}, [%0], #32     \n"  // load row 1 and post inc
1403       "ld1         {v2.8h, v3.8h}, [%1], #32     \n"  // load row 2 and post inc
1404       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop
1405       "uaddlp      v0.4s, v0.8h                  \n"  // row 1 add adjacent
1406       "uaddlp      v1.4s, v1.8h                  \n"
1407       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
1408       "uadalp      v0.4s, v2.8h                  \n"  // +row 2 add adjacent
1409       "uadalp      v1.4s, v3.8h                  \n"
1410       "prfm        pldl1keep, [%1, 448]          \n"
1411       "rshrn       v0.4h, v0.4s, #2              \n"  // round and pack
1412       "rshrn2      v0.8h, v1.4s, #2              \n"
1413       "st1         {v0.8h}, [%2], #16            \n"
1414       "b.gt        1b                            \n"
1415       : "+r"(src_ptr),     // %0
1416         "+r"(src_stride),  // %1
1417         "+r"(dst),         // %2
1418         "+r"(dst_width)    // %3
1419       :
1420       : "v0", "v1", "v2", "v3"  // Clobber List
1421   );
1422 }
1423 
1424 // Read 8x2 upsample with filtering and write 16x1.
1425 // Actually reads an extra pixel, so 9x2.
ScaleRowUp2_16_NEON(const uint16_t * src_ptr,ptrdiff_t src_stride,uint16_t * dst,int dst_width)1426 void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
1427                          ptrdiff_t src_stride,
1428                          uint16_t* dst,
1429                          int dst_width) {
1430   asm volatile(
1431       "add         %1, %0, %1, lsl #1            \n"  // ptr + stide * 2
1432       "movi        v0.8h, #9                     \n"  // constants
1433       "movi        v1.4s, #3                     \n"
1434 
1435       "1:                                        \n"
1436       "ld1         {v3.8h}, [%0], %4             \n"  // TL read first 8
1437       "ld1         {v4.8h}, [%0], %5             \n"  // TR read 8 offset by 1
1438       "ld1         {v5.8h}, [%1], %4             \n"  // BL read 8 from next row
1439       "ld1         {v6.8h}, [%1], %5             \n"  // BR offset by 1
1440       "subs        %w3, %w3, #16                 \n"  // 16 dst pixels per loop
1441       "umull       v16.4s, v3.4h, v0.4h          \n"
1442       "umull2      v7.4s, v3.8h, v0.8h           \n"
1443       "umull       v18.4s, v4.4h, v0.4h          \n"
1444       "umull2      v17.4s, v4.8h, v0.8h          \n"
1445       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
1446       "uaddw       v16.4s, v16.4s, v6.4h         \n"
1447       "uaddl2      v19.4s, v6.8h, v3.8h          \n"
1448       "uaddl       v3.4s, v6.4h, v3.4h           \n"
1449       "uaddw2      v6.4s, v7.4s, v6.8h           \n"
1450       "uaddl2      v7.4s, v5.8h, v4.8h           \n"
1451       "uaddl       v4.4s, v5.4h, v4.4h           \n"
1452       "uaddw       v18.4s, v18.4s, v5.4h         \n"
1453       "prfm        pldl1keep, [%1, 448]          \n"
1454       "mla         v16.4s, v4.4s, v1.4s          \n"
1455       "mla         v18.4s, v3.4s, v1.4s          \n"
1456       "mla         v6.4s, v7.4s, v1.4s           \n"
1457       "uaddw2      v4.4s, v17.4s, v5.8h          \n"
1458       "uqrshrn     v16.4h,  v16.4s, #4           \n"
1459       "mla         v4.4s, v19.4s, v1.4s          \n"
1460       "uqrshrn2    v16.8h, v6.4s, #4             \n"
1461       "uqrshrn     v17.4h, v18.4s, #4            \n"
1462       "uqrshrn2    v17.8h, v4.4s, #4             \n"
1463       "st2         {v16.8h-v17.8h}, [%2], #32    \n"
1464       "b.gt        1b                            \n"
1465       : "+r"(src_ptr),     // %0
1466         "+r"(src_stride),  // %1
1467         "+r"(dst),         // %2
1468         "+r"(dst_width)    // %3
1469       : "r"(2LL),          // %4
1470         "r"(14LL)          // %5
1471       : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
1472         "v19"  // Clobber List
1473   );
1474 }
1475 
// 2x horizontal downsample of interleaved UV by point sampling:
// reads 16 UV pairs, keeps the 8 odd-indexed ones.  src_stride is
// unused (single-row operation); 8 UV pairs are written per loop.
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // ld2 de-interleaves 16-bit UV units: even pairs -> v0, odd -> v1.
      "ld2         {v0.8h,v1.8h}, [%0], #32      \n"  // load 16 UV
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v1.8h}, [%1], #16            \n"  // store 8 UV
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1");
}
1494 
// 2x horizontal downsample of interleaved UV with linear filtering:
// reads 16 UV pairs and averages each adjacent pair with rounding
// (urhadd).  src_stride is unused (single-row operation); 8 UV pairs
// are written per loop.
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst,
                                int dst_width) {
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      // ld2 de-interleaves 16-bit UV units: even pairs -> v0, odd -> v1.
      "ld2         {v0.8h,v1.8h}, [%0], #32      \n"  // load 16 UV
      "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
      "urhadd      v0.16b, v0.16b, v1.16b        \n"  // rounding half add
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "st1         {v0.8h}, [%1], #16            \n"  // store 8 UV
      "b.gt        1b                            \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1");
}
1514 
// 2x2 box-filter downsample of interleaved UV: averages each 2x2
// block of U and of V samples with rounding ((sum + 2) >> 2).
// NOTE: src_stride here is in bytes — it is added directly to the
// pointer below.  8 UV pairs are written per loop.
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add         %1, %1, %0                    \n"
      "1:                                        \n"
      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 UV
      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
      "uaddlp      v0.8h, v0.16b                 \n"  // U 16 bytes -> 8 shorts.
      "uaddlp      v1.8h, v1.16b                 \n"  // V 16 bytes -> 8 shorts.
      "ld2         {v16.16b,v17.16b}, [%1], #32  \n"  // load 16
      "uadalp      v0.8h, v16.16b                \n"  // U 16 bytes -> 8 shorts.
      "uadalp      v1.8h, v17.16b                \n"  // V 16 bytes -> 8 shorts.
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
      "prfm        pldl1keep, [%1, 448]          \n"
      "rshrn       v1.8b, v1.8h, #2              \n"
      "st2         {v0.8b,v1.8b}, [%2], #16      \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v16", "v17");
}
1543 
// Reads 4 pixels at a time.
// Point-samples interleaved UV every src_stepx pixels (1 UV pixel =
// 2 bytes).  Four staggered pointers, src_stepx*2 bytes apart, each
// load one UV pair per iteration and advance by src_stepx*8 bytes
// (4 sampled pixels); st4 interleave-stores the 4 results.
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int src_stepx,  // pixel step
                             uint8_t* dst_ptr,
                             int dst_width) {
  const uint8_t* src1_ptr = src_ptr + src_stepx * 2;
  const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
  const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
  (void)src_stride;
  asm volatile(
      "1:                                        \n"
      "ld1         {v0.h}[0], [%0], %6           \n"
      "ld1         {v1.h}[0], [%1], %6           \n"
      "ld1         {v2.h}[0], [%2], %6           \n"
      "ld1         {v3.h}[0], [%3], %6           \n"
      "subs        %w5, %w5, #4                  \n"  // 4 pixels per loop.
      "st4         {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
      "b.gt        1b                            \n"
      : "+r"(src_ptr),                 // %0
        "+r"(src1_ptr),                // %1
        "+r"(src2_ptr),                // %2
        "+r"(src3_ptr),                // %3
        "+r"(dst_ptr),                 // %4
        "+r"(dst_width)                // %5
      : "r"((int64_t)(src_stepx * 8))  // %6
      : "memory", "cc", "v0", "v1", "v2", "v3");
}
1572 
1573 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
1574 
1575 #ifdef __cplusplus
1576 }  // extern "C"
1577 }  // namespace libyuv
1578 #endif
1579