/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
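
// right_ext_mask_buf/right_ext_mask form a sliding mask: 48 zero bytes
// followed by 48 0xff bytes. The horizontal functions below load 48 bytes
// from an address offset backwards in proportion to the row width, so only
// the lanes past the last valid pixel come out as all-ones and select the
// padding value.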

// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
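// Incoming arguments, per the AAPCS64 calling convention used below:
// x0 = p, x1 = stride, x2 = left, x3 = lpf, w4 = w, w5 = h,
// x6 = filter, w7 = edges.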
function wiener_filter7_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h,  w17
        movi            v31.8h,  #8, lsl #8
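        // v0/v1 hold the two coefficient rows loaded from filter[] (v0 is
        // used by the horizontal passes, v1 by the vertical passes below);
        // v30 is 16380 ((1 << 14) - (1 << 2)) and v31 is 2048 (8 << 8) in
        // every lane, used as offset/rounding constants in the horizontal
        // filter.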

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_8bpc_neon
        bl              wiener_filter7_hv_8bpc_neon
L(v1_7):
        bl              wiener_filter7_v_8bpc_neon

        mov             sp,  x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_8bpc_neon
L(v2_7):
        bl              wiener_filter7_v_8bpc_neon
        b               L(v1_7)
endfunc


function wiener_filter7_h_8bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
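        // (For example, with a hypothetical w == 10: the last valid pixel is
        // h[12], and x3[10 - 22] == x3[-12] addresses exactly that byte.)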
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

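        // bit keeps each byte of v2/v3/v4 where the mask (v25-v27) is zero
        // and inserts the broadcast padding pixel from v28 where the mask is
        // all-ones, i.e. in every lane from h[w+3] onwards.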
        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter7_v_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0,  x4,  [sp, #48]
1:
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v24.8h, v25.8h}, [x13], #32

        ld1             {v18.8h, v19.8h}, [x10], #32
        add             v24.8h,  v24.8h,  v20.8h
        ld1             {v26.8h, v27.8h}, [x14], #32

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v28.8h,  v26.8h,  v18.8h
        ld1             {v22.8h, v23.8h}, [x12], #32

        add             v16.8h,  v26.8h,  v16.8h
        add             v25.8h,  v25.8h,  v21.8h

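        // The Wiener taps are symmetric, so rows equidistant from the centre
        // row are summed first and each pair then needs only a single
        // multiply-accumulate per coefficient.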
        smull           v2.4s,   v22.4h,  v1.h[3]
        smlal           v2.4s,   v24.4h,  v1.h[4]
        smlal           v2.4s,   v28.4h,  v1.h[5]
        smlal           v2.4s,   v16.4h,  v1.h[6]
        add             v29.8h,  v27.8h,  v19.8h
        smull2          v3.4s,   v22.8h,  v1.h[3]
        smlal2          v3.4s,   v24.8h,  v1.h[4]
        smlal2          v3.4s,   v28.8h,  v1.h[5]
        smlal2          v3.4s,   v16.8h,  v1.h[6]
        add             v17.8h,  v27.8h,  v17.8h
        smull           v4.4s,   v23.4h,  v1.h[3]
        smlal           v4.4s,   v25.4h,  v1.h[4]
        smlal           v4.4s,   v29.4h,  v1.h[5]
        smlal           v4.4s,   v17.4h,  v1.h[6]
        smull2          v5.4s,   v23.8h,  v1.h[3]
        smlal2          v5.4s,   v25.8h,  v1.h[4]
        smlal2          v5.4s,   v29.8h,  v1.h[5]
        smlal2          v5.4s,   v17.8h,  v1.h[6]
        sqrshrun        v2.4h,   v2.4s,   #11
        sqrshrun2       v2.8h,   v3.4s,   #11
        sqrshrun        v3.4h,   v4.4s,   #11
        sqrshrun2       v3.8h,   v5.4s,   #11
        sqxtun          v2.8b,   v2.8h
        sqxtun2         v2.16b,  v3.8h
        subs            w4,  w4,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0,  x4,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #64

        add             x0,  x0,  x1
        ret
endfunc

function wiener_filter7_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0,  [sp, #48]
        stp             x3,  x4,  [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        ld1             {v20.8h, v21.8h}, [x11], #32

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        ld1             {v26.8h, v27.8h}, [x13], #32
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        ld1             {v18.8h, v19.8h}, [x10], #32
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        ld1             {v28.8h, v29.8h}, [x14], #32
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v26.8h,  v20.8h,  v26.8h

        ld1             {v24.8h, v25.8h}, [x12], #32
        add             v28.8h,  v18.8h,  v28.8h

        add             v16.8h,  v16.8h,  v6.8h
        add             v27.8h,  v21.8h,  v27.8h

        smull           v18.4s,  v24.4h,  v1.h[3]
        smlal           v18.4s,  v26.4h,  v1.h[4]
        smlal           v18.4s,  v28.4h,  v1.h[5]
        smlal           v18.4s,  v16.4h,  v1.h[6]
        add             v29.8h,  v19.8h,  v29.8h
        smull2          v19.4s,  v24.8h,  v1.h[3]
        smlal2          v19.4s,  v26.8h,  v1.h[4]
        smlal2          v19.4s,  v28.8h,  v1.h[5]
        smlal2          v19.4s,  v16.8h,  v1.h[6]
        add             v17.8h,  v17.8h,  v7.8h
        smull           v20.4s,  v25.4h,  v1.h[3]
        smlal           v20.4s,  v27.4h,  v1.h[4]
        smlal           v20.4s,  v29.4h,  v1.h[5]
        smlal           v20.4s,  v17.4h,  v1.h[6]
        smull2          v21.4s,  v25.8h,  v1.h[3]
        smlal2          v21.4s,  v27.8h,  v1.h[4]
        smlal2          v21.4s,  v29.8h,  v1.h[5]
        smlal2          v21.4s,  v17.8h,  v1.h[6]
        sqrshrun        v18.4h,  v18.4s,  #11
        sqrshrun2       v18.8h,  v19.4s,  #11
        sqrshrun        v19.4h,  v20.4s,  #11
        sqrshrun2       v19.8h,  v21.4s,  #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4,  w4,  #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #64]
        ldp             x15, x0,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #80

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
function wiener_filter5_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*4

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h,  w17
        movi            v31.8h,  #8, lsl #8

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_5)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter5_h_8bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x12, x14              // t3
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride

L(main_5):
        mov             x15, x11              // t0 = t4
L(main_loop_5):
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_5)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v2_5)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter5_hv_8bpc_neon
        bl              wiener_filter5_hv_8bpc_neon
L(end_5):

        mov             sp,  x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x15, x15, #384*2*3    // t0 += 384*2*3
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_5)
L(v2_5):
        bl              wiener_filter5_v_8bpc_neon
        add             x0,  x0,  x1
        mov             x11, x12
        mov             x12, x13
        mov             x13, x14
L(v1_5):
        bl              wiener_filter5_v_8bpc_neon
        b               L(end_5)
endfunc


function wiener_filter5_h_8bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        shl             v22.8h,  v17.8h,  #7
        mul             v6.8h,   v17.8h,  v0.h[3]
        mla             v6.8h,   v18.8h,  v0.h[4]
        mla             v6.8h,   v19.8h,  v0.h[5]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        shl             v23.8h,  v17.8h,  #7
        mul             v7.8h,   v17.8h,  v0.h[3]
        mla             v7.8h,   v18.8h,  v0.h[4]
        mla             v7.8h,   v19.8h,  v0.h[5]

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter5_v_8bpc_neon
        stp             x11, x12, [sp, #-48]!
        stp             x13, x14, [sp, #16]
        stp             x0,  x4,  [sp, #32]
1:
        ld1             {v18.8h, v19.8h}, [x12], #32
        ld1             {v22.8h, v23.8h}, [x14], #32
        ld1             {v16.8h, v17.8h}, [x11], #32

        add             v24.8h,  v22.8h,  v18.8h
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v16.8h,  v22.8h,  v16.8h
        add             v25.8h,  v23.8h,  v19.8h

        smull           v2.4s,   v20.4h,  v1.h[3]
        smlal           v2.4s,   v24.4h,  v1.h[4]
        smlal           v2.4s,   v16.4h,  v1.h[5]
        add             v17.8h,  v23.8h,  v17.8h
        smull2          v3.4s,   v20.8h,  v1.h[3]
        smlal2          v3.4s,   v24.8h,  v1.h[4]
        smlal2          v3.4s,   v16.8h,  v1.h[5]
        smull           v4.4s,   v21.4h,  v1.h[3]
        smlal           v4.4s,   v25.4h,  v1.h[4]
        smlal           v4.4s,   v17.4h,  v1.h[5]
        smull2          v5.4s,   v21.8h,  v1.h[3]
        smlal2          v5.4s,   v25.8h,  v1.h[4]
        smlal2          v5.4s,   v17.8h,  v1.h[5]
        sqrshrun        v2.4h,   v2.4s,   #11
        sqrshrun2       v2.8h,   v3.4s,   #11
        sqrshrun        v3.4h,   v4.4s,   #11
        sqrshrun2       v3.8h,   v5.4s,   #11
        sqxtun          v2.8b,   v2.8h
        sqxtun2         v2.16b,  v3.8h
        subs            w4,  w4,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0,  x4,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp             x12, x13, [sp, #-64]!
        stp             x14, x15, [sp, #16]
        stp             x12, x0,  [sp, #32]
        stp             x3,  x4,  [sp, #48]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v3.16b, v2.16b, v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,  v3.8b
        uxtl2           v3.8h,  v3.16b
        uxtl            v4.8h,  v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally

        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        shl             v22.8h,  v17.8h,  #7
        mul             v6.8h,   v17.8h,  v0.h[3]
        mla             v6.8h,   v18.8h,  v0.h[4]
        mla             v6.8h,   v19.8h,  v0.h[5]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        shl             v23.8h,  v17.8h,  #7
        mul             v7.8h,   v17.8h,  v0.h[3]
        mla             v7.8h,   v18.8h,  v0.h[4]
        mla             v7.8h,   v19.8h,  v0.h[5]

        ld1             {v18.8h, v19.8h}, [x12], #32

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        ld1             {v24.8h, v25.8h}, [x14], #32
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        ld1             {v16.8h, v17.8h}, [x11], #32
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        add             v24.8h,  v24.8h,  v18.8h
        add             v16.8h,  v16.8h,  v6.8h

        smull           v18.4s,  v20.4h,  v1.h[3]
        smlal           v18.4s,  v24.4h,  v1.h[4]
        smlal           v18.4s,  v16.4h,  v1.h[5]
        add             v25.8h,  v25.8h,  v19.8h
        smull2          v19.4s,  v20.8h,  v1.h[3]
        smlal2          v19.4s,  v24.8h,  v1.h[4]
        smlal2          v19.4s,  v16.8h,  v1.h[5]
        add             v17.8h,  v17.8h,  v7.8h
        smull           v20.4s,  v21.4h,  v1.h[3]
        smlal           v20.4s,  v25.4h,  v1.h[4]
        smlal           v20.4s,  v17.4h,  v1.h[5]
        smull2          v21.4s,  v21.8h,  v1.h[3]
        smlal2          v21.4s,  v25.8h,  v1.h[4]
        smlal2          v21.4s,  v17.8h,  v1.h[5]
        sqrshrun        v18.4h,  v18.4s,  #11
        sqrshrun2       v18.8h,  v19.4s,  #11
        sqrshrun        v19.4h,  v20.4s,  #11
        sqrshrun2       v19.8h,  v21.4s,  #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4,  w4,  #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #48]
        ldp             x15, x0,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #64

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
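// Incoming arguments, per the AAPCS64 calling convention used below:
// x0 = sumsq, x1 = sum, x2 = left, x3 = src, w4 = w, w5 = edges.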
function sgr_box3_row_h_8bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #2
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x3], #16
        ld1             {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v0.16b, v1.16b, v0.16b, #14
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v0.16b, v1.16b, v0.16b, #14

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w4, #(2 + 16 - 2 + 1)
        ldr             b30, [x3,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #10
        b.ge            4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0.b[w] onwards
        movrel          x13, right_ext_mask
        sub             x13, x13, w4,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        uaddl           v3.8h,   v0.8b,   v16.8b
        ext             v20.16b, v1.16b,  v2.16b, #2
        uaddw           v3.8h,   v3.8h,   v17.8b

        ext             v21.16b, v1.16b,  v2.16b, #4

        uaddl           v26.4s,  v1.4h,   v20.4h
        uaddl2          v27.4s,  v1.8h,   v20.8h
        uaddw           v26.4s,  v26.4s,  v21.4h
        uaddw2          v27.4s,  v27.4s,  v21.8h
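        // v3 now holds the horizontal 3-pixel sums and v26/v27 the matching
        // 3-element sums of squares built from the precalculated v1/v2.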

        subs            w4,  w4,  #8

        st1             {v3.8h},         [x1],  #16
        st1             {v26.4s,v27.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x3],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #3
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x3], #16
        ld1             {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w4, #(2 + 16 - 3 + 1)
        ldr             b30, [x3,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w4,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        ext             v18.16b, v0.16b,  v0.16b, #3
        ext             v19.16b, v0.16b,  v0.16b, #4
        uaddl           v3.8h,   v0.8b,   v16.8b
        uaddl           v24.8h,  v17.8b,  v18.8b
        uaddw           v3.8h,   v3.8h,   v19.8b
        add             v3.8h,   v3.8h,   v24.8h

        ext             v16.16b, v1.16b,  v2.16b, #2
        ext             v17.16b, v1.16b,  v2.16b, #4
        ext             v18.16b, v1.16b,  v2.16b, #6
        ext             v19.16b, v1.16b,  v2.16b, #8

        uaddl           v26.4s,  v1.4h,   v16.4h
        uaddl2          v27.4s,  v1.8h,   v16.8h
        uaddl           v16.4s,  v17.4h,  v18.4h
        uaddl2          v17.4s,  v17.8h,  v18.8h
        uaddw           v26.4s,  v26.4s,  v19.4h
        uaddw2          v27.4s,  v27.4s,  v19.8h
        add             v26.4s,  v26.4s,  v16.4s
        add             v27.4s,  v27.4s,  v17.4s
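        // v3 now holds the horizontal 5-pixel sums and v26/v27 the matching
        // 5-element sums of squares built from the precalculated v1/v2.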

        subs            w4,  w4,  #8

        st1             {v3.8h},         [x1],  #16
        st1             {v26.4s,v27.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x3],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
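// Incoming arguments, per the AAPCS64 calling convention used below:
// x0 = sumsq3, x1 = sum3, x2 = sumsq5, x3 = sum5, x4 = left, x5 = src,
// w6 = w, w7 = edges.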
function sgr_box35_row_h_8bpc_neon, export=1
        add             w6,  w6,  #2 // w += 2

        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x4,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x5,  x5,  #3
        ld1             {v0.16b},  [x5], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x5], #16
        ld1             {v1.s}[3], [x4], #4
        // Move x5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x5,  x5,  #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x5,  x5,  #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w6, #(2 + 16 - 3 + 1)
        ldr             b30, [x5,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w6,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w6,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        ext             v19.16b, v0.16b,  v0.16b, #4
        ext             v18.16b, v0.16b,  v0.16b, #3
        uaddl           v3.8h,   v16.8b,  v17.8b
        uaddl           v24.8h,  v0.8b,   v19.8b
        uaddw           v3.8h,   v3.8h,   v18.8b

        ext             v16.16b, v1.16b,  v2.16b, #2
        ext             v17.16b, v1.16b,  v2.16b, #4
        ext             v19.16b, v1.16b,  v2.16b, #8
        ext             v18.16b, v1.16b,  v2.16b, #6

        st1             {v3.8h},         [x1], #16
        add             v3.8h,   v3.8h,   v24.8h

        uaddl           v26.4s,  v16.4h,  v17.4h
        uaddl2          v27.4s,  v16.8h,  v17.8h
        uaddl           v16.4s,  v1.4h,   v19.4h
        uaddl2          v17.4s,  v1.8h,   v19.8h
        uaddw           v26.4s,  v26.4s,  v18.4h
        uaddw2          v27.4s,  v27.4s,  v18.8h

        st1             {v26.4s,v27.4s}, [x0], #32
        add             v26.4s,  v26.4s,  v16.4s
        add             v27.4s,  v27.4s,  v17.4s

        subs            w6,  w6,  #8

        st1             {v3.8h},         [x3], #16
        st1             {v26.4s,v27.4s}, [x2], #32

        b.le            9f
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x5],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

sgr_funcs 8