/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"

// Dimensions of the full size grain buffer, counted in 16 bit entries.
// Consecutive rows are GRAIN_WIDTH*2 bytes apart.
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73

// NOTE(review): presumably the used dimensions of a subsampled chroma grain
// buffer; not referenced by the code visible in this part of the file.
#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38

// Advance the pseudo random state held in w2 by \steps bits.
//
// The feedback word r ^ (r >> 1) ^ (r >> 3) ^ (r >> 12) is computed on the
// whole register and its low \steps bits are kept as the new bits.
//
//   shift=1 (default): the state is shifted right by \steps and the new bits
//                      are inserted at bits [16 - \steps, 15].
//   shift=0:           the state is not shifted; the new bits are ORed in
//                      starting at bit 16, and the users shift the state down
//                      afterwards (read_shift_rand, or an explicit lsr).
//
// Clobbers w11-w13.
.macro increment_seed steps, shift=1
        lsr             w11, w2,  #3
        lsr             w12, w2,  #12
        lsr             w13, w2,  #1
        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
.if \shift
        lsr             w2,  w2,  #\steps
.endif
        and             w11, w11, #((1 << \steps) - 1)    // bit
.if \shift
        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
.else
        orr             w2,  w2,  w11, lsl #16            // *state
.endif
.endm

// Extract a \bits wide random value from the state in x2 into \dest.
// The field starts at bit (16 - \bits - \age): with age 0 it is the top \bits
// bits of the 16 bit state, and every increment of \age moves the window one
// bit lower. The state itself is not modified.
.macro read_rand dest, bits, age
        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
.endm

// Extract a \bits wide value from bits [17 - \bits, 16] of x2 into \dest and
// then shift the state in w2 right by one bit. Used on a state prepared by
// increment_seed with shift=0, where the pending new bits start at bit 16.
.macro read_shift_rand dest, bits
        ubfx            \dest,  x2,   #17 - \bits, #\bits
        lsr             w2,  w2,  #1
.endm

// special calling convention:
// w2 holds seed
// x3 holds dav1d_gaussian_sequence
// clobbers x11-x15
// returns in v0.8h
//
// Fetches eight table entries. The state is advanced twice by four bits; after
// each advance four 11 bit indices are read (ages 3, 2, 1, 0), scaled by two
// (16 bit table entries) and used to load one lane of v0 each. The loads are
// interleaved with the index computation for the following lanes.
// The returned values are the raw table entries; callers in this file apply
// the rounding shift (srshl by v31) themselves.
function get_gaussian_neon
        increment_seed  4                         // bits for lanes 0-3
        read_rand       x14, 11,  3
        read_rand       x15, 11,  2
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11,  1
        ld1             {v0.h}[1], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  0
        increment_seed  4                         // bits for lanes 4-7
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[2], [x14]
        read_rand       x14, 11,  3
        ld1             {v0.h}[3], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  2
        ld1             {v0.h}[4], [x14]
        add             x15, x3,  x15, lsl #1
        read_rand       x14, 11,  1
        ld1             {v0.h}[5], [x15]
        read_rand       x15, 11,  0
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[6], [x14]
        ld1             {v0.h}[7], [x15]
        ret
endfunc

// Store one row through x0 with post-increment: the full 16 bytes of each of
// r0-r4 (80 bytes) followed by the lowest halfword of r5, 82 bytes in total.
// The arguments are vector register names without an arrangement suffix.
.macro store_grain_row r0, r1, r2, r3, r4, r5
        st1             {\r0\().16b,\r1\().16b}, [x0], #32
        st1             {\r2\().16b,\r3\().16b}, [x0], #32
        st1             {\r4\().16b},  [x0], #16
        st1             {\r5\().h}[0], [x0], #2
.endm

// Produce two scaled grain values in v0.h[0] and v0.h[1].
// Same register convention as get_gaussian_neon (w2 seed, x3 table base), plus
// v31 which supplies the per-lane shift for srshl (the callers set it to a
// negated shift amount, making this a rounding right shift).
// Advances the state by two bits. Only lanes 0 and 1 are loaded here; lanes 2
// and 3 are shifted as well but hold whatever v0 contained before.
// Clobbers x11-x15.
function get_grain_2_neon
        increment_seed  2
        read_rand       x14, 11,  1
        read_rand       x15, 11,  0
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        ld1             {v0.h}[1], [x15]
        srshl           v0.4h,   v0.4h,   v31.4h
        ret
endfunc

// Call get_grain_2_neon and leave the result in \dst (a vector register name
// without arrangement). The low 64 bits of v0 are copied unless \dst is v0.
// Clobbers x30 (bl) in addition to what get_grain_2_neon clobbers.
.macro get_grain_2 dst
        bl              get_grain_2_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// Produce four scaled grain values in v0.4h.
// Same register convention as get_gaussian_neon (w2 seed, x3 table base), plus
// v31 which supplies the per-lane shift for srshl (the callers set it to a
// negated shift amount, making this a rounding right shift).
// Advances the state by four bits. Clobbers x11-x15.
function get_grain_4_neon
        increment_seed  4
        read_rand       x14, 11,  3
        read_rand       x15, 11,  2
        add             x14, x3,  x14, lsl #1
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[0], [x14]
        read_rand       x14, 11,  1
        ld1             {v0.h}[1], [x15]
        add             x14, x3,  x14, lsl #1
        read_rand       x15, 11,  0
        add             x15, x3,  x15, lsl #1
        ld1             {v0.h}[2], [x14]
        ld1             {v0.h}[3], [x15]
        srshl           v0.4h,   v0.4h,   v31.4h
        ret
endfunc

// Call get_grain_4_neon and leave the result in \dst (a vector register name
// without arrangement). The low 64 bits of v0 are copied unless \dst is v0.
// Clobbers x30 (bl) in addition to what get_grain_4_neon clobbers.
.macro get_grain_4 dst
        bl              get_grain_4_neon
.ifnc \dst, v0
        mov             \dst\().8b, v0.8b
.endif
.endm

// Defines output_lag<n>_neon, the serial part of the auto-regressive filter:
// each iteration combines the sum from the rows above with the previously
// produced entries of the current row and with a fresh random grain value.
//
// w15 holds the number of entries to produce
// w14, w16 and w17 hold the previous output entries
// v0 holds the vector of produced entries
// v1 holds the input vector of sums from above
//
// Details of the previous-output/coefficient registers per lag:
//   lag 1: w14 = previous output, w4 = its coefficient
//   lag 2: w16, w14 = the two previous outputs (w14 most recent), with
//          coefficients w4 and w17; note that w17 is a coefficient here
//   lag 3: w17, w16, w14 = the three previous outputs (w14 most recent), with
//          coefficients w4, w20 and w21
// Other inputs: w2 state (prepared with increment_seed shift=0), x3 table base,
// w8/w7 rounding offset and shift for the filter sum, w10/w9 rounding offset
// and shift for the grain value, w5/w6 upper/lower clamp limits.
//
// Per iteration v0 is rotated down by one 16 bit lane and the new entry is
// inserted in lane 7; v1 is rotated down by one 32 bit lane so that the next
// sum is in lane 0. On return w14 (and w16, w17 where used) hold the latest
// outputs. Clobbers w11-w13, w15 and the flags.
.macro output_lag n
function output_lag\n\()_neon
1:
        read_shift_rand x13, 11
        mov             w11, v1.s[0]
        ldrsh           w12, [x3, x13, lsl #1]
        ext             v0.16b,  v0.16b,  v0.16b,  #2
.if \n == 1
        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
.elseif \n == 2
        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
        madd            w11, w14, w17, w11        // += *coeff * prev output 2
        mov             w16, w14
.else
        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
        madd            w11, w14, w21, w11        // += *coeff * prev output 3
        mov             w17, w16
        mov             w16, w14
.endif
        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
        add             w12, w12, w10             // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
        asr             w14, w14, w7              // >> ar_coeff_shift
        asr             w12, w12, w9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
        add             w14, w14, w12
        cmp             w14, w5
        csel            w14, w14, w5,  le         // min(value, upper limit)
        cmp             w14, w6
        csel            w14, w14, w6,  ge         // max(value, lower limit)
        subs            w15, w15, #1
        ext             v1.16b,  v1.16b,  v1.16b,  #4
        ins             v0.h[7], w14
        b.gt            1b
        ret
endfunc
.endm

output_lag 1
output_lag 2
output_lag 3


// Lag 1: weighted sum of the three entries above (left, mid, right) for a
// block of eight output entries.
//
// In:  x0  = pointer to the current output block
//      v16 = block to the left in the row above, v17 = block right above
//      v27, v28, v29 = coefficients for above-left, above, above-right,
//                      as 16 bit lanes
// Out: v4 = 32 bit sums for entries 0-3, v5 = sums for entries 4-7
//      v16, v17 are advanced by one block (v16 = old v17, v17 = newly loaded
//      block) ready for the next call on the same row.
// Clobbers x12, v0, v1, v18.
function sum_lag1_above_neon
        sub             x12, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right

        ext             v0.16b,  v16.16b, v17.16b, #14 // top left, top mid
        ext             v1.16b,  v17.16b, v18.16b, #2  // top mid, top right

        smull           v4.4s,   v17.4h,  v28.4h
        smlal           v4.4s,   v0.4h,   v27.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smull2          v5.4s,   v17.8h,  v28.8h
        smlal2          v5.4s,   v0.8h,   v27.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        ret
endfunc

// Body shared by all sum_<type>_<lag>_<edge>_neon functions: produces one
// block of eight entries of the current row and stores it at x0 (post
// incremented by 16 bytes).
//
// Parameters:
//   lag        lag1, lag2 or lag3; selects sum_<lag>_above_neon and
//              output_<lag>_neon
//   type       y, uv_444, uv_422 or uv_420; for the uv types the collocated
//              luma grain is read through x19 (averaged over 2x1 for uv_422 and
//              over 2x2 for uv_420) and added to the sums
//   uv_layout  0 for y, otherwise 444, 422 or 420
//   edge       left, mid or right
//   elems      number of filtered entries in this block: 8, 7 or 1
//   uv_coeff   optional byte lane holding the luma coefficient; when blank,
//              v30 is used as an already widened coefficient vector
//
// The code after the luma handling does not depend on the plane type, so it is
// only emitted once per (lag, edge, elems) combination; the other variants
// branch to the sum_<lag>_<type>_<edge>_start label of the variant that
// contains it. That variant must therefore be instantiated as well.
//
// The including function must have pushed x30 with a 16 byte pre-decrement;
// the tail pops it and returns.
.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
        bl              sum_\lag\()_above_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH*2
        ld1             {v22.8h, v23.8h}, [x19], #32
        ld1             {v24.8h, v25.8h}, [x12]
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v23.8h,  v24.8h,  v25.8h
        add             v22.8h,  v22.8h,  v23.8h
        srshr           v0.8h,   v22.8h,  #2
.endif
.ifc \type, uv_422
        ld1             {v22.8h, v23.8h}, [x19], #32
        addp            v22.8h,  v22.8h,  v23.8h
        srshr           v0.8h,   v22.8h,  #1
.endif
.ifc \type, uv_444
        ld1             {v0.8h}, [x19], #16
.endif
.if \uv_layout
.ifnb \uv_coeff
        dup             v1.8b,   \uv_coeff
        sxtl            v1.8h,   v1.8b
        smlal           v4.4s,   v0.4h,   v1.4h
        smlal2          v5.4s,   v0.8h,   v1.8h
.else
        smlal           v4.4s,   v0.4h,   v30.4h
        smlal2          v5.4s,   v0.8h,   v30.8h
.endif
.endif
.if \uv_layout && \elems == 8
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 444 && \elems == 7
        b               sum_\lag\()_y_\edge\()_start
.elseif \uv_layout == 422 && \elems == 1
        b               sum_\lag\()_uv_420_\edge\()_start
.else
sum_\lag\()_\type\()_\edge\()_start:
.if \elems > 4
.ifc \edge, left
        // Left edge: the first three entries of the row are plain random
        // values; they are placed in lanes 5-7 so that the rotation done for
        // each following output moves them down to lanes 0-2.
        increment_seed  4
        read_rand       x12, 11,  3
        read_rand       x13, 11,  2
        read_rand       x14, 11,  1
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v0.h}[5], [x12]
        ld1             {v0.h}[6], [x13]
        ld1             {v0.h}[7], [x14]
        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
        srshl           v0.8h,   v0.8h,   v31.8h
        ext             v4.16b,  v4.16b,  v4.16b,  #12 // bring the sum for entry 3 to lane 0
        // Seed the previous-output registers from the random entries.
.ifc \lag, lag3
        smov            w17, v0.h[5]
.endif
.ifnc \lag, lag1
        smov            w16, v0.h[6]
.endif
        smov            w14, v0.h[7]

        mov             v1.16b,  v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
.else
        increment_seed  4, shift=0
        mov             v1.16b,  v4.16b
        mov             w15, #4
        bl              output_\lag\()_neon
.endif

        increment_seed  4, shift=0
        mov             v1.16b,  v5.16b
.ifc \edge, right
        // Right edge: three filtered entries, then one plain random value
        // appended as the last lane.
        mov             w15, #3
        bl              output_\lag\()_neon
        read_shift_rand x15, 11
        add             x15, x3,  x15, lsl #1
        ld1             {v1.h}[0], [x15]
        srshl           v1.4h,   v1.4h,   v31.4h
        ext             v0.16b,  v0.16b,  v1.16b,  #2
.else
        mov             w15, #4
        bl              output_\lag\()_neon
.endif
.else
        // elems == 1
        // One filtered entry followed by three plain random values. The ext
        // below puts the filtered entry in lane 0 and the random values in
        // lanes 1-3; the higher lanes are not explicitly set here.
        increment_seed  4, shift=0
        mov             v1.16b,  v4.16b
        mov             w15, #1
        bl              output_\lag\()_neon
        lsr             w2,  w2,  #3             // finish the shift by 4; output_lag did one step

        read_rand       x12, 11,  2
        read_rand       x13, 11,  1
        read_rand       x14, 11,  0
        add             x12, x3,  x12, lsl #1
        add             x13, x3,  x13, lsl #1
        add             x14, x3,  x14, lsl #1
        ld1             {v1.h}[0], [x12]
        ld1             {v1.h}[1], [x13]
        ld1             {v1.h}[2], [x14]
        srshl           v1.4h,   v1.4h,   v31.4h
        ext             v0.16b,  v0.16b,  v1.16b,  #14
.endif
        st1             {v0.8h}, [x0], #16
        ldr             x30,     [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endif
.endm

// Defines sum_<type>_lag1_<edge>_neon: produce one block of a row with the
// lag 1 filter. Saves x30 on the stack (restored by the tail inside
// sum_lag_n_body). For the left edge, the block right above the output
// position is loaded into v17 first, as sum_lag1_above_neon expects.
// elems defaults to 8; see sum_lag_n_body for the meaning of the arguments.
.macro sum_lag1_func type, uv_layout, edge, elems=8
function sum_\type\()_lag1_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
.endif
        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
endfunc
.endm

// The y variants (and uv_420 right) contain the shared tails that the other
// variants branch into.
sum_lag1_func y,      0,   left
sum_lag1_func y,      0,   mid
sum_lag1_func y,      0,   right, 7
sum_lag1_func uv_444, 444, left
sum_lag1_func uv_444, 444, mid
sum_lag1_func uv_444, 444, right, 7
sum_lag1_func uv_422, 422, left
sum_lag1_func uv_422, 422, mid
sum_lag1_func uv_422, 422, right, 1
sum_lag1_func uv_420, 420, left
sum_lag1_func uv_420, 420, mid
sum_lag1_func uv_420, 420, right, 1


// Lag 2: weighted sum over the two rows above (five entries each, offsets
// -2..+2) for a block of eight output entries.
//
// In:  x0  = pointer to the current output block
//      v16, v17 = left and mid block of the row two above
//      v19, v20 = left and mid block of the row right above
//      v30 = signed 8 bit coefficients: bytes 0-4 for the row two above
//            (offsets -2..+2), bytes 5-9 for the row right above
// Out: v4 = 32 bit sums for entries 0-3, v5 = sums for entries 4-7
//      v16/v17 and v19/v20 are advanced by one block for the next call.
// Clobbers x12, x13, v0, v1, v18, v21-v23, v26-v29.
function sum_lag2_above_neon
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v18.8h}, [x12] // load top right
        ld1             {v21.8h}, [x13]

        // Row two above, offsets -2, -1, +1, +2
        dup             v26.8b,  v30.b[0]
        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
        dup             v27.8b,  v30.b[1]
        ext             v23.16b, v16.16b, v17.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[3]
        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.8b,  v30.b[4]
        ext             v1.16b,  v17.16b, v18.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smull           v4.4s,   v22.4h,  v26.4h
        smlal           v4.4s,   v23.4h,  v27.4h
        smlal           v4.4s,   v0.4h,   v28.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smull2          v5.4s,   v22.8h,  v26.8h
        smlal2          v5.4s,   v23.8h,  v27.8h
        smlal2          v5.4s,   v0.8h,   v28.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        // Row right above, offsets -2, -1, +1, +2
        dup             v26.16b, v30.b[5]
        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
        dup             v27.16b, v30.b[6]
        ext             v23.16b, v19.16b, v20.16b, #14
        sxtl            v26.8h,  v26.8b
        dup             v28.16b, v30.b[8]
        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v27.8h,  v27.8b
        dup             v29.16b, v30.b[9]
        ext             v1.16b,  v20.16b, v21.16b, #4
        sxtl            v28.8h,  v28.8b
        sxtl            v29.8h,  v29.8b

        smlal           v4.4s,   v22.4h,  v26.4h
        smlal           v4.4s,   v23.4h,  v27.4h
        smlal           v4.4s,   v0.4h,   v28.4h
        smlal           v4.4s,   v1.4h,   v29.4h
        smlal2          v5.4s,   v22.8h,  v26.8h
        smlal2          v5.4s,   v23.8h,  v27.8h
        smlal2          v5.4s,   v0.8h,   v28.8h
        smlal2          v5.4s,   v1.8h,   v29.8h

        // Offset 0 of both rows
        dup             v26.16b, v30.b[2]
        dup             v27.16b, v30.b[7]
        sxtl            v26.8h,  v26.8b
        sxtl            v27.8h,  v27.8b

        smlal           v4.4s,   v17.4h,  v26.4h
        smlal           v4.4s,   v20.4h,  v27.4h
        smlal2          v5.4s,   v17.8h,  v26.8h
        smlal2          v5.4s,   v20.8h,  v27.8h
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

// Defines sum_<type>_lag2_<edge>_neon: produce one block of a row with the
// lag 2 filter. Saves x30 on the stack (restored by the tail inside
// sum_lag_n_body). For the left edge, the blocks right above the output
// position in the two previous rows are loaded into v17 and v20 first.
// The luma coefficient for the uv types is byte 12 of v30.
// elems defaults to 8; see sum_lag_n_body for the meaning of the arguments.
.macro sum_lag2_func type, uv_layout, edge, elems=8
function sum_\type\()_lag2_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v17.8h}, [x12] // load the previous block right above
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
endfunc
.endm

sum_lag2_func y,      0,   left
sum_lag2_func y,      0,   mid
sum_lag2_func y,      0,   right, 7
sum_lag2_func uv_444, 444, left
sum_lag2_func uv_444, 444, mid
sum_lag2_func uv_444, 444, right, 7
sum_lag2_func uv_422, 422, left
sum_lag2_func uv_422, 422, mid
sum_lag2_func uv_422, 422, right, 1
sum_lag2_func uv_420, 420, left
sum_lag2_func uv_420, 420, mid
sum_lag2_func uv_420, 420, right, 1


// Lag 3: weighted sum over the three rows above (seven entries each, offsets
// -3..+3) for a block of eight output entries.
//
// In:  x0  = pointer to the current output block
//      v13, v14 = left and mid block of the row three above
//      v16, v17 = left and mid block of the row two above
//      v19, v20 = left and mid block of the row right above
//      v29, v30 = signed 8 bit coefficients: v29 bytes 0-6 for the row three
//                 above, v29 bytes 7-13 for the row two above, v29 bytes 14-15
//                 plus v30 bytes 0-4 for the row right above (offsets -3..+3
//                 within each group)
// Out: v4 = 32 bit sums for entries 0-3, v5 = sums for entries 4-7
//      v13/v14, v16/v17 and v19/v20 are advanced by one block.
// Clobbers x11-x13, v8-v12, v15, v18, v21-v28.
// NOTE(review): v8-v15 are callee saved in AAPCS64; the exported entry point
// is expected to preserve them - confirm against its prologue.
function sum_lag3_above_neon
        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
        ld1             {v15.8h}, [x11] // load top right
        ld1             {v18.8h}, [x12]
        ld1             {v21.8h}, [x13]

        // Row three above. v13 is read for the left-side windows before it is
        // reused as the temporary for offset +3.
        dup             v22.8b,  v29.b[0]
        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[1]
        ext             v9.16b,  v13.16b, v14.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[2]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[3]
        ext             v10.16b, v13.16b, v14.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[4]
        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[5]
        ext             v12.16b, v14.16b, v15.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[6]
        ext             v13.16b, v14.16b, v15.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smull           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v14.4h,  v25.4h
        smull2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v14.8h,  v25.8h

        // Row two above
        dup             v22.8b,  v29.b[7]
        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[8]
        ext             v9.16b,  v16.16b, v17.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v29.b[9]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v29.b[10]
        ext             v10.16b, v16.16b, v17.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v29.b[11]
        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v29.b[12]
        ext             v12.16b, v17.16b, v18.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v29.b[13]
        ext             v13.16b, v17.16b, v18.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v17.4h,  v25.4h
        smlal2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v17.8h,  v25.8h

        // Row right above
        dup             v22.8b,  v29.b[14]
        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
        dup             v23.8b,  v29.b[15]
        ext             v9.16b,  v19.16b, v20.16b, #12
        sxtl            v22.8h,  v22.8b
        dup             v24.8b,  v30.b[0]
        sxtl            v23.8h,  v23.8b
        dup             v25.8b,  v30.b[1]
        ext             v10.16b, v19.16b, v20.16b, #14
        sxtl            v24.8h,  v24.8b
        dup             v26.8b,  v30.b[2]
        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
        sxtl            v25.8h,  v25.8b
        dup             v27.8b,  v30.b[3]
        ext             v12.16b, v20.16b, v21.16b, #4
        sxtl            v26.8h,  v26.8b
        dup             v28.8b,  v30.b[4]
        ext             v13.16b, v20.16b, v21.16b, #6
        sxtl            v27.8h,  v27.8b
        sxtl            v28.8h,  v28.8b

        smlal           v4.4s,   v8.4h,   v22.4h
        smlal           v4.4s,   v9.4h,   v23.4h
        smlal           v4.4s,   v10.4h,  v24.4h
        smlal           v4.4s,   v11.4h,  v26.4h
        smlal           v4.4s,   v12.4h,  v27.4h
        smlal           v4.4s,   v13.4h,  v28.4h
        smlal           v4.4s,   v20.4h,  v25.4h
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
        smlal2          v5.4s,   v8.8h,   v22.8h
        smlal2          v5.4s,   v9.8h,   v23.8h
        smlal2          v5.4s,   v10.8h,  v24.8h
        smlal2          v5.4s,   v11.8h,  v26.8h
        smlal2          v5.4s,   v12.8h,  v27.8h
        smlal2          v5.4s,   v13.8h,  v28.8h
        smlal2          v5.4s,   v20.8h,  v25.8h

        // Advance the remaining row windows by one block; this also restores
        // v13 from its use as a temporary.
        mov             v13.16b, v14.16b
        mov             v14.16b, v15.16b

        mov             v19.16b, v20.16b
        mov             v20.16b, v21.16b
        ret
endfunc

// Defines sum_<type>_lag3_<edge>_neon: produce one block of a row with the
// lag 3 filter. Saves x30 on the stack (restored by the tail inside
// sum_lag_n_body). For the left edge, the blocks right above the output
// position in the three previous rows are loaded into v14, v17 and v20 first.
// The luma coefficient for the uv types is byte 8 of v30.
// elems defaults to 8; see sum_lag_n_body for the meaning of the arguments.
.macro sum_lag3_func type, uv_layout, edge, elems=8
function sum_\type\()_lag3_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        sub             x11, x0,  #3*GRAIN_WIDTH*2
        sub             x12, x0,  #2*GRAIN_WIDTH*2
        sub             x13, x0,  #1*GRAIN_WIDTH*2
        ld1             {v14.8h}, [x11] // load the previous block right above
        ld1             {v17.8h}, [x12]
        ld1             {v20.8h}, [x13]
.endif
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
endfunc
.endm

sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 7
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 7
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 1
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 1

// Fill w1 full width rows with unfiltered, scaled random grain.
//
// In:  x0 = output pointer, w1 = number of rows (the loop body always runs at
//      least once), w2 = seed, x3 = gaussian table, v31 = shift for srshl
// Out: x0 advanced past the written rows, w2 updated
// Each row is 80 entries written in groups of eight plus two more entries,
// i.e. 82 entries / 164 bytes.
// Clobbers w1, x11-x16, v0 and the flags.
function generate_grain_rows_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #80                  // entries left in the 8-wide part
2:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_2     v0                        // last two entries of the row
        subs            w1,  w1,  #1
        st1             {v0.s}[0], [x0], #4
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Fill w1 rows of 44 entries with unfiltered, scaled random grain, keeping
// the full GRAIN_WIDTH*2 byte row stride.
//
// In:  x0 = output pointer, w1 = number of rows (the loop body always runs at
//      least once), w2 = seed, x3 = gaussian table, v31 = shift for srshl
// Out: x0 advanced by GRAIN_WIDTH*2 bytes per row, w2 updated
// Each row is 40 entries written in groups of eight plus four more entries.
// Clobbers w1, x11-x16, v0 and the flags.
function generate_grain_rows_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        mov             w16, #40                  // entries left in the 8-wide part
2:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
        subs            w16, w16, #8
        st1             {v0.8h}, [x0], #16
        b.gt            2b
        get_grain_4     v0                        // last four entries of the row
        subs            w1,  w1,  #1
        st1             {v0.4h}, [x0]
        add             x0,  x0,  #GRAIN_WIDTH*2-80 // 80 bytes were already stepped over; add does not touch the flags from subs
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Lag 0 chroma: produce eight entries as
//   clamp(grain + round_shift(luma * coeff)).
//
// In:  x0  = output pointer (post incremented by 16 bytes)
//      x19 = pointer to the collocated luma grain (post incremented by 16)
//      v1  = byte mask ANDed onto the luma values (callers use it to drop the
//            luma contribution for some entries at the row edges)
//      v27 = luma coefficient as 16 bit lanes
//      v28 = per-lane shift for srshl on the 32 bit products
//      v25 / v26 = upper / lower clamp limits
//      w2, x3, v31 as for get_gaussian_neon and the grain scaling
// Clobbers x11-x15, v0, v2-v4.
//
// The labels gen_grain_uv_lag0_8_start and gen_grain_uv_lag0_8_add are shared
// entry points: the subsampled variants below compute their own luma average
// in v4 (and, for the 4-wide ones, their own grain in v0) with x30 already
// pushed, and then branch here.
function gen_grain_uv_444_lag0_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v4.8h}, [x19], #16
gen_grain_uv_lag0_8_start:
        bl              get_gaussian_neon
        srshl           v0.8h,   v0.8h,   v31.8h
gen_grain_uv_lag0_8_add:
        and             v4.16b,  v4.16b,  v1.16b
        smull           v2.4s,   v4.4h,   v27.4h
        smull2          v3.4s,   v4.8h,   v27.8h
        srshl           v2.4s,   v2.4s,   v28.4s
        srshl           v3.4s,   v3.4s,   v28.4s
        sqxtn           v2.4h,   v2.4s
        sqxtn2          v2.8h,   v3.4s
        sqadd           v2.8h,   v2.8h,   v0.8h
        smin            v2.8h,   v2.8h,   v25.8h
        smax            v2.8h,   v2.8h,   v26.8h
        st1             {v2.8h}, [x0], #16
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

// Lag 0 chroma, 4:2:0, eight entries: average 2x2 luma entries (two rows of
// sixteen entries, rounding shift by 2) into v4, then continue in the shared
// code at gen_grain_uv_lag0_8_start, which also pops x30 and returns.
// x19 is advanced by 32 bytes. Clobbers x12 and v16-v19 in addition to what
// the shared code clobbers.
function gen_grain_uv_420_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        add             x12, x19, #GRAIN_WIDTH*2  // second luma row
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        ld1             {v18.8h, v19.8h}, [x12]
        addp            v16.8h,  v16.8h,  v17.8h
        addp            v17.8h,  v18.8h,  v19.8h
        add             v16.8h,  v16.8h,  v17.8h
        srshr           v4.8h,   v16.8h,  #2
        b               gen_grain_uv_lag0_8_start
endfunc

// Lag 0 chroma, 4:2:2, eight entries: average horizontal pairs of sixteen luma
// entries (rounding shift by 1) into v4, then continue in the shared code at
// gen_grain_uv_lag0_8_start, which also pops x30 and returns.
// x19 is advanced by 32 bytes. Clobbers v16, v17 in addition to what the
// shared code clobbers.
function gen_grain_uv_422_lag0_8_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.8h, v17.8h}, [x19], #32
        addp            v16.8h,  v16.8h,  v17.8h
        srshr           v4.8h,   v16.8h,  #1
        b               gen_grain_uv_lag0_8_start
endfunc

// Lag 0 chroma, 4:2:0, four entries: average 2x2 luma entries (two rows of
// eight entries, rounding shift by 2) into the low half of v4, fetch four
// grain values into v0 and continue in the shared code at
// gen_grain_uv_lag0_8_add, which stores a full 8 lane vector, pops x30 and
// returns.
// x19 is advanced by 32 bytes although only 16 bytes per row are read.
// Clobbers x12 and v16-v19 in addition to what get_grain_4_neon and the
// shared code clobber.
//
// The link register signing is kept as the very first instruction, like in
// the other functions of this file; the address computation does not depend
// on its position.
function gen_grain_uv_420_lag0_4_neon
        AARCH64_SIGN_LINK_REGISTER
        add             x12, x19, #GRAIN_WIDTH*2  // second luma row
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        ld1             {v18.4h, v19.4h}, [x12]
        add             x19,  x19,  #32
        addp            v16.4h,  v16.4h,  v17.4h
        addp            v17.4h,  v18.4h,  v19.4h
        add             v16.4h,  v16.4h,  v17.4h
        srshr           v4.4h,   v16.4h,  #2
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

// Lag 0 chroma, 4:2:2, four entries: average horizontal pairs of eight luma
// entries (rounding shift by 1) into the low half of v4, fetch four grain
// values into v0 and continue in the shared code at gen_grain_uv_lag0_8_add,
// which stores a full 8 lane vector, pops x30 and returns.
// x19 is advanced by 32 bytes although only 16 bytes are read.
// Clobbers v16, v17 in addition to what get_grain_4_neon and the shared code
// clobber.
function gen_grain_uv_422_lag0_4_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ld1             {v16.4h, v17.4h}, [x19]
        add             x19,  x19,  #32
        addp            v16.4h,  v16.4h,  v17.4h
        srshr           v4.4h,   v16.4h,  #1
        get_grain_4     v0
        b               gen_grain_uv_lag0_8_add
endfunc

718.macro gen_grain_82 type
719function generate_grain_\type\()_16bpc_neon, export=1
720        AARCH64_SIGN_LINK_REGISTER
721        stp             x30, x19, [sp, #-96]!
722
723.ifc \type, uv_444
724        mov             w13, w3
725        mov             w14, #28
726        add             x19, x1,  #3*GRAIN_WIDTH*2
727        mov             x1,  x2
728        mul             w13, w13, w14
729        clz             w15, w4
730.else
731        clz             w15, w2
732.endif
733        movrel          x3,  X(gaussian_sequence)
734        sub             w15, w15, #24 // -bitdepth_min_8
735        ldr             w2,  [x1, #FGD_SEED]
736        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
737.ifc \type, y
738        add             x4,  x1,  #FGD_AR_COEFFS_Y
739.else
740        add             x4,  x1,  #FGD_AR_COEFFS_UV
741.endif
742        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
743        adr             x16, L(gen_grain_\type\()_tbl)
744        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
745        add             w9,  w9,  #4
746        ldrh            w17, [x16, w17, uxtw #1]
747        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
748        sub             x16, x16, w17, uxtw
749        neg             v31.8h,  v31.8h
750
751.ifc \type, uv_444
752        cmp             w13, #0
753        mov             w11, #0x49d8
754        mov             w14, #0xb524
755        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
756        csel            w11, w11, w14, ne
757.endif
758
759        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
760        neg             w15, w15            // bitdepth_min_8
761        mov             w8,  #1
762        mov             w10, #1
763        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
764        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
765        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
766        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
767        mov             w5,  #128
768        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
769        neg             w6,  w5             // -(128 << bitpdeth_min_8)
770        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
771
772.ifc \type, uv_444
773        eor             w2,  w2,  w11
774.endif
775
776        br              x16
777
778L(generate_grain_\type\()_lag0):
779        AARCH64_VALID_JUMP_TARGET
780.ifc \type, y
781        mov             w1,  #GRAIN_HEIGHT
782        bl              generate_grain_rows_neon
783.else
784        dup             v28.4s,  w7
785        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
786        movi            v0.16b,  #0
787        movi            v1.16b,  #255
788        dup             v25.8h,  w5
789        dup             v26.8h,  w6
790        ext             v29.16b, v0.16b,  v1.16b,  #10
791        ext             v30.16b, v1.16b,  v0.16b,  #2
792        neg             v28.4s,  v28.4s
793        sxtl            v27.8h,  v27.8b
794
795        mov             w1,  #3
796        bl              generate_grain_rows_neon
797        mov             w1,  #GRAIN_HEIGHT-3
7981:
799        mov             v1.16b,  v29.16b
800        bl              gen_grain_uv_444_lag0_neon // 8
801        movi            v1.16b,  #255
802        bl              gen_grain_uv_444_lag0_neon // 16
803        bl              gen_grain_uv_444_lag0_neon // 24
804        bl              gen_grain_uv_444_lag0_neon // 32
805        bl              gen_grain_uv_444_lag0_neon // 40
806        bl              gen_grain_uv_444_lag0_neon // 48
807        bl              gen_grain_uv_444_lag0_neon // 56
808        bl              gen_grain_uv_444_lag0_neon // 64
809        bl              gen_grain_uv_444_lag0_neon // 72
810        mov             v1.16b,  v30.16b
811        bl              gen_grain_uv_444_lag0_neon // 80
812        get_grain_2     v16
813        subs            w1,  w1,  #1
814        add             x19, x19, #4
815        st1             {v16.s}[0], [x0], #4
816        b.gt            1b
817.endif
818        ldp             x30, x19, [sp], #96
819        AARCH64_VALIDATE_LINK_REGISTER
820        ret
821
822L(generate_grain_\type\()_lag1):
823        AARCH64_VALID_JUMP_TARGET
824        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_y[0]
825        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_y[1]
826        ld1r            {v29.8b}, [x4]      // ar_coeffs_y[2]
827.ifc \type, y
828        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
829.else
830        add             x4,  x4,  #2
831.endif
832
833        mov             w1,  #3
834.ifc \type, uv_444
835        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
836        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
837.endif
838        bl              generate_grain_rows_neon
839        sxtl            v27.8h,  v27.8b
840        sxtl            v28.8h,  v28.8b
841        sxtl            v29.8h,  v29.8b
842.ifc \type, uv_444
843        sxtl            v30.8h,  v30.8b
844.endif
845
846        mov             w1,  #GRAIN_HEIGHT - 3
8471:
848        bl              sum_\type\()_lag1_left_neon  // 8
849        bl              sum_\type\()_lag1_mid_neon   // 16
850        bl              sum_\type\()_lag1_mid_neon   // 24
851        bl              sum_\type\()_lag1_mid_neon   // 32
852        bl              sum_\type\()_lag1_mid_neon   // 40
853        bl              sum_\type\()_lag1_mid_neon   // 48
854        bl              sum_\type\()_lag1_mid_neon   // 56
855        bl              sum_\type\()_lag1_mid_neon   // 64
856        bl              sum_\type\()_lag1_mid_neon   // 72
857        bl              sum_\type\()_lag1_right_neon // 80
858        get_grain_2     v16
859        subs            w1,  w1,  #1
860.ifc \type, uv_444
861        add             x19, x19, #4
862.endif
863        st1             {v16.s}[0], [x0], #4
864        b.gt            1b
865
866        ldp             x30, x19, [sp], #96
867        AARCH64_VALIDATE_LINK_REGISTER
868        ret
869
870L(generate_grain_\type\()_lag2):
871        AARCH64_VALID_JUMP_TARGET
872        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
873
874        smov            w4,  v30.b[10]
875        smov            w17, v30.b[11]
876
877        mov             w1,  #3
878        bl              generate_grain_rows_neon
879
880        mov             w1,  #GRAIN_HEIGHT - 3
8811:
882        bl              sum_\type\()_lag2_left_neon  // 8
883        bl              sum_\type\()_lag2_mid_neon   // 16
884        bl              sum_\type\()_lag2_mid_neon   // 24
885        bl              sum_\type\()_lag2_mid_neon   // 32
886        bl              sum_\type\()_lag2_mid_neon   // 40
887        bl              sum_\type\()_lag2_mid_neon   // 48
888        bl              sum_\type\()_lag2_mid_neon   // 56
889        bl              sum_\type\()_lag2_mid_neon   // 64
890        bl              sum_\type\()_lag2_mid_neon   // 72
891        bl              sum_\type\()_lag2_right_neon // 80
892        get_grain_2     v16
893        subs            w1,  w1,  #1
894.ifc \type, uv_444
895        add             x19, x19, #4
896.endif
897        st1             {v16.s}[0], [x0], #4
898        b.gt            1b
899
900        ldp             x30, x19, [sp], #96
901        AARCH64_VALIDATE_LINK_REGISTER
902        ret
903
904L(generate_grain_\type\()_lag3):
905        AARCH64_VALID_JUMP_TARGET
906        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
907        stp             d8,  d9,  [sp, #16]
908        stp             d10, d11, [sp, #32]
909        stp             d12, d13, [sp, #48]
910        stp             d14, d15, [sp, #64]
911        stp             x20, x21, [sp, #80]
912
913        smov            w4,  v30.b[5]
914        smov            w20, v30.b[6]
915        smov            w21, v30.b[7]
916
917        mov             w1,  #3
918        bl              generate_grain_rows_neon
919
920        mov             w1,  #GRAIN_HEIGHT - 3
9211:
922        bl              sum_\type\()_lag3_left_neon  // 8
923        bl              sum_\type\()_lag3_mid_neon   // 16
924        bl              sum_\type\()_lag3_mid_neon   // 24
925        bl              sum_\type\()_lag3_mid_neon   // 32
926        bl              sum_\type\()_lag3_mid_neon   // 40
927        bl              sum_\type\()_lag3_mid_neon   // 48
928        bl              sum_\type\()_lag3_mid_neon   // 56
929        bl              sum_\type\()_lag3_mid_neon   // 64
930        bl              sum_\type\()_lag3_mid_neon   // 72
931        bl              sum_\type\()_lag3_right_neon // 80
932        get_grain_2     v16
933        subs            w1,  w1,  #1
934.ifc \type, uv_444
935        add             x19, x19, #4
936.endif
937        st1             {v16.s}[0], [x0], #4
938        b.gt            1b
939
940        ldp             x20, x21, [sp, #80]
941        ldp             d14, d15, [sp, #64]
942        ldp             d12, d13, [sp, #48]
943        ldp             d10, d11, [sp, #32]
944        ldp             d8,  d9,  [sp, #16]
945        ldp             x30, x19, [sp], #96
946        AARCH64_VALIDATE_LINK_REGISTER
947        ret
948
949L(gen_grain_\type\()_tbl):
950        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
951        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
952        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
953        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
954endfunc
955.endm
956
// Expand the gen_grain_82 macro twice: once with type `y` and once with
// type `uv_444`.
957gen_grain_82 y
958gen_grain_82 uv_444
959
// set_height dst, type
//   Loads the row count for the main generation loop into \dst:
//     type == uv_420 : SUB_GRAIN_HEIGHT - 3
//     otherwise      : GRAIN_HEIGHT - 3
//   The gen_grain_44 bodies call generate_grain_rows_44_neon with w1 = 3
//   right before using this count, which presumably accounts for the "- 3".
960.macro set_height dst, type
961.ifc \type, uv_420
962        mov             \dst,  #SUB_GRAIN_HEIGHT-3
963.else
964        mov             \dst,  #GRAIN_HEIGHT-3
965.endif
966.endm
967
// increment_y_ptr reg, type
//   Per-row adjustment of \reg (x19 in the gen_grain_44 loops):
//     type == uv_420 : \reg += 2*GRAIN_WIDTH*2 - 6*32
//     otherwise      : \reg -= 6*32 - GRAIN_WIDTH*2
//   The second form is written as a subtraction because GRAIN_WIDTH*2 (164)
//   is smaller than 6*32 (192), so the net step is negative.
//   NOTE(review): the 6*32 term presumably cancels what the six per-row
//   helper calls advance \reg by; those helpers are not visible here, so
//   confirm against their definitions.
968.macro increment_y_ptr reg, type
969.ifc \type, uv_420
970        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
971.else
972        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
973.endif
974.endm
975
// gen_grain_44 type
//   Defines the exported function generate_grain_\type\()_16bpc_neon.
//
//   Registers as the code below reads them on entry:
//     x0  output pointer; adjusted by GRAIN_WIDTH*2-6*16 after every row
//         (the stores themselves are done by helpers outside this macro)
//     x1  base used to form x19 = x1 + (3*GRAIN_WIDTH-3)*2
//     x2  structure addressed through the FGD_* offsets (copied into x1)
//     w3  index; w3*28 is the byte offset added to FGD_AR_COEFFS_UV and it
//         also selects the constant XORed into the seed
//     w4  value whose leading-zero count yields bitdepth_min_8
//         (presumably bitdepth_max - confirm against the C prototype)
//
//   State prepared before dispatching on the ar_coeff_lag field:
//     w2  = seed ^ (w3 != 0 ? 0x49d8 : 0xb524)
//     x3  = address of gaussian_sequence
//     x4  = address of the selected ar_coeffs_uv row
//     w5  = (128 << bitdepth_min_8) - 1,  w6 = -(128 << bitdepth_min_8)
//     w7  = ar_coeff_shift,               w8 = (1 << ar_coeff_shift) >> 1
//     w9  = 4 - bitdepth_min_8 + grain_scale_shift, w10 = (1 << w9) >> 1
//     v31 = -w9 in every 16-bit lane
//     x19 = pointer derived from x1, stepped per row by increment_y_ptr
//
//   Stack frame: 96 bytes. x30/x19 live at [sp]; only the lag3 path also
//   spills d8-d15 (offsets 16-79) and x20/x21 (offset 80).
976.macro gen_grain_44 type
977function generate_grain_\type\()_16bpc_neon, export=1
978        AARCH64_SIGN_LINK_REGISTER
979        stp             x30, x19, [sp, #-96]!
980
981        mov             w13, w3
982        mov             w14, #28
983        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2
984        mov             x1,  x2
985        mul             w13, w13, w14
986        clz             w15, w4
987
988        movrel          x3,  X(gaussian_sequence)
989        sub             w15, w15, #24 // -bitdepth_min_8
990        ldr             w2,  [x1, #FGD_SEED]
991        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
992        add             x4,  x1,  #FGD_AR_COEFFS_UV
993        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
        // The table holds 16-bit distances (table - target); the lag value
        // indexes it and the distance is subtracted from the table address.
994        adr             x16, L(gen_grain_\type\()_tbl)
995        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
996        add             w9,  w9,  #4
997        ldrh            w17, [x16, w17, uxtw #1]
998        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
999        sub             x16, x16, w17, uxtw
1000        neg             v31.8h,  v31.8h
1001
        // Flags from this cmp are consumed by the csel below; the add in
        // between does not modify them.
1002        cmp             w13, #0
1003        mov             w11, #0x49d8
1004        mov             w14, #0xb524
1005        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
1006        csel            w11, w11, w14, ne
1007
1008        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
1009        neg             w15, w15            // bitdepth_min_8
1010        mov             w8,  #1
1011        mov             w10, #1
1012        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
1013        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
1014        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
1015        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
1016        mov             w5,  #128
1017        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
1018        neg             w6,  w5             // -(128 << bitdepth_min_8)
1019        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
1020
1021        eor             w2,  w2,  w11
1022
        // Jump to the lag-specific body selected above.
1023        br              x16
1024
        // lag 0: a single coefficient (v27, widened to 16 bit), shift
        // amount negated in v28, grain max/min broadcast in v25/v26.
        // v29 has 16-bit lanes 0-2 clear and lanes 3-7 all ones; v30 has
        // only lane 0 all ones. They are copied into v1 before the first
        // and last helper call of each row (presumably as lane masks).
1025L(generate_grain_\type\()_lag0):
1026        AARCH64_VALID_JUMP_TARGET
1027        dup             v28.4s,  w7
1028        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
1029        movi            v0.16b,  #0
1030        movi            v1.16b,  #255
1031        dup             v25.8h,  w5
1032        dup             v26.8h,  w6
1033        ext             v29.16b, v0.16b,  v1.16b,  #10
1034        ext             v30.16b, v1.16b,  v0.16b,  #14
1035        neg             v28.4s,  v28.4s
1036        sxtl            v27.8h,  v27.8b
1037
1038        mov             w1,  #3
1039        bl              generate_grain_rows_44_neon
1040        set_height      w1,  \type
        // One iteration per remaining row; the trailing numbers are the
        // running column count after each helper call.
10411:
1042        mov             v1.16b,  v29.16b
1043        bl              gen_grain_\type\()_lag0_8_neon // 8
1044        movi            v1.16b,  #255
1045        bl              gen_grain_\type\()_lag0_8_neon // 16
1046        bl              gen_grain_\type\()_lag0_8_neon // 24
1047        bl              gen_grain_\type\()_lag0_8_neon // 32
1048        bl              gen_grain_\type\()_lag0_8_neon // 40
1049        mov             v1.16b,  v30.16b
1050        bl              gen_grain_\type\()_lag0_4_neon // 44
1051        subs            w1,  w1,  #1
1052        increment_y_ptr x19, \type
1053        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
1054        b.gt            1b
1055
1056        ldp             x30, x19, [sp], #96
1057        AARCH64_VALIDATE_LINK_REGISTER
1058        ret
1059
        // lag 1: coefficients 0-2 and 4 are broadcast into v27-v30,
        // coefficient 3 is kept as a sign-extended scalar in w4.
1060L(generate_grain_\type\()_lag1):
1061        AARCH64_VALID_JUMP_TARGET
1062        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_uv[0]
1063        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_uv[1]
1064        ld1r            {v29.8b}, [x4]      // ar_coeffs_uv[2]
1065        add             x4,  x4,  #2
1066
1067        mov             w1,  #3
        // x4 now addresses coefficient 4; it must be read before the next
        // load overwrites w4 (and thereby x4) with coefficient 3.
1068        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
1069        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
1070        bl              generate_grain_rows_44_neon
1071
1072        sxtl            v27.8h,  v27.8b
1073        sxtl            v28.8h,  v28.8b
1074        sxtl            v29.8h,  v29.8b
1075        sxtl            v30.8h,  v30.8b
1076        set_height      w1,  \type
10771:
1078        bl              sum_\type\()_lag1_left_neon  // 8
1079        bl              sum_\type\()_lag1_mid_neon   // 16
1080        bl              sum_\type\()_lag1_mid_neon   // 24
1081        bl              sum_\type\()_lag1_mid_neon   // 32
1082        bl              sum_\type\()_lag1_mid_neon   // 40
1083        bl              sum_\type\()_lag1_right_neon // 44
1084        subs            w1,  w1,  #1
1085        increment_y_ptr x19, \type
1086        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
1087        b.gt            1b
1088
1089        ldp             x30, x19, [sp], #96
1090        AARCH64_VALIDATE_LINK_REGISTER
1091        ret
1092
        // lag 2: 16 coefficient bytes stay packed in v30; bytes 10 and 11
        // are additionally extracted as sign-extended scalars in w4/w17.
1093L(generate_grain_\type\()_lag2):
1094        AARCH64_VALID_JUMP_TARGET
1095        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
1096
1097        smov            w4,  v30.b[10]
1098        smov            w17, v30.b[11]
1099
1100        mov             w1,  #3
1101        bl              generate_grain_rows_44_neon
1102
1103        set_height      w1,  \type
11041:
1105        bl              sum_\type\()_lag2_left_neon  // 8
1106        bl              sum_\type\()_lag2_mid_neon   // 16
1107        bl              sum_\type\()_lag2_mid_neon   // 24
1108        bl              sum_\type\()_lag2_mid_neon   // 32
1109        bl              sum_\type\()_lag2_mid_neon   // 40
1110        bl              sum_\type\()_lag2_right_neon // 44
1111        subs            w1,  w1,  #1
1112        increment_y_ptr x19, \type
1113        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
1114        b.gt            1b
1115
1116        ldp             x30, x19, [sp], #96
1117        AARCH64_VALIDATE_LINK_REGISTER
1118        ret
1119
        // lag 3: coefficient bytes 0-15 in v29 and 16-31 in v30; bytes 5-7
        // of v30 (coefficients 21-23) are extracted into w4/w20/w21.
        // d8-d15 and x20/x21 are callee-saved, so they are spilled into
        // the frame here and reloaded before returning.
1120L(generate_grain_\type\()_lag3):
1121        AARCH64_VALID_JUMP_TARGET
1122        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
1123        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
1124        stp             d8,  d9,  [sp, #16]
1125        stp             d10, d11, [sp, #32]
1126        stp             d12, d13, [sp, #48]
1127        stp             d14, d15, [sp, #64]
1128        stp             x20, x21, [sp, #80]
1129
1130        smov            w4,  v30.b[5]
1131        smov            w20, v30.b[6]
1132        smov            w21, v30.b[7]
1133
1134        mov             w1,  #3
1135        bl              generate_grain_rows_44_neon
1136
1137        set_height      w1,  \type
11381:
1139        bl              sum_\type\()_lag3_left_neon  // 8
1140        bl              sum_\type\()_lag3_mid_neon   // 16
1141        bl              sum_\type\()_lag3_mid_neon   // 24
1142        bl              sum_\type\()_lag3_mid_neon   // 32
1143        bl              sum_\type\()_lag3_mid_neon   // 40
1144        bl              sum_\type\()_lag3_right_neon // 44
1145        subs            w1,  w1,  #1
1146        increment_y_ptr x19, \type
1147        add             x0,  x0,  #GRAIN_WIDTH*2-6*16
1148        b.gt            1b
1149
1150        ldp             x20, x21, [sp, #80]
1151        ldp             d14, d15, [sp, #64]
1152        ldp             d12, d13, [sp, #48]
1153        ldp             d10, d11, [sp, #32]
1154        ldp             d8,  d9,  [sp, #16]
1155        ldp             x30, x19, [sp], #96
1156        AARCH64_VALIDATE_LINK_REGISTER
1157        ret
1158
        // Dispatch table: one 16-bit entry per lag value 0-3, each holding
        // the distance from the table back to the corresponding body.
1159L(gen_grain_\type\()_tbl):
1160        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
1161        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
1162        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
1163        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
1164endfunc
1165.endm
1166
// Expand gen_grain_44 for both types, producing
// generate_grain_uv_420_16bpc_neon and generate_grain_uv_422_16bpc_neon.
1167gen_grain_44 uv_420
1168gen_grain_44 uv_422
1169
// gather_interleaved dst1, dst2, src1, src2, off
//   Table lookup for eight lanes, with x3 as the table base address.
//   For lane i in 0..7 the index is read (zero-extended by umov) from
//   \src1[i] when i is even and from \src2[i] when i is odd, added to x3,
//   and one element is loaded from that address into lane i+\off of \dst1
//   (even i) or \dst2 (odd i). Lanes not named here are left unchanged.
//   The index extraction, address add and load of neighbouring lanes are
//   interleaved, rotating through x14-x17 as scratch.
//   Clobbers: x14, x15, x16, x17.
1170.macro gather_interleaved dst1, dst2, src1, src2, off
1171        umov            w14, \src1[0]
1172        umov            w15, \src2[1]
1173        umov            w16, \src1[2]
1174        add             x14, x14, x3
1175        umov            w17, \src2[3]
1176        add             x15, x15, x3
1177        ld1             {\dst1}[0+\off], [x14]
1178        umov            w14, \src1[4]
1179        add             x16, x16, x3
1180        ld1             {\dst2}[1+\off], [x15]
1181        umov            w15, \src2[5]
1182        add             x17, x17, x3
1183        ld1             {\dst1}[2+\off], [x16]
1184        umov            w16, \src1[6]
1185        add             x14, x14, x3
1186        ld1             {\dst2}[3+\off], [x17]
1187        umov            w17, \src2[7]
1188        add             x15, x15, x3
1189        ld1             {\dst1}[4+\off], [x14]
1190        add             x16, x16, x3
1191        ld1             {\dst2}[5+\off], [x15]
1192        add             x17, x17, x3
1193        ld1             {\dst1}[6+\off], [x16]
1194        ld1             {\dst2}[7+\off], [x17]
1195.endm
1196
// gather dst1, dst2, src1, src2, src3, src4
//   Full 32-lane lookup built from four gather_interleaved expansions.
//   Each pair of expansions swaps the roles of its operands so that the
//   even and odd lanes together cover all eight indices of a source:
//     \dst1 lanes 0-7  <- table[\src1 lanes 0-7]
//     \dst1 lanes 8-15 <- table[\src2 lanes 0-7]
//     \dst2 lanes 0-7  <- table[\src3 lanes 0-7]
//     \dst2 lanes 8-15 <- table[\src4 lanes 0-7]
//   Table base is x3; clobbers x14-x17 (see gather_interleaved).
1197.macro gather dst1, dst2, src1, src2, src3, src4
1198        gather_interleaved \dst1, \dst2, \src1, \src3, 0
1199        gather_interleaved \dst2, \dst1, \src3, \src1, 0
1200        gather_interleaved \dst1, \dst2, \src2, \src4, 8
1201        gather_interleaved \dst2, \dst1, \src4, \src2, 8
1202.endm
1203
// gather32_neon
//   In:  v0-v3 hold 32 16-bit indices, x3 is the byte table base.
//   Out: v6 = table bytes for the indices in v0 (bytes 0-7) and v1
//        (bytes 8-15); v7 = the same for v2 and v3.
//   Clobbers x14-x17. Leaf routine; returns through x30.
1204function gather32_neon
1205        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
1206        ret
1207endfunc
1208
// gather16_neon
//   In:  v0 and v1 hold 16 16-bit indices, x3 is the byte table base.
//   Out: v6 = table bytes for v0 (bytes 0-7) and v1 (bytes 8-15).
//   The lookups for v1 first land in the low half of v7 and are then
//   copied into the high half of v6, so the low half of v7 is overwritten.
//   Clobbers x14-x17. Leaf routine; returns through x30.
1209function gather16_neon
1210        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
1211        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
1212        ins             v6.d[1], v7.d[0]
1213        ret
1214endfunc
1215
// Overlap blend weights, two rows of four 16-bit values. fgy_32x32_16bpc_neon
// loads the first row into v27 and the second into v28; fguv selects this
// table when its \sx argument is 0. In fgy_loop_neon, v27 multiplies the
// "old" grain and v28 the current grain before a rounding shift right by 5,
// so the 0/32 pairs in lanes 2-3 pass the current grain through unchanged.
1216const overlap_coeffs_0, align=4
1217        .short 27, 17, 0,  0
1218        .short 17, 27, 32, 32
1219endconst
1220
// Overlap blend weights in the same two-row layout as overlap_coeffs_0;
// fguv selects this table (via overlap_coeffs_\sx) when its \sx argument
// is 1. Only lane 0 carries a real blend (23/22); lanes 1-3 are 0/32.
1221const overlap_coeffs_1, align=4
1222        .short 23, 0,  0,  0
1223        .short 22, 32, 32, 32
1224endconst
1225
// calc_offset offx, offy, src, sx, sy
//   Splits the value in \src into two offsets:
//     \offy = \src & 0xF, doubled when \sy == 0
//     \offx = \src >> 4,  doubled when \sx == 0
//   \sx and \sy must be assembly-time constants (they are tested with .if).
//   Callers pass the same register for \offx and \src, so the `and` that
//   reads \src has to stay ahead of the `lsr` that overwrites it.
1226.macro calc_offset offx, offy, src, sx, sy
1227        and             \offy, \src,  #0xF     // randval & 0xF
1228        lsr             \offx, \src,  #4       // randval >> 4
1229.if \sy == 0
1230        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
1231.endif
1232.if \sx == 0
1233        add             \offx, \offx, \offx    // 2 * (randval >> 4)
1234.endif
1235.endm
1236
// add_offset dst, offx, offy, src, stride
//   \dst = \src + \stride * \offy + 2 * zero_extend(\offx)
//   \offx is a 32-bit (w) register; `uxtw #1` widens it and scales it by
//   two bytes per entry. \stride, \offy, \src and \dst are 64-bit registers.
//   \dst may be the same register as \offy or \src, since those are only
//   read by the madd that first writes \dst.
1237.macro add_offset dst, offx, offy, src, stride
1238        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
1239        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
1240.endm
1241
1242// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
1243//                                 const ptrdiff_t stride,
1244//                                 const uint8_t scaling[SCALING_SIZE],
1245//                                 const int scaling_shift,
1246//                                 const entry grain_lut[][GRAIN_WIDTH],
1247//                                 const int offsets[][2],
1248//                                 const int h, const ptrdiff_t clip,
1249//                                 const ptrdiff_t type,
1250//                                 const int bitdepth_max);
//
// Argument locations as used below: x0 dst, x1 src, x2 stride, x3 scaling,
// w4 scaling_shift, x5 grain_lut, x6 offsets, w7 h; after the 80-byte frame
// is pushed, clip is at [sp, #80], type at [sp, #88] and bitdepth_max at
// [sp, #96].
//
// This function only prepares state and then branches (br, not bl) into one
// of the four loops in fgy_loop_neon, chosen by type through
// L(fgy_loop_tbl). That loop pops the frame pushed here and returns.
//
// State handed to the loop:
//   x4/x5/x6/x8  grain_lut pointers: old (left), current, top, top-old
//   x9           grain_lut row stride in bytes (GRAIN_WIDTH*2)
//   w7           rows to process (2 while y overlap is active)
//   w10          original h, only set when y overlap is active
//   v8/v9        first-row y overlap weights (only with y overlap)
//   v25/v26      grain_min / grain_max
//   v27/v28      overlap_coeffs_0 rows
//   v29          15 - scaling_shift in every 16-bit lane
//   v30/v31      lower / upper clamp for the output pixels
1251function fgy_32x32_16bpc_neon, export=1
1252        AARCH64_SIGN_LINK_REGISTER
        // Frame: x30 at [sp], d8-d14 at offsets 16-71.
1253        str             x30, [sp, #-80]!
1254        stp             d8,  d9,  [sp, #16]
1255        stp             d10, d11, [sp, #32]
1256        stp             d12, d13, [sp, #48]
1257        str             d14,      [sp, #64]
1258        eor             w4,  w4,  #15          // 15 - scaling_shift
1259        ldr             w11, [x6, #8]          // offsets[1][0]
1260        ldr             w13, [x6, #4]          // offsets[0][1]
1261        ldr             w15, [x6, #12]         // offsets[1][1]
1262        ldr             w10, [sp, #96]         // bitdepth_max
1263        ldr             w6,  [x6]              // offsets[0][0]
1264        dup             v26.8h,  w10           // bitdepth_max
1265        clz             w10, w10
1266        ldr             w8,  [sp, #80]         // clip
1267        sub             w10, w10, #24          // -bitdepth_min_8
1268        mov             x9,  #GRAIN_WIDTH*2    // grain_lut stride
1269        neg             w10, w10               // bitdepth_min_8
1270
1271        dup             v29.8h,  w4            // 15 - scaling_shift
        // v27 is only a temporary shift count here; it is reloaded with
        // overlap coefficients further down.
1272        dup             v27.8h,  w10           // bitdepth_min_8
1273
1274        movrel          x16, overlap_coeffs_0
1275
        // Output clamp: [16, 235] << bitdepth_min_8 when clip is non-zero,
        // otherwise [0, bitdepth_max].
1276        cbz             w8,  1f
1277        // clip
1278        movi            v30.8h,  #16
1279        movi            v31.8h,  #235
1280        sshl            v30.8h,  v30.8h,  v27.8h
1281        sshl            v31.8h,  v31.8h,  v27.8h
1282        b               2f
12831:
1284        // no clip
1285        movi            v30.8h,  #0
1286        mov             v31.16b, v26.16b       // bitdepth_max
12872:
1288
1289        ushr            v26.8h,  v26.8h,  #1   // grain_max
1290        not             v25.16b, v26.16b       // grain_min
1291
        // Last use of x16 as the coefficient table address; w16/x16 is
        // reused as an offset register by calc_offset/add_offset below.
1292        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
1293
1294        add             x5,  x5,  #18          // grain_lut += 9
1295        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
1296        add             x5,  x5,  x9           // grain_lut += grain_stride
1297
        // Split each of the four offsets values into an x part (w11, w13,
        // w15, w6) and a y part (w12, w14, w16, w10), both doubled.
1298        calc_offset     w11, w12, w11, 0,  0
1299        calc_offset     w13, w14, w13, 0,  0
1300        calc_offset     w15, w16, w15, 0,  0
1301        calc_offset     w6,  w10, w6,  0,  0
1302
        // Turn them into pointers relative to x5. x5 itself is updated
        // last because the three earlier expansions still read it.
1303        add_offset      x12, w11, x12, x5,  x9
1304        add_offset      x14, w13, x14, x5,  x9
1305        add_offset      x16, w15, x16, x5,  x9
1306        add_offset      x5,  w6,  x10, x5,  x9
1307
1308        ldr             w11, [sp, #88]         // type
1309        adr             x13, L(fgy_loop_tbl)
1310
1311        add             x4,  x12, #32*2        // grain_lut += FG_BLOCK_SIZE * bx
1312        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1313
        // Bit 0 of type selects y overlap; the flags set here survive until
        // the b.eq below because nothing in between writes them.
1314        tst             w11, #1
1315        ldrh            w11, [x13, w11, uxtw #1]
1316
1317        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1318        add             x8,  x8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
1319
        // Table entries are (table - target), so subtract to get the target.
1320        sub             x11, x13, w11, uxtw
1321
1322        b.eq            1f
1323        // y overlap
1324        dup             v8.8h,   v27.h[0]
1325        dup             v9.8h,   v27.h[1]
1326        mov             w10, w7                // backup actual h
1327        mov             w7,  #2
13281:
1329        br              x11
1330endfunc
1331
// fgy_loop_neon
//   Holds the four row loops that fgy_32x32_16bpc_neon branches into via
//   L(fgy_loop_tbl); it is never entered at its start label by the code
//   shown here. Each loop is one expansion of the local macro `fgy ox, oy`
//   (ox = blend with the grain to the left, oy = blend with the grain above).
//
//   Expected on entry (set up by fgy_32x32_16bpc_neon):
//     x0 dst, x1 src, x2 stride, x3 scaling table (read by gather32_neon),
//     x4/x5/x6/x8 grain_lut old/current/top/top-old, x9 grain_lut stride,
//     w7 row count, w10 full h (oy variants only), v8/v9 y overlap weights,
//     v25/v26 grain min/max, v27/v28 x overlap weights,
//     v29 = 15 - scaling_shift, v30/v31 output clamp.
//
//   Per row: load 32 source pixels and 32 grain values, look up a scaling
//   byte for every pixel, blend the grain with its neighbours as selected
//   by ox/oy, scale it, add it to the source with unsigned saturation,
//   clamp to [v30, v31] and store. On exit the frame created by
//   fgy_32x32_16bpc_neon (d8-d14, x30) is popped and the function returns.
1332function fgy_loop_neon
1333.macro fgy ox, oy
1334L(loop_\ox\oy):
1335        AARCH64_VALID_JUMP_TARGET
13361:
1337        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
1338.if \ox
1339        ld1             {v20.4h},                         [x4],  x9 // grain_lut old
1340.endif
1341.if \oy
1342        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6],  x9 // grain_lut top
1343.endif
1344.if \ox && \oy
1345        ld1             {v14.4h},                         [x8],  x9 // grain_lut top old
1346.endif
1347        mvni            v4.8h,   #0xf0, lsl #8 // 0x0fff
1348        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x9 // grain_lut
1349
1350        // Make sure that uninitialized pixels out of range past the right
1351        // edge are in range; their actual values shouldn't matter.
1352        and             v0.16b,  v0.16b,  v4.16b
1353        and             v1.16b,  v1.16b,  v4.16b
1354        and             v2.16b,  v2.16b,  v4.16b
1355        and             v3.16b,  v3.16b,  v4.16b
        // Scaling bytes for v0-v3 arrive in v6/v7. The call overwrites x30;
        // the real return address is reloaded from the stack at the end.
1356        bl              gather32_neon
1357
        // x overlap, first four lanes: old * v27 + current * v28.
1358.if \ox
1359        smull           v20.4s,  v20.4h,  v27.4h
1360        smlal           v20.4s,  v16.4h,  v28.4h
1361.endif
1362
1363.if \oy
        // With both overlaps, do the same x blend for the top row (v14) and
        // finish both: rounding shift by 5, then clamp to [v25, v26].
1364.if \ox
1365        smull           v14.4s,  v14.4h,  v27.4h
1366        smlal           v14.4s,  v21.4h,  v28.4h
1367        sqrshrn         v20.4h,  v20.4s,  #5
1368        sqrshrn         v14.4h,  v14.4s,  #5
1369        smin            v20.4h,  v20.4h,  v26.4h
1370        smin            v14.4h,  v14.4h,  v26.4h
1371        smax            v20.4h,  v20.4h,  v25.4h
1372        smax            v14.4h,  v14.4h,  v25.4h
1373.endif
1374
        // y overlap: current * v9 + top * v8 over all 32 lanes. With ox the
        // first four lanes take the x-blended values (v20 / v14) instead.
1375.if \ox
1376        smull           v10.4s,  v20.4h,  v9.4h
1377.else
1378        smull           v10.4s,  v16.4h,  v9.4h
1379.endif
1380        smull2          v11.4s,  v16.8h,  v9.8h
1381        smull           v12.4s,  v17.4h,  v9.4h
1382        smull2          v13.4s,  v17.8h,  v9.8h
1383        smull           v16.4s,  v18.4h,  v9.4h
1384        smull2          v17.4s,  v18.8h,  v9.8h
1385        smull           v18.4s,  v19.4h,  v9.4h
1386        smull2          v19.4s,  v19.8h,  v9.8h
1387.if \ox
1388        smlal           v10.4s,  v14.4h,  v8.4h
1389.else
1390        smlal           v10.4s,  v21.4h,  v8.4h
1391.endif
1392        smlal2          v11.4s,  v21.8h,  v8.8h
1393        smlal           v12.4s,  v22.4h,  v8.4h
1394        smlal2          v13.4s,  v22.8h,  v8.8h
1395        smlal           v16.4s,  v23.4h,  v8.4h
1396        smlal2          v17.4s,  v23.8h,  v8.8h
1397        smlal           v18.4s,  v24.4h,  v8.4h
1398        smlal2          v19.4s,  v24.8h,  v8.8h
        // Rounding shift by 5 back to 16 bit, then clamp to [v25, v26];
        // the blended grain ends up in v16-v19 again.
1399        sqrshrn         v10.4h,  v10.4s,  #5
1400        sqrshrn2        v10.8h,  v11.4s,  #5
1401        sqrshrn         v11.4h,  v12.4s,  #5
1402        sqrshrn2        v11.8h,  v13.4s,  #5
1403        sqrshrn         v12.4h,  v16.4s,  #5
1404        sqrshrn2        v12.8h,  v17.4s,  #5
1405        sqrshrn         v13.4h,  v18.4s,  #5
1406        sqrshrn2        v13.8h,  v19.4s,  #5
1407        smin            v16.8h,  v10.8h,  v26.8h
1408        smin            v17.8h,  v11.8h,  v26.8h
1409        smin            v18.8h,  v12.8h,  v26.8h
1410        smin            v19.8h,  v13.8h,  v26.8h
1411        smax            v16.8h,  v16.8h,  v25.8h
1412        smax            v17.8h,  v17.8h,  v25.8h
1413        smax            v18.8h,  v18.8h,  v25.8h
1414        smax            v19.8h,  v19.8h,  v25.8h
1415.endif
1416
        // Widen the gathered scaling bytes to 16 bit. In the ox-only
        // variant the x blend is finished in between these instructions
        // and its four lanes replace the low half of v16.
1417        uxtl            v4.8h,   v6.8b            // scaling
1418.if \ox && !\oy
1419        sqrshrn         v20.4h,  v20.4s,  #5
1420.endif
1421        uxtl2           v5.8h,   v6.16b
1422.if \ox && !\oy
1423        smin            v20.4h,  v20.4h,  v26.4h
1424.endif
1425        uxtl            v6.8h,   v7.8b
1426.if \ox && !\oy
1427        smax            v20.4h,  v20.4h,  v25.4h
1428.endif
1429        uxtl2           v7.8h,   v7.16b
1430.if \ox && !\oy
1431        ins             v16.d[0], v20.d[0]
1432.endif
1433        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
1434        ushl            v5.8h,   v5.8h,   v29.8h
1435        ushl            v6.8h,   v6.8h,   v29.8h
1436        ushl            v7.8h,   v7.8h,   v29.8h
1437
1438        sqrdmulh        v20.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
1439        sqrdmulh        v21.8h,  v17.8h,  v5.8h
1440        sqrdmulh        v22.8h,  v18.8h,  v6.8h
1441        sqrdmulh        v23.8h,  v19.8h,  v7.8h
1442
        // Signed noise added to unsigned pixels with unsigned saturation.
1443        usqadd          v0.8h,   v20.8h           // *src + noise
1444        usqadd          v1.8h,   v21.8h
1445        usqadd          v2.8h,   v22.8h
1446        usqadd          v3.8h,   v23.8h
1447
        // Clamp to [v30, v31].
1448        umax            v0.8h,   v0.8h,   v30.8h
1449        umax            v1.8h,   v1.8h,   v30.8h
1450        umax            v2.8h,   v2.8h,   v30.8h
1451        umax            v3.8h,   v3.8h,   v30.8h
1452        umin            v0.8h,   v0.8h,   v31.8h
1453        umin            v1.8h,   v1.8h,   v31.8h
1454        umin            v2.8h,   v2.8h,   v31.8h
1455        umin            v3.8h,   v3.8h,   v31.8h
1456
        // The flags from subs are consumed by b.gt below; dup and st1 in
        // between leave them untouched.
1457        subs            w7,  w7,  #1
        // Switch v8/v9 to the weights taken from v28 for the next row.
1458.if \oy
1459        dup             v8.8h,   v28.h[0]
1460        dup             v9.8h,   v28.h[1]
1461.endif
1462        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
1463        b.gt            1b
1464
        // After the y-overlapped rows, continue with the remaining h - 2
        // rows in the variant without y overlap, if any are left.
1465.if \oy
1466        cmp             w10, #2
1467        sub             w7,  w10, #2           // restore actual remaining h
1468        b.gt            L(loop_\ox\()0)
1469.endif
        // Pop the frame that fgy_32x32_16bpc_neon pushed and return.
1470        ldr             d14,      [sp, #64]
1471        ldp             d12, d13, [sp, #48]
1472        ldp             d10, d11, [sp, #32]
1473        ldp             d8,  d9,  [sp, #16]
1474        ldr             x30, [sp], #80
1475        AARCH64_VALIDATE_LINK_REGISTER
1476        ret
1477.endm
1478
1479        fgy             0, 0
1480        fgy             0, 1
1481        fgy             1, 0
1482        fgy             1, 1
1483
        // Indexed by type: entry n holds the distance from the table back
        // to the loop for that type value.
1484L(fgy_loop_tbl):
1485        .hword L(fgy_loop_tbl) - L(loop_00)
1486        .hword L(fgy_loop_tbl) - L(loop_01)
1487        .hword L(fgy_loop_tbl) - L(loop_10)
1488        .hword L(fgy_loop_tbl) - L(loop_11)
1489endfunc
1490
1491// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
1492//                                      const pixel *const src,
1493//                                      const ptrdiff_t stride,
1494//                                      const uint8_t scaling[SCALING_SIZE],
1495//                                      const Dav1dFilmGrainData *const data,
1496//                                      const entry grain_lut[][GRAIN_WIDTH],
1497//                                      const pixel *const luma_row,
1498//                                      const ptrdiff_t luma_stride,
1499//                                      const int offsets[][2],
1500//                                      const ptrdiff_t h, const ptrdiff_t uv,
1501//                                      const ptrdiff_t is_id,
1502//                                      const ptrdiff_t type,
1503//                                      const int bitdepth_max);
1504.macro fguv layout, sx, sy
1505function fguv_32x32_\layout\()_16bpc_neon, export=1
1506        AARCH64_SIGN_LINK_REGISTER
1507        str             x30,      [sp, #-80]!
1508        stp             d8,  d9,  [sp, #16]
1509        stp             d10, d11, [sp, #32]
1510        stp             d12, d13, [sp, #48]
1511        stp             d14, d15, [sp, #64]
1512
1513        ldp             x8,  x9,  [sp, #80]    // offsets, h
1514        ldp             x10, x11, [sp, #96]    // uv, is_id
1515        ldr             w16,      [sp, #120]   // bitdepth_max
1516
1517        ldr             w13, [x4, #FGD_SCALING_SHIFT]
1518        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
1519        dup             v23.8h,  w16           // bitdepth_max
1520        clz             w16, w16
1521        eor             w13, w13, #15          // 15 - scaling_shift
1522        sub             w16, w16, #24          // -bitdepth_min_8
1523
1524        // !csfl
1525        add             x10, x4,  x10, lsl #2  // + 4*uv
1526        add             x14, x10, #FGD_UV_LUMA_MULT
1527        add             x15, x10, #FGD_UV_MULT
1528        add             x10, x10, #FGD_UV_OFFSET
1529        neg             w16, w16               // bitdepth_min_8
1530        ld1r            {v8.8h},  [x14]        // uv_luma_mult
1531        ld1r            {v24.8h}, [x10]        // uv_offset
1532        ld1r            {v9.8h},  [x15]        // uv_mult
1533
1534        dup             v29.8h,  w13           // 15 - scaling_shift
1535        dup             v27.8h,  w16           // bitdepth_min_8
1536
1537        cbz             w12, 1f
1538        // clip
1539        movi            v30.8h,  #16
1540        movi            v31.8h,  #240
1541        sshl            v30.8h,  v30.8h,  v27.8h
1542        sshl            v31.8h,  v31.8h,  v27.8h
1543        cbz             w11, 2f
1544        // is_id
1545        movi            v31.8h,  #235
1546        sshl            v31.8h,  v31.8h,  v27.8h
1547        b               2f
15481:
1549        // no clip
1550        movi            v30.8h,  #0
1551        mov             v31.16b, v23.16b       // bitdepth_max
15522:
1553
1554        ushr            v15.8h,  v23.8h,  #1   // grain_max
1555        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
1556        not             v14.16b, v15.16b       // grain_min
1557
1558        ldr             w12, [x8, #8]          // offsets[1][0]
1559        ldr             w14, [x8, #4]          // offsets[0][1]
1560        ldr             w16, [x8, #12]         // offsets[1][1]
1561        ldr             w8,  [x8]              // offsets[0][0]
1562
1563        mov             x10, #GRAIN_WIDTH*2    // grain_lut stride
1564
1565        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
1566.if \sy
1567        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
1568        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
1569.else
1570        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
1571        add             x5,  x5,  x10          // grain_lut += grain_stride
1572.endif
1573
1574        calc_offset     w12, w13, w12, \sx, \sy
1575        calc_offset     w14, w15, w14, \sx, \sy
1576        calc_offset     w16, w17, w16, \sx, \sy
1577        calc_offset     w8,  w11, w8,  \sx, \sy
1578
1579        add_offset      x13, w12, x13, x5,  x10
1580        add_offset      x15, w14, x15, x5,  x10
1581        add_offset      x17, w16, x17, x5,  x10
1582        add_offset      x5,  w8,  x11, x5,  x10
1583
1584        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
1585        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1586        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1587        add             x11, x11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
1588
1589        ldr             w13, [sp, #112]        // type
1590
1591        movrel          x16, overlap_coeffs_\sx
1592        adr             x14, L(fguv_loop_sx\sx\()_tbl)
1593
1594        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
1595        tst             w13, #1
1596        ldrh            w13, [x14, w13, uxtw #1]
1597
1598        b.eq            1f
1599        // y overlap
1600        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
1601        mov             w9,  #(2 >> \sy)
1602
16031:
1604        sub             x13, x14, w13, uxtw
1605
1606.if \sy
1607        movi            v25.8h,  #23
1608        movi            v26.8h,  #22
1609.else
1610        movi            v25.8h,  #27
1611        movi            v26.8h,  #17
1612.endif
1613
1614.if \sy
1615        add             x7,  x7,  x7           // luma_stride *= 2
1616.endif
1617
1618        br              x13
1619endfunc
1620.endm
1621
// Instantiate the fguv macro once per chroma layout. The macro header is
// outside this view; its body consumes the last two arguments as sx and sy
// (presumably the horizontal and vertical chroma subsampling flags) and
// branches into fguv_loop_sx0_neon or fguv_loop_sx1_neon depending on sx.
// NOTE(review): the meaning of the first argument (420/422/444) is assumed
// to be the layout name used in the generated symbol; confirm at the macro
// definition.
1622fguv 420, 1, 1
1623fguv 422, 1, 0
1624fguv 444, 0, 0
1625
// fguv_loop_sx0_neon: per-row chroma grain application loops for sx == 0.
// Each loop iteration processes one row of 32 pixels held as 16-bit lanes.
//
// This code is not entered through a normal call: the fguv setup code loads a
// 16-bit offset from the table at the end of this function (indexed by
// "type"), subtracts it from the table address and branches there with "br".
//
// Register state expected on entry, as prepared by the fguv setup code:
//   x0        dst pointer, advanced by x2 per row
//   x1        src pointer, advanced by x2 per row
//   x4        grain_lut pointer of the block to the left ("old")
//   x5        grain_lut pointer of the current block
//   x8        grain_lut pointer of the block above ("top")
//   x11       grain_lut pointer of the block above-left ("top old")
//   x10       grain_lut stride in bytes
//   x6, x7    luma pointer and luma stride
//   w9        number of rows to process in this pass
//   w12       rows left after the y-overlap rows (only read by oy variants)
//   v8, v9    uv_luma_mult, uv_mult
//   v24       uv_offset << bitdepth_min_8
//   v23       bitdepth_max
//   v14, v15  grain_min, grain_max
//   v25, v26  vertical overlap weights for the top / current grain row
//   v27, v28  horizontal overlap weights for the old / current grain columns
//   v29       15 - scaling_shift
//   v30, v31  lower / upper clamp applied to the output pixels
// gather32_neon is defined elsewhere and may rely on further registers.
//
// All variants leave through the shared epilogue at label 9, which restores
// d8-d15 and x30 and releases an 80 byte stack frame.
// NOTE(review): the matching register save is presumably done by the fguv
// entry code, which is outside this view; confirm the frame layout there.
1626function fguv_loop_sx0_neon
// One loop variant is generated per (csfl, ox, oy) combination:
//   csfl  zero: the scaling lookup input is
//               ((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset,
//               clamped to [0, bitdepth_max];
//         nonzero: the luma values are used directly, masked with
//               bitdepth_max (presumably "chroma scaling from luma").
//   ox    nonzero: blend the 4 leftmost grain values with the old block.
//   oy    nonzero: blend the grain row with the corresponding top block row.
1627.macro fguv_loop_sx0 csfl, ox, oy
1628L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
1629        AARCH64_VALID_JUMP_TARGET
            // Head of the row loop; "b.gt 1b" below returns here.
16301:
1631.if \ox
1632        ld1             {v4.4h}, [x4],  x10  // grain_lut old
1633.endif
1634.if \oy
1635        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x8],  x10 // grain_lut top
1636.endif
1637.if \ox && \oy
1638        ld1             {v5.4h}, [x11], x10  // grain_lut top old
1639.endif
1640        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut
1641
1642.if \ox
            // Horizontal overlap, current row: old * v27 + cur * v28 for the
            // 4 leftmost values, accumulated in 32 bits.
1643        smull           v4.4s,   v4.4h,   v27.4h
1644        smlal           v4.4s,   v16.4h,  v28.4h
1645.endif
1646
1647.if \oy
1648.if \ox
            // Horizontal overlap for the top row as well, then round-shift
            // both results by 5, clamp them to [grain_min, grain_max] and
            // insert them into the low 4 lanes of the current and top rows.
1649        smull           v5.4s,   v5.4h,   v27.4h
1650        smlal           v5.4s,   v0.4h,   v28.4h
1651        sqrshrn         v4.4h,   v4.4s,   #5
1652        sqrshrn         v5.4h,   v5.4s,   #5
1653        smin            v4.4h,   v4.4h,   v15.4h
1654        smin            v5.4h,   v5.4h,   v15.4h
1655        smax            v4.4h,   v4.4h,   v14.4h
1656        smax            v5.4h,   v5.4h,   v14.4h
1657        ins             v16.d[0], v4.d[0]
1658        ins             v0.d[0],  v5.d[0]
1659.endif
1660
            // Vertical overlap: cur * v26 + top * v25 for all 32 values,
            // followed by a rounding shift right by 5 with saturating
            // narrowing. Results land in v6, v7, v10, v11; v16-v19 are reused
            // as 32-bit accumulators once their grain values have been read.
1661        smull           v6.4s,   v16.4h,  v26.4h
1662        smull2          v7.4s,   v16.8h,  v26.8h
1663        smull           v10.4s,  v17.4h,  v26.4h
1664        smull2          v11.4s,  v17.8h,  v26.8h
1665        smull           v16.4s,  v18.4h,  v26.4h
1666        smull2          v17.4s,  v18.8h,  v26.8h
1667        smull           v18.4s,  v19.4h,  v26.4h
1668        smull2          v19.4s,  v19.8h,  v26.8h
1669        smlal           v6.4s,   v0.4h,   v25.4h
1670        smlal2          v7.4s,   v0.8h,   v25.8h
1671        smlal           v10.4s,  v1.4h,   v25.4h
1672        smlal2          v11.4s,  v1.8h,   v25.8h
1673        smlal           v16.4s,  v2.4h,   v25.4h
1674        smlal2          v17.4s,  v2.8h,   v25.8h
1675        smlal           v18.4s,  v3.4h,   v25.4h
1676        smlal2          v19.4s,  v3.8h,   v25.8h
1677        sqrshrn         v6.4h,   v6.4s,   #5
1678        sqrshrn2        v6.8h,   v7.4s,   #5
1679        sqrshrn         v7.4h,   v10.4s,  #5
1680        sqrshrn2        v7.8h,   v11.4s,  #5
1681        sqrshrn         v10.4h,  v16.4s,  #5
1682        sqrshrn2        v10.8h,  v17.4s,  #5
1683        sqrshrn         v11.4h,  v18.4s,  #5
1684        sqrshrn2        v11.8h,  v19.4s,  #5
1685.endif
1686
            // From here on the luma and src loads are interleaved with the
            // remaining grain arithmetic (presumably for scheduling). The
            // luma load overwrites v0-v3 and the src load overwrites v10-v13,
            // so the statement order matters.
1687.if \ox && !\oy
            // Horizontal-only overlap: narrow and apply the upper clamp.
1688        sqrshrn         v4.4h,   v4.4s,   #5
1689        smin            v4.4h,   v4.4h,   v15.4h
1690.endif
1691        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
1692.if \oy
            // Clamp the vertically blended grain to [grain_min, grain_max]
            // and move it back into v16-v19 (done before v10/v11 are
            // overwritten by the src load below).
1693        smin            v16.8h,  v6.8h,   v15.8h
1694        smin            v17.8h,  v7.8h,   v15.8h
1695        smin            v18.8h,  v10.8h,  v15.8h
1696        smin            v19.8h,  v11.8h,  v15.8h
1697        smax            v16.8h,  v16.8h,  v14.8h
1698        smax            v17.8h,  v17.8h,  v14.8h
1699        smax            v18.8h,  v18.8h,  v14.8h
1700        smax            v19.8h,  v19.8h,  v14.8h
1701.endif
1702
1703.if \ox && !\oy
            // Horizontal-only overlap: lower clamp.
1704        smax            v4.4h,   v4.4h,   v14.4h
1705.endif
1706        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
1707.if \ox && !\oy
            // Replace the 4 leftmost grain values with the blended ones.
1708        ins             v16.d[0], v4.d[0]
1709.endif
1710
1711.if !\csfl
            // luma * uv_luma_mult + src * uv_mult in 32 bits, shifted right
            // by 6 (truncating narrow), plus uv_offset, clamped to
            // [0, bitdepth_max]. The result in v0-v3 feeds the lookup below.
1712        smull           v4.4s,   v0.4h,   v8.4h
1713        smull2          v5.4s,   v0.8h,   v8.8h
1714        smull           v6.4s,   v1.4h,   v8.4h
1715        smull2          v7.4s,   v1.8h,   v8.8h
1716        smull           v0.4s,   v2.4h,   v8.4h
1717        smull2          v1.4s,   v2.8h,   v8.8h
1718        smull           v2.4s,   v3.4h,   v8.4h
1719        smull2          v3.4s,   v3.8h,   v8.8h
1720        smlal           v4.4s,   v10.4h,  v9.4h
1721        smlal2          v5.4s,   v10.8h,  v9.8h
1722        smlal           v6.4s,   v11.4h,  v9.4h
1723        smlal2          v7.4s,   v11.8h,  v9.8h
1724        smlal           v0.4s,   v12.4h,  v9.4h
1725        smlal2          v1.4s,   v12.8h,  v9.8h
1726        smlal           v2.4s,   v13.4h,  v9.4h
1727        smlal2          v3.4s,   v13.8h,  v9.8h
1728        shrn            v4.4h,   v4.4s,   #6
1729        shrn2           v4.8h,   v5.4s,   #6
1730        shrn            v5.4h,   v6.4s,   #6
1731        shrn2           v5.8h,   v7.4s,   #6
1732        shrn            v6.4h,   v0.4s,   #6
1733        shrn2           v6.8h,   v1.4s,   #6
1734        shrn            v7.4h,   v2.4s,   #6
1735        shrn2           v7.8h,   v3.4s,   #6
1736        add             v0.8h,   v4.8h,   v24.8h
1737        add             v1.8h,   v5.8h,   v24.8h
1738        add             v2.8h,   v6.8h,   v24.8h
1739        add             v3.8h,   v7.8h,   v24.8h
1740        movi            v20.8h,  #0
1741        smin            v0.8h,   v0.8h,   v23.8h
1742        smin            v1.8h,   v1.8h,   v23.8h
1743        smin            v2.8h,   v2.8h,   v23.8h
1744        smin            v3.8h,   v3.8h,   v23.8h
1745        smax            v0.8h,   v0.8h,   v20.8h
1746        smax            v1.8h,   v1.8h,   v20.8h
1747        smax            v2.8h,   v2.8h,   v20.8h
1748        smax            v3.8h,   v3.8h,   v20.8h
1749.else
1750        // Make sure that uninitialized pixels out of range past the right
1751        // edge are in range; their actual values shouldn't matter.
1752        and             v0.16b,  v0.16b,  v23.16b
1753        and             v1.16b,  v1.16b,  v23.16b
1754        and             v2.16b,  v2.16b,  v23.16b
1755        and             v3.16b,  v3.16b,  v23.16b
1756.endif
1757
            // gather32_neon is defined elsewhere; the code below consumes its
            // result as 32 scaling bytes in v6/v7 (presumably looked up with
            // the values in v0-v3; confirm against its definition). The bl
            // overwrites x30, which is why the epilogue reloads x30 from the
            // stack before returning.
1758        bl              gather32_neon
1759
            // Widen the scaling bytes to 16-bit lanes.
1760        uxtl            v4.8h,   v6.8b            // scaling
1761        uxtl2           v5.8h,   v6.16b
1762        uxtl            v6.8h,   v7.8b
1763        uxtl2           v7.8h,   v7.16b
1764
1765        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
1766        ushl            v5.8h,   v5.8h,   v29.8h
1767        ushl            v6.8h,   v6.8h,   v29.8h
1768        ushl            v7.8h,   v7.8h,   v29.8h
1769
1770        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
1771        sqrdmulh        v17.8h,  v17.8h,  v5.8h
1772        sqrdmulh        v18.8h,  v18.8h,  v6.8h
1773        sqrdmulh        v19.8h,  v19.8h,  v7.8h
1774
            // Unsigned saturating accumulate of the signed noise into src.
1775        usqadd          v10.8h,  v16.8h           // *src + noise
1776        usqadd          v11.8h,  v17.8h
1777        usqadd          v12.8h,  v18.8h
1778        usqadd          v13.8h,  v19.8h
1779
            // Clamp the result to [v30, v31].
1780        umax            v0.8h,   v10.8h,  v30.8h
1781        umax            v1.8h,   v11.8h,  v30.8h
1782        umax            v2.8h,   v12.8h,  v30.8h
1783        umax            v3.8h,   v13.8h,  v30.8h
1784        umin            v0.8h,   v0.8h,   v31.8h
1785        umin            v1.8h,   v1.8h,   v31.8h
1786        umin            v2.8h,   v2.8h,   v31.8h
1787        umin            v3.8h,   v3.8h,   v31.8h
1788
            // Decrement the row counter; the flags set here are consumed by
            // the b.gt below (dup and st1 do not modify the flags).
1789        subs            w9,  w9,  #1
1790.if \oy
            // Vertical weights for any following overlap row, taken from
            // lanes 0 and 1 of v28.
1791        dup             v25.8h,  v28.h[0]
1792        dup             v26.8h,  v28.h[1]
1793.endif
1794        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
1795        b.gt            1b
1796
1797.if \oy
            // The y-overlap rows are done; if rows remain, continue in the
            // variant with the same csfl/ox but without y overlap.
1798        cmp             w12, #0
1799        mov             w9,  w12               // restore actual remaining h
1800        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
1801.endif
            // Every variant jumps to the shared epilogue at label 9 below.
1802        b               9f
1803.endm
1804        fguv_loop_sx0   0, 0, 0
1805        fguv_loop_sx0   0, 0, 1
1806        fguv_loop_sx0   0, 1, 0
1807        fguv_loop_sx0   0, 1, 1
1808        fguv_loop_sx0   1, 0, 0
1809        fguv_loop_sx0   1, 0, 1
1810        fguv_loop_sx0   1, 1, 0
1811        fguv_loop_sx0   1, 1, 1
1812
            // Shared epilogue: restore d8-d15 and x30 from the stack frame,
            // release its 80 bytes and return.
18139:
1814        ldp             d14, d15, [sp, #64]
1815        ldp             d12, d13, [sp, #48]
1816        ldp             d10, d11, [sp, #32]
1817        ldp             d8,  d9,  [sp, #16]
1818        ldr             x30,      [sp], #80
1819        AARCH64_VALIDATE_LINK_REGISTER
1820        ret
1821
            // Dispatch table read by the fguv setup code. Each entry is the
            // 16-bit distance from the table label back to a loop variant;
            // the entry index is (csfl << 2) | (ox << 1) | oy.
1822L(fguv_loop_sx0_tbl):
1823        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
1824        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
1825        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
1826        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
1827        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
1828        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
1829        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
1830        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
1831endfunc
1832
// fguv_loop_sx1_neon: per-row chroma grain application loops for sx == 1.
// Each loop iteration processes one row of 16 chroma pixels held as 16-bit
// lanes, reading 32 luma values per row and averaging horizontally adjacent
// pairs of them.
//
// This code is not entered through a normal call: the fguv setup code loads a
// 16-bit offset from the table at the end of this function (indexed by
// "type"), subtracts it from the table address and branches there with "br".
//
// Register state expected on entry, as prepared by the fguv setup code:
//   x0        dst pointer, advanced by x2 per row
//   x1        src pointer, advanced by x2 per row
//   x4        grain_lut pointer of the block to the left ("old")
//   x5        grain_lut pointer of the current block
//   x8        grain_lut pointer of the block above ("top")
//   x11       grain_lut pointer of the block above-left ("top old")
//   x10       grain_lut stride in bytes
//   x6, x7    luma pointer and luma stride (the setup code doubles the
//             stride when sy is set)
//   w9        number of rows to process in this pass
//   w12       rows left after the y-overlap rows (only read by oy variants)
//   v8, v9    uv_luma_mult, uv_mult
//   v24       uv_offset << bitdepth_min_8
//   v23       bitdepth_max
//   v14, v15  grain_min, grain_max
//   v25, v26  vertical overlap weights for the top / current grain row
//   v27, v28  horizontal overlap weights for the old / current grain columns
//   v29       15 - scaling_shift
//   v30, v31  lower / upper clamp applied to the output pixels
// gather16_neon is defined elsewhere and may rely on further registers.
//
// All variants leave through the shared epilogue at label 9, which restores
// d8-d15 and x30 and releases an 80 byte stack frame.
// NOTE(review): the matching register save is presumably done by the fguv
// entry code, which is outside this view; confirm the frame layout there.
1833function fguv_loop_sx1_neon
// One loop variant is generated per (csfl, ox, oy) combination:
//   csfl  zero: the scaling lookup input is
//               ((luma_avg * uv_luma_mult + src * uv_mult) >> 6) + uv_offset,
//               clamped to [0, bitdepth_max];
//         nonzero: the averaged luma values are used directly, masked with
//               bitdepth_max (presumably "chroma scaling from luma").
//   ox    nonzero: blend the 4 leftmost grain values with the old block.
//   oy    nonzero: blend the grain row with the corresponding top block row.
1834.macro fguv_loop_sx1 csfl, ox, oy
1835L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
1836        AARCH64_VALID_JUMP_TARGET
            // Head of the row loop; "b.gt 1b" below returns here.
18371:
1838.if \ox
1839        ld1             {v18.4h}, [x4],  x10  // grain_lut old
1840.endif
1841.if \oy
1842        ld1             {v20.8h, v21.8h},  [x8],  x10 // grain_lut top
1843.endif
1844.if \ox && \oy
1845        ld1             {v19.4h}, [x11], x10  // grain_lut top old
1846.endif
1847        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut
1848
1849.if \ox
            // Horizontal overlap, current row: old * v27 + cur * v28 for the
            // 4 leftmost values, accumulated in 32 bits.
1850        smull           v18.4s,  v18.4h,  v27.4h
1851        smlal           v18.4s,  v16.4h,  v28.4h
1852.endif
1853
1854.if \oy
1855.if \ox
            // Horizontal overlap for the top row as well, then round-shift
            // both results by 5, clamp them to [grain_min, grain_max] and
            // insert them into the low 4 lanes of the current and top rows.
1856        smull           v19.4s,  v19.4h,  v27.4h
1857        smlal           v19.4s,  v20.4h,  v28.4h
1858        sqrshrn         v18.4h,  v18.4s,  #5
1859        sqrshrn         v19.4h,  v19.4s,  #5
1860        smin            v18.4h,  v18.4h,  v15.4h
1861        smin            v19.4h,  v19.4h,  v15.4h
1862        smax            v18.4h,  v18.4h,  v14.4h
1863        smax            v19.4h,  v19.4h,  v14.4h
1864        ins             v16.d[0], v18.d[0]
1865        ins             v20.d[0], v19.d[0]
1866.endif
1867
            // Vertical overlap: cur * v26 + top * v25 for all 16 values,
            // followed by a rounding shift right by 5 with saturating
            // narrowing back into v16/v17. v0-v3 serve as 32-bit accumulators
            // and are overwritten by the luma load below.
1868        smull           v0.4s,   v16.4h,  v26.4h
1869        smull2          v1.4s,   v16.8h,  v26.8h
1870        smull           v2.4s,   v17.4h,  v26.4h
1871        smull2          v3.4s,   v17.8h,  v26.8h
1872        smlal           v0.4s,   v20.4h,  v25.4h
1873        smlal2          v1.4s,   v20.8h,  v25.8h
1874        smlal           v2.4s,   v21.4h,  v25.4h
1875        smlal2          v3.4s,   v21.8h,  v25.8h
1876        sqrshrn         v16.4h,  v0.4s,   #5
1877        sqrshrn2        v16.8h,  v1.4s,   #5
1878        sqrshrn         v17.4h,  v2.4s,   #5
1879        sqrshrn2        v17.8h,  v3.4s,   #5
1880.endif
1881
            // From here on the luma and src loads are interleaved with the
            // remaining grain arithmetic (presumably for scheduling).
1882.if \ox && !\oy
            // Horizontal-only overlap: narrow and apply the upper clamp.
1883        sqrshrn         v18.4h,  v18.4s,  #5
1884        smin            v18.4h,  v18.4h,  v15.4h
1885.endif
1886        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
1887.if \oy
            // Clamp the vertically blended grain to [grain_min, grain_max].
1888        smin            v16.8h,  v16.8h,  v15.8h
1889        smin            v17.8h,  v17.8h,  v15.8h
1890        smax            v16.8h,  v16.8h,  v14.8h
1891        smax            v17.8h,  v17.8h,  v14.8h
1892.endif
1893
1894.if \ox && !\oy
            // Horizontal-only overlap: lower clamp.
1895        smax            v18.4h,  v18.4h,  v14.4h
1896.endif
1897        ld1             {v10.8h, v11.8h},  [x1],  x2 // src
1898.if \ox && !\oy
            // Replace the 4 leftmost grain values with the blended ones.
1899        ins             v16.d[0], v18.d[0]
1900.endif
            // Sum horizontally adjacent luma pairs and halve with rounding:
            // 32 luma values become 16 averages in v0/v1.
1901        addp            v0.8h,   v0.8h,   v1.8h
1902        addp            v1.8h,   v2.8h,   v3.8h
1903        urshr           v0.8h,   v0.8h,   #1
1904        urshr           v1.8h,   v1.8h,   #1
1905.if !\csfl
            // luma_avg * uv_luma_mult + src * uv_mult in 32 bits, shifted
            // right by 6 (truncating narrow), plus uv_offset, clamped to
            // [0, bitdepth_max]. The result in v0/v1 feeds the lookup below.
1906        smull           v2.4s,   v0.4h,   v8.4h
1907        smull2          v3.4s,   v0.8h,   v8.8h
1908        smull           v0.4s,   v1.4h,   v8.4h
1909        smull2          v1.4s,   v1.8h,   v8.8h
1910        smlal           v2.4s,   v10.4h,  v9.4h
1911        smlal2          v3.4s,   v10.8h,  v9.8h
1912        smlal           v0.4s,   v11.4h,  v9.4h
1913        smlal2          v1.4s,   v11.8h,  v9.8h
1914        shrn            v2.4h,   v2.4s,   #6
1915        shrn2           v2.8h,   v3.4s,   #6
1916        shrn            v3.4h,   v0.4s,   #6
1917        shrn2           v3.8h,   v1.4s,   #6
1918        add             v0.8h,   v2.8h,   v24.8h
1919        add             v1.8h,   v3.8h,   v24.8h
1920        movi            v2.8h,   #0
1921        smin            v0.8h,   v0.8h,   v23.8h
1922        smin            v1.8h,   v1.8h,   v23.8h
1923        smax            v0.8h,   v0.8h,   v2.8h
1924        smax            v1.8h,   v1.8h,   v2.8h
1925.else
1926        // Make sure that uninitialized pixels out of range past the right
1927        // edge are in range; their actual values shouldn't matter.
1928        and             v0.16b,  v0.16b,  v23.16b
1929        and             v1.16b,  v1.16b,  v23.16b
1930.endif
1931
            // gather16_neon is defined elsewhere; the code below consumes its
            // result as 16 scaling bytes in v6 (presumably looked up with the
            // values in v0/v1; confirm against its definition). The bl
            // overwrites x30, which is why the epilogue reloads x30 from the
            // stack before returning.
1932        bl              gather16_neon
1933
            // Widen the scaling bytes to 16-bit lanes.
1934        uxtl            v4.8h,   v6.8b            // scaling
1935        uxtl2           v5.8h,   v6.16b
1936
1937        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
1938        ushl            v5.8h,   v5.8h,   v29.8h
1939
1940        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
1941        sqrdmulh        v17.8h,  v17.8h,  v5.8h
1942
            // Unsigned saturating accumulate of the signed noise into src.
1943        usqadd          v10.8h,  v16.8h           // *src + noise
1944        usqadd          v11.8h,  v17.8h
1945
            // Clamp the result to [v30, v31].
1946        umax            v0.8h,   v10.8h,  v30.8h
1947        umax            v1.8h,   v11.8h,  v30.8h
1948        umin            v0.8h,   v0.8h,   v31.8h
1949        umin            v1.8h,   v1.8h,   v31.8h
1950
1951.if \oy
            // Swap the vertical weights v25 and v26 for the next row, using
            // v16 as scratch (its grain values were consumed by usqadd above
            // and it is reloaded at the top of the loop).
1952        mov             v16.16b, v25.16b
1953.endif
            // Decrement the row counter; the flags set here are consumed by
            // the b.gt below (the vector moves and st1 do not modify flags).
1954        subs            w9,  w9,  #1
1955.if \oy
1956        mov             v25.16b, v26.16b
1957        mov             v26.16b, v16.16b
1958.endif
1959        st1             {v0.8h, v1.8h},  [x0], x2 // dst
1960        b.gt            1b
1961
1962.if \oy
            // The y-overlap rows are done; if rows remain, continue in the
            // variant with the same csfl/ox but without y overlap.
1963        cmp             w12, #0
1964        mov             w9,  w12               // restore actual remaining h
1965        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
1966.endif
1967
            // Every variant jumps to the shared epilogue at label 9 below.
1968        b               9f
1969.endm
1970        fguv_loop_sx1   0, 0, 0
1971        fguv_loop_sx1   0, 0, 1
1972        fguv_loop_sx1   0, 1, 0
1973        fguv_loop_sx1   0, 1, 1
1974        fguv_loop_sx1   1, 0, 0
1975        fguv_loop_sx1   1, 0, 1
1976        fguv_loop_sx1   1, 1, 0
1977        fguv_loop_sx1   1, 1, 1
1978
            // Shared epilogue: restore d8-d15 and x30 from the stack frame,
            // release its 80 bytes and return.
19799:
1980        ldp             d14, d15, [sp, #64]
1981        ldp             d12, d13, [sp, #48]
1982        ldp             d10, d11, [sp, #32]
1983        ldp             d8,  d9,  [sp, #16]
1984        ldr             x30,      [sp], #80
1985        AARCH64_VALIDATE_LINK_REGISTER
1986        ret
1987
            // Dispatch table read by the fguv setup code. Each entry is the
            // 16-bit distance from the table label back to a loop variant;
            // the entry index is (csfl << 2) | (ox << 1) | oy.
1988L(fguv_loop_sx1_tbl):
1989        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
1990        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
1991        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
1992        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
1993        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
1994        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
1995        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
1996        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
1997endfunc
1998