• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright © 2021, VideoLAN and dav1d authors
3 * Copyright © 2021, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30#include "src/arm/asm-offsets.h"
31
32#define GRAIN_WIDTH 82
33#define GRAIN_HEIGHT 73
34
35#define SUB_GRAIN_WIDTH 44
36#define SUB_GRAIN_HEIGHT 38
37
38.macro increment_seed steps, shift=1
39        lsr             w11, w2,  #3
40        lsr             w12, w2,  #12
41        lsr             w13, w2,  #1
42        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
43        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
44        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
45.if \shift
46        lsr             w2,  w2,  #\steps
47.endif
48        and             w11, w11, #((1 << \steps) - 1)    // bit
49.if \shift
50        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
51.else
52        orr             w2,  w2,  w11, lsl #16            // *state
53.endif
54.endm
55
56.macro read_rand dest, bits, age
57        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
58.endm
59
60.macro read_shift_rand dest, bits
61        ubfx            \dest,  x2,   #17 - \bits, #\bits
62        lsr             w2,  w2,  #1
63.endm
64
65// special calling convention:
66// w2 holds seed
67// x3 holds dav1d_gaussian_sequence
68// clobbers x11-x15
69// returns in v0.8h
70function get_gaussian_neon
71        increment_seed  4
72        read_rand       x14, 11,  3
73        read_rand       x15, 11,  2
74        add             x14, x3,  x14, lsl #1
75        add             x15, x3,  x15, lsl #1
76        ld1             {v0.h}[0], [x14]
77        read_rand       x14, 11,  1
78        ld1             {v0.h}[1], [x15]
79        add             x14, x3,  x14, lsl #1
80        read_rand       x15, 11,  0
81        increment_seed  4
82        add             x15, x3,  x15, lsl #1
83        ld1             {v0.h}[2], [x14]
84        read_rand       x14, 11,  3
85        ld1             {v0.h}[3], [x15]
86        add             x14, x3,  x14, lsl #1
87        read_rand       x15, 11,  2
88        ld1             {v0.h}[4], [x14]
89        add             x15, x3,  x15, lsl #1
90        read_rand       x14, 11,  1
91        ld1             {v0.h}[5], [x15]
92        read_rand       x15, 11,  0
93        add             x14, x3,  x14, lsl #1
94        add             x15, x3,  x15, lsl #1
95        ld1             {v0.h}[6], [x14]
96        ld1             {v0.h}[7], [x15]
97        ret
98endfunc
99
100.macro get_grain_row r0, r1, r2, r3, r4, r5
101        bl              get_gaussian_neon
102        srshl           \r5\().8h,  v0.8h,  v31.8h
103        xtn             \r0\().8b,  \r5\().8h
104        bl              get_gaussian_neon
105        srshl           \r5\().8h,  v0.8h,  v31.8h
106        xtn2            \r0\().16b, \r5\().8h
107        bl              get_gaussian_neon
108        srshl           \r5\().8h,  v0.8h,  v31.8h
109        xtn             \r1\().8b,  \r5\().8h
110        bl              get_gaussian_neon
111        srshl           \r5\().8h,  v0.8h,  v31.8h
112        xtn2            \r1\().16b, \r5\().8h
113        bl              get_gaussian_neon
114        srshl           \r5\().8h,  v0.8h,  v31.8h
115        xtn             \r2\().8b,  \r5\().8h
116        bl              get_gaussian_neon
117        srshl           \r5\().8h,  v0.8h,  v31.8h
118        xtn2            \r2\().16b, \r5\().8h
119        bl              get_gaussian_neon
120        srshl           \r5\().8h,  v0.8h,  v31.8h
121        xtn             \r3\().8b,  \r5\().8h
122        bl              get_gaussian_neon
123        srshl           \r5\().8h,  v0.8h,  v31.8h
124        xtn2            \r3\().16b, \r5\().8h
125        bl              get_gaussian_neon
126        srshl           \r5\().8h,  v0.8h,  v31.8h
127        xtn             \r4\().8b,  \r5\().8h
128        bl              get_gaussian_neon
129        srshl           \r5\().8h,  v0.8h,  v31.8h
130        xtn2            \r4\().16b, \r5\().8h
131        increment_seed  2
132        read_rand       x14, 11,  1
133        read_rand       x15, 11,  0
134        add             x14, x3,  x14, lsl #1
135        add             x15, x3,  x15, lsl #1
136        ld1             {\r5\().h}[0], [x14]
137        ld1             {\r5\().h}[1], [x15]
138        srshl           v0.4h,      \r5\().4h,  v31.4h
139        xtn             \r5\().8b,  v0.8h
140.endm
141
142.macro store_grain_row r0, r1, r2, r3, r4, r5
143        st1             {\r0\().16b,\r1\().16b}, [x0], #32
144        st1             {\r2\().16b,\r3\().16b}, [x0], #32
145        st1             {\r4\().16b},  [x0], #16
146        st1             {\r5\().h}[0], [x0], #2
147.endm
148
149.macro get_grain_row_44 r0, r1, r2
150        bl              get_gaussian_neon
151        srshl           \r2\().8h,  v0.8h,  v31.8h
152        xtn             \r0\().8b,  \r2\().8h
153        bl              get_gaussian_neon
154        srshl           \r2\().8h,  v0.8h,  v31.8h
155        xtn2            \r0\().16b, \r2\().8h
156        bl              get_gaussian_neon
157        srshl           \r2\().8h,  v0.8h,  v31.8h
158        xtn             \r1\().8b,  \r2\().8h
159        bl              get_gaussian_neon
160        srshl           \r2\().8h,  v0.8h,  v31.8h
161        xtn2            \r1\().16b, \r2\().8h
162        bl              get_gaussian_neon
163        srshl           \r2\().8h,  v0.8h,  v31.8h
164        xtn             \r2\().8b,  \r2\().8h
165
166        increment_seed  4
167        read_rand       x14, 11,  3
168        read_rand       x15, 11,  2
169        add             x14, x3,  x14, lsl #1
170        add             x15, x3,  x15, lsl #1
171        ld1             {v0.h}[0], [x14]
172        read_rand       x14, 11,  1
173        ld1             {v0.h}[1], [x15]
174        read_rand       x15, 11,  0
175        add             x14, x3,  x14, lsl #1
176        add             x15, x3,  x15, lsl #1
177        ld1             {v0.h}[2], [x14]
178        ld1             {v0.h}[3], [x15]
179        srshl           v0.4h,      v0.4h,  v31.4h
180        xtn2            \r2\().16b, v0.8h
181.endm
182
183.macro store_grain_row_44 r0, r1, r2
184        st1             {\r0\().16b,\r1\().16b}, [x0], #32
185        st1             {\r2\().16b},  [x0]
186        add             x0,  x0,  #GRAIN_WIDTH-32
187.endm
188
189function get_grain_2_neon
190        increment_seed  2
191        read_rand       x14, 11,  1
192        read_rand       x15, 11,  0
193        add             x14, x3,  x14, lsl #1
194        add             x15, x3,  x15, lsl #1
195        ld1             {v0.h}[0], [x14]
196        ld1             {v0.h}[1], [x15]
197        srshl           v0.4h,   v0.4h,   v31.4h
198        xtn             v0.8b,   v0.8h
199        ret
200endfunc
201
202.macro get_grain_2 dst
203        bl              get_grain_2_neon
204.ifnc \dst, v0
205        mov             \dst\().8b, v0.8b
206.endif
207.endm
208
209// w15 holds the number of entries to produce
210// w14, w16 and w17 hold the previous output entries
211// v0 holds the vector of produced entries
212// v1 holds the input vector of sums from above
213.macro output_lag n
214function output_lag\n\()_neon
2151:
216        read_shift_rand x13, 11
217        mov             w11, v1.s[0]
218        ldrsh           w12, [x3, x13, lsl #1]
219        ext             v0.16b,  v0.16b,  v0.16b,  #1
220.if \n == 1
221        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
222.elseif \n == 2
223        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
224        madd            w11, w14, w17, w11        // += *coeff * prev output 2
225        mov             w16, w14
226.else
227        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
228        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
229        madd            w11, w14, w21, w11        // += *coeff * prev output 3
230        mov             w17, w16
231        mov             w16, w14
232.endif
233        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
234        add             w12, w12, w10             // 1 << (4 + grain_scale_shift - 1)
235        asr             w14, w14, w7              // >> ar_coeff_shift
236        asr             w12, w12, w9              // >> (4 + grain_scale_shift)
237        add             w14, w14, w12
238        cmp             w14, w5
239        csel            w14, w14, w5,  le
240        cmp             w14, w6
241        csel            w14, w14, w6,  ge
242        subs            w15, w15, #1
243        ext             v1.16b,  v1.16b,  v1.16b,  #4
244        ins             v0.b[15], w14
245        b.gt            1b
246        ret
247endfunc
248.endm
249
250output_lag 1
251output_lag 2
252output_lag 3
253
254
255function sum_lag1_above_neon
256        smull           v2.8h,   v3.8b,   v28.8b
257        smull2          v3.8h,   v3.16b,  v28.16b
258        smull           v4.8h,   v0.8b,   v27.8b
259        smull2          v5.8h,   v0.16b,  v27.16b
260        smull           v6.8h,   v1.8b,   v29.8b
261        smull2          v7.8h,   v1.16b,  v29.16b
262        saddl           v0.4s,   v2.4h,   v4.4h
263        saddl2          v1.4s,   v2.8h,   v4.8h
264        saddl           v2.4s,   v3.4h,   v5.4h
265        saddl2          v3.4s,   v3.8h,   v5.8h
266        saddw           v4.4s,   v0.4s,   v6.4h
267        saddw2          v5.4s,   v1.4s,   v6.8h
268        saddw           v6.4s,   v2.4s,   v7.4h
269        saddw2          v7.4s,   v3.4s,   v7.8h
270        ret
271endfunc
272
273.macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
274        bl              sum_\lag\()_above_neon
275.ifc \type, uv_420
276        add             x12, x19, #GRAIN_WIDTH
277        ld1             {v22.16b, v23.16b}, [x19], #32
278        ld1             {v24.16b, v25.16b}, [x12]
279        saddlp          v22.8h,  v22.16b
280        saddlp          v23.8h,  v23.16b
281        saddlp          v24.8h,  v24.16b
282        saddlp          v25.8h,  v25.16b
283        add             v22.8h,  v22.8h,  v24.8h
284        add             v23.8h,  v23.8h,  v25.8h
285        rshrn           v0.8b,   v22.8h,  #2
286        rshrn2          v0.16b,  v23.8h,  #2
287.endif
288.ifc \type, uv_422
289        ld1             {v22.16b, v23.16b}, [x19], #32
290        saddlp          v22.8h,  v22.16b
291        saddlp          v23.8h,  v23.16b
292        rshrn           v0.8b,   v22.8h,  #1
293        rshrn2          v0.16b,  v23.8h,  #1
294.endif
295.ifc \type, uv_444
296        ld1             {v0.16b}, [x19], #16
297.endif
298.if \uv_layout
299.ifnb \uv_coeff
300        dup             v1.16b,  \uv_coeff
301        smull           v2.8h,   v0.8b,   v1.8b
302        smull2          v3.8h,   v0.16b,  v1.16b
303.else
304        smull           v2.8h,   v0.8b,   v30.8b
305        smull2          v3.8h,   v0.16b,  v30.16b
306.endif
307        saddw           v4.4s,   v4.4s,   v2.4h
308        saddw2          v5.4s,   v5.4s,   v2.8h
309        saddw           v6.4s,   v6.4s,   v3.4h
310        saddw2          v7.4s,   v7.4s,   v3.8h
311.endif
312.if \uv_layout && \elems == 16
313        b               sum_\lag\()_y_\edge\()_start
314.elseif \uv_layout == 444 && \elems == 15
315        b               sum_\lag\()_y_\edge\()_start
316.elseif \uv_layout == 422 && \elems == 9
317        b               sum_\lag\()_uv_420_\edge\()_start
318.else
319sum_\lag\()_\type\()_\edge\()_start:
320.ifc \edge, left
321        increment_seed  4
322        read_rand       x12, 11,  3
323        read_rand       x13, 11,  2
324        read_rand       x14, 11,  1
325        add             x12, x3,  x12, lsl #1
326        add             x13, x3,  x13, lsl #1
327        add             x14, x3,  x14, lsl #1
328        ld1             {v0.h}[5], [x12]
329        ld1             {v0.h}[6], [x13]
330        ld1             {v0.h}[7], [x14]
331        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
332        srshl           v0.8h,   v0.8h,   v31.8h
333        xtn2            v0.16b,  v0.8h
334        ext             v4.16b,  v4.16b,  v4.16b,  #12
335.ifc \lag, lag3
336        smov            w17, v0.b[13]
337.endif
338.ifnc \lag, lag1
339        smov            w16, v0.b[14]
340.endif
341        smov            w14, v0.b[15]
342
343        mov             v1.16b,  v4.16b
344        mov             w15, #1
345        bl              output_\lag\()_neon
346.else
347        increment_seed  4, shift=0
348        mov             v1.16b,  v4.16b
349        mov             w15, #4
350        bl              output_\lag\()_neon
351.endif
352
353        increment_seed  4, shift=0
354        mov             v1.16b,  v5.16b
355        mov             w15, #4
356        bl              output_\lag\()_neon
357
358        increment_seed  4, shift=0
359        mov             v1.16b,  v6.16b
360.if \elems == 9
361        mov             w15, #1
362        bl              output_\lag\()_neon
363        lsr             w2,  w2,  #3
364
365        read_rand       x12, 11,  2
366        read_rand       x13, 11,  1
367        read_rand       x14, 11,  0
368        add             x12, x3,  x12, lsl #1
369        add             x13, x3,  x13, lsl #1
370        add             x14, x3,  x14, lsl #1
371        ld1             {v1.h}[0], [x12]
372        ld1             {v1.h}[1], [x13]
373        ld1             {v1.h}[2], [x14]
374        srshl           v1.4h,   v1.4h,   v31.4h
375        xtn             v1.8b,   v1.8h
376        ext             v0.16b,  v0.16b,  v1.16b,  #7
377.else
378        mov             w15, #4
379        bl              output_\lag\()_neon
380
381        increment_seed  4, shift=0
382        mov             v1.16b,  v7.16b
383
384.ifc \edge, right
385        mov             w15, #3
386        bl              output_\lag\()_neon
387        read_shift_rand x15, 11
388        add             x15, x3,  x15, lsl #1
389        ld1             {v1.h}[0], [x15]
390        srshl           v1.4h,   v1.4h,   v31.4h
391        ext             v0.16b,  v0.16b,  v1.16b,  #1
392.else
393        mov             w15, #4
394        bl              output_\lag\()_neon
395.endif
396.endif
397.if \store
398        st1             {v0.16b}, [x0], #16
399.endif
400        ldr             x30, [sp], #16
401        AARCH64_VALIDATE_LINK_REGISTER
402        ret
403.endif
404.endm
405
406.macro sum_lag1_func type, uv_layout, edge, elems=16
407function sum_\type\()_lag1_\edge\()_neon
408        AARCH64_SIGN_LINK_REGISTER
409        str             x30, [sp, #-16]!
410        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
411endfunc
412.endm
413
414sum_lag1_func y,      0,   left
415sum_lag1_func y,      0,   mid
416sum_lag1_func y,      0,   right, 15
417sum_lag1_func uv_444, 444, left
418sum_lag1_func uv_444, 444, mid
419sum_lag1_func uv_444, 444, right, 15
420sum_lag1_func uv_422, 422, left
421sum_lag1_func uv_422, 422, mid
422sum_lag1_func uv_422, 422, right, 9
423sum_lag1_func uv_420, 420, left
424sum_lag1_func uv_420, 420, mid
425sum_lag1_func uv_420, 420, right, 9
426
427.macro sum_lag1 type, dst, left, mid, right, edge=mid
428        mov             v3.16b,  \mid\().16b
429        ext             v0.16b,  \left\().16b, \mid\().16b,   #15
430        ext             v1.16b,  \mid\().16b,  \right\().16b, #1
431        bl              sum_\type\()_lag1_\edge\()_neon
432        mov             \dst\().16b, v0.16b
433.endm
434
435.macro sum_y_lag1 dst, left, mid, right, edge=mid
436        sum_lag1        y, \dst, \left, \mid, \right, \edge
437.endm
438
439.macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
440        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
441.endm
442
443.macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
444        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
445.endm
446
447.macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
448        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
449.endm
450
451
452function sum_lag2_above_neon
453        sub             x12, x0,  #2*GRAIN_WIDTH - 16
454        sub             x13, x0,  #1*GRAIN_WIDTH - 16
455        ld1             {v18.16b}, [x12] // load top right
456        ld1             {v21.16b}, [x13]
457
458        ext             v22.16b, v16.16b, v17.16b, #14 // top left, top mid
459        dup             v26.16b, v30.b[0]
460        ext             v23.16b, v16.16b, v17.16b, #15
461        dup             v27.16b, v30.b[1]
462        ext             v0.16b,  v17.16b, v18.16b, #1  // top mid, top right
463        dup             v28.16b, v30.b[3]
464        ext             v1.16b,  v17.16b, v18.16b, #2
465        dup             v29.16b, v30.b[4]
466
467        smull           v2.8h,   v22.8b,  v26.8b
468        smull2          v3.8h,   v22.16b, v26.16b
469        smull           v4.8h,   v23.8b,  v27.8b
470        smull2          v5.8h,   v23.16b, v27.16b
471        smull           v6.8h,   v0.8b,   v28.8b
472        smull2          v7.8h,   v0.16b,  v28.16b
473        smull           v0.8h,   v1.8b,   v29.8b
474        smull2          v1.8h,   v1.16b,  v29.16b
475        saddl           v22.4s,  v2.4h,   v4.4h
476        saddl2          v23.4s,  v2.8h,   v4.8h
477        saddl           v26.4s,  v3.4h,   v5.4h
478        saddl2          v27.4s,  v3.8h,   v5.8h
479        saddl           v2.4s,   v0.4h,   v6.4h
480        saddl2          v3.4s,   v0.8h,   v6.8h
481        saddl           v6.4s,   v1.4h,   v7.4h
482        saddl2          v7.4s,   v1.8h,   v7.8h
483        add             v4.4s,   v22.4s,  v2.4s
484        add             v5.4s,   v23.4s,  v3.4s
485        add             v6.4s,   v26.4s,  v6.4s
486        add             v7.4s,   v27.4s,  v7.4s
487
488        ext             v22.16b, v19.16b, v20.16b, #14 // top left, top mid
489        dup             v26.16b, v30.b[5]
490        ext             v23.16b, v19.16b, v20.16b, #15
491        dup             v27.16b, v30.b[6]
492        ext             v0.16b,  v20.16b, v21.16b, #1  // top mid, top right
493        dup             v28.16b, v30.b[8]
494        ext             v1.16b,  v20.16b, v21.16b, #2
495        dup             v29.16b, v30.b[9]
496
497        smull           v2.8h,   v22.8b,  v26.8b
498        smull2          v3.8h,   v22.16b, v26.16b
499        smull           v22.8h,  v23.8b,  v27.8b
500        smull2          v23.8h,  v23.16b, v27.16b
501        smull           v26.8h,  v0.8b,   v28.8b
502        smull2          v27.8h,  v0.16b,  v28.16b
503        smull           v28.8h,  v1.8b,   v29.8b
504        smull2          v29.8h,  v1.16b,  v29.16b
505        saddl           v0.4s,   v2.4h,   v22.4h
506        saddl2          v1.4s,   v2.8h,   v22.8h
507        saddl           v2.4s,   v3.4h,   v23.4h
508        saddl2          v3.4s,   v3.8h,   v23.8h
509        saddl           v22.4s,  v26.4h,  v28.4h
510        saddl2          v23.4s,  v26.8h,  v28.8h
511        saddl           v26.4s,  v27.4h,  v29.4h
512        saddl2          v27.4s,  v27.8h,  v29.8h
513        add             v0.4s,   v0.4s,   v22.4s
514        add             v1.4s,   v1.4s,   v23.4s
515        add             v2.4s,   v2.4s,   v26.4s
516        add             v3.4s,   v3.4s,   v27.4s
517        dup             v26.16b, v30.b[2]
518        dup             v27.16b, v30.b[7]
519        smull           v22.8h,  v17.8b,  v26.8b
520        smull2          v23.8h,  v17.16b, v26.16b
521        smull           v24.8h,  v20.8b,  v27.8b
522        smull2          v25.8h,  v20.16b, v27.16b
523        add             v4.4s,   v4.4s,   v0.4s
524        add             v5.4s,   v5.4s,   v1.4s
525        add             v6.4s,   v6.4s,   v2.4s
526        add             v7.4s,   v7.4s,   v3.4s
527
528        mov             v16.16b, v17.16b
529        mov             v17.16b, v18.16b
530
531        saddl           v0.4s,   v22.4h,  v24.4h
532        saddl2          v1.4s,   v22.8h,  v24.8h
533        saddl           v2.4s,   v23.4h,  v25.4h
534        saddl2          v3.4s,   v23.8h,  v25.8h
535        mov             v19.16b, v20.16b
536        mov             v20.16b, v21.16b
537        add             v4.4s,   v4.4s,   v0.4s
538        add             v5.4s,   v5.4s,   v1.4s
539        add             v6.4s,   v6.4s,   v2.4s
540        add             v7.4s,   v7.4s,   v3.4s
541        ret
542endfunc
543
544.macro sum_lag2_func type, uv_layout, edge, elems=16
545function sum_\type\()_lag2_\edge\()_neon
546        AARCH64_SIGN_LINK_REGISTER
547        str             x30, [sp, #-16]!
548.ifc \edge, left
549        sub             x12, x0,  #2*GRAIN_WIDTH
550        sub             x13, x0,  #1*GRAIN_WIDTH
551        ld1             {v17.16b}, [x12] // load the previous block right above
552        ld1             {v20.16b}, [x13]
553.endif
554        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
555endfunc
556.endm
557
558sum_lag2_func y,      0,   left
559sum_lag2_func y,      0,   mid
560sum_lag2_func y,      0,   right, 15
561sum_lag2_func uv_444, 444, left
562sum_lag2_func uv_444, 444, mid
563sum_lag2_func uv_444, 444, right, 15
564sum_lag2_func uv_422, 422, left
565sum_lag2_func uv_422, 422, mid
566sum_lag2_func uv_422, 422, right, 9
567sum_lag2_func uv_420, 420, left
568sum_lag2_func uv_420, 420, mid
569sum_lag2_func uv_420, 420, right, 9
570
571
572function sum_lag3_above_neon
573        sub             x11, x0,  #3*GRAIN_WIDTH - 16
574        sub             x12, x0,  #2*GRAIN_WIDTH - 16
575        sub             x13, x0,  #1*GRAIN_WIDTH - 16
576        ld1             {v15.16b}, [x11] // load top right
577        ld1             {v18.16b}, [x12]
578        ld1             {v21.16b}, [x13]
579
580        ext             v8.16b,  v13.16b, v14.16b, #13 // top left, top mid
581        dup             v22.16b, v29.b[0]
582        ext             v9.16b,  v13.16b, v14.16b, #14
583        dup             v23.16b, v29.b[1]
584        ext             v10.16b, v13.16b, v14.16b, #15
585        dup             v24.16b, v29.b[2]
586        dup             v25.16b, v29.b[3]
587        ext             v11.16b, v14.16b, v15.16b, #1  // top mid, top right
588        dup             v26.16b, v29.b[4]
589        ext             v12.16b, v14.16b, v15.16b, #2
590        dup             v27.16b, v29.b[5]
591        ext             v13.16b, v14.16b, v15.16b, #3
592        dup             v28.16b, v29.b[6]
593
594        smull           v0.8h,   v8.8b,   v22.8b
595        smull2          v1.8h,   v8.16b,  v22.16b
596        smull           v2.8h,   v9.8b,   v23.8b
597        smull2          v3.8h,   v9.16b,  v23.16b
598        smull           v8.8h,   v10.8b,  v24.8b
599        smull2          v9.8h,   v10.16b, v24.16b
600        smull           v10.8h,  v11.8b,  v26.8b
601        smull2          v11.8h,  v11.16b, v26.16b
602        saddl           v22.4s,  v0.4h,   v2.4h
603        saddl2          v23.4s,  v0.8h,   v2.8h
604        saddl           v24.4s,  v1.4h,   v3.4h
605        saddl2          v26.4s,  v1.8h,   v3.8h
606        saddl           v0.4s,   v8.4h,   v10.4h
607        saddl2          v1.4s,   v8.8h,   v10.8h
608        saddl           v2.4s,   v9.4h,   v11.4h
609        saddl2          v3.4s,   v9.8h,   v11.8h
610        smull           v8.8h,   v12.8b,  v27.8b
611        smull2          v9.8h,   v12.16b, v27.16b
612        smull           v10.8h,  v13.8b,  v28.8b
613        smull2          v11.8h,  v13.16b, v28.16b
614        smull           v12.8h,  v14.8b,  v25.8b
615        smull2          v13.8h,  v14.16b, v25.16b
616        add             v4.4s,   v22.4s,  v0.4s
617        add             v5.4s,   v23.4s,  v1.4s
618        add             v6.4s,   v24.4s,  v2.4s
619        add             v7.4s,   v26.4s,  v3.4s
620        saddl           v0.4s,   v8.4h,   v10.4h
621        saddl2          v1.4s,   v8.8h,   v10.8h
622        saddl           v2.4s,   v9.4h,   v11.4h
623        saddl2          v3.4s,   v9.8h,   v11.8h
624        add             v4.4s,   v4.4s,   v0.4s
625        add             v5.4s,   v5.4s,   v1.4s
626        add             v6.4s,   v6.4s,   v2.4s
627        add             v7.4s,   v7.4s,   v3.4s
628        saddw           v4.4s,   v4.4s,   v12.4h
629        saddw2          v5.4s,   v5.4s,   v12.8h
630        saddw           v6.4s,   v6.4s,   v13.4h
631        saddw2          v7.4s,   v7.4s,   v13.8h
632
633        ext             v8.16b,  v16.16b, v17.16b, #13 // top left, top mid
634        dup             v22.16b, v29.b[7]
635        ext             v9.16b,  v16.16b, v17.16b, #14
636        dup             v23.16b, v29.b[8]
637        ext             v10.16b, v16.16b, v17.16b, #15
638        dup             v24.16b, v29.b[9]
639        dup             v25.16b, v29.b[10]
640        ext             v11.16b, v17.16b, v18.16b, #1  // top mid, top right
641        dup             v26.16b, v29.b[11]
642        ext             v12.16b, v17.16b, v18.16b, #2
643        dup             v27.16b, v29.b[12]
644        ext             v13.16b, v17.16b, v18.16b, #3
645        dup             v28.16b, v29.b[13]
646
647        smull           v0.8h,   v8.8b,   v22.8b
648        smull2          v1.8h,   v8.16b,  v22.16b
649        smull           v2.8h,   v9.8b,   v23.8b
650        smull2          v3.8h,   v9.16b,  v23.16b
651        smull           v8.8h,   v10.8b,  v24.8b
652        smull2          v9.8h,   v10.16b, v24.16b
653        smull           v10.8h,  v11.8b,  v26.8b
654        smull2          v11.8h,  v11.16b, v26.16b
655        saddl           v22.4s,  v0.4h,   v2.4h
656        saddl2          v23.4s,  v0.8h,   v2.8h
657        saddl           v24.4s,  v1.4h,   v3.4h
658        saddl2          v26.4s,  v1.8h,   v3.8h
659        saddl           v0.4s,   v8.4h,   v10.4h
660        saddl2          v1.4s,   v8.8h,   v10.8h
661        saddl           v2.4s,   v9.4h,   v11.4h
662        saddl2          v3.4s,   v9.8h,   v11.8h
663        smull           v8.8h,   v12.8b,  v27.8b
664        smull2          v9.8h,   v12.16b, v27.16b
665        smull           v10.8h,  v13.8b,  v28.8b
666        smull2          v11.8h,  v13.16b, v28.16b
667        smull           v12.8h,  v17.8b,  v25.8b
668        smull2          v13.8h,  v17.16b, v25.16b
669        add             v22.4s,  v22.4s,  v0.4s
670        add             v23.4s,  v23.4s,  v1.4s
671        add             v24.4s,  v24.4s,  v2.4s
672        add             v26.4s,  v26.4s,  v3.4s
673        saddl           v0.4s,   v8.4h,   v10.4h
674        saddl2          v1.4s,   v8.8h,   v10.8h
675        saddl           v2.4s,   v9.4h,   v11.4h
676        saddl2          v3.4s,   v9.8h,   v11.8h
677        add             v4.4s,   v4.4s,   v22.4s
678        add             v5.4s,   v5.4s,   v23.4s
679        add             v6.4s,   v6.4s,   v24.4s
680        add             v7.4s,   v7.4s,   v26.4s
681        add             v4.4s,   v4.4s,   v0.4s
682        add             v5.4s,   v5.4s,   v1.4s
683        add             v6.4s,   v6.4s,   v2.4s
684        add             v7.4s,   v7.4s,   v3.4s
685        saddw           v4.4s,   v4.4s,   v12.4h
686        saddw2          v5.4s,   v5.4s,   v12.8h
687        saddw           v6.4s,   v6.4s,   v13.4h
688        saddw2          v7.4s,   v7.4s,   v13.8h
689
690        ext             v8.16b,  v19.16b, v20.16b, #13 // top left, top mid
691        dup             v22.16b, v29.b[14]
692        ext             v9.16b,  v19.16b, v20.16b, #14
693        dup             v23.16b, v29.b[15]
694        ext             v10.16b, v19.16b, v20.16b, #15
695        dup             v24.16b, v30.b[0]
696        dup             v25.16b, v30.b[1]
697        ext             v11.16b, v20.16b, v21.16b, #1  // top mid, top right
698        dup             v26.16b, v30.b[2]
699        ext             v12.16b, v20.16b, v21.16b, #2
700        dup             v27.16b, v30.b[3]
701        ext             v13.16b, v20.16b, v21.16b, #3
702        dup             v28.16b, v30.b[4]
703
704        smull           v0.8h,   v8.8b,   v22.8b
705        smull2          v1.8h,   v8.16b,  v22.16b
706        smull           v2.8h,   v9.8b,   v23.8b
707        smull2          v3.8h,   v9.16b,  v23.16b
708        smull           v8.8h,   v10.8b,  v24.8b
709        smull2          v9.8h,   v10.16b, v24.16b
710        smull           v10.8h,  v11.8b,  v26.8b
711        smull2          v11.8h,  v11.16b, v26.16b
712        saddl           v22.4s,  v0.4h,   v2.4h
713        saddl2          v23.4s,  v0.8h,   v2.8h
714        saddl           v24.4s,  v1.4h,   v3.4h
715        saddl2          v26.4s,  v1.8h,   v3.8h
716        saddl           v0.4s,   v8.4h,   v10.4h
717        saddl2          v1.4s,   v8.8h,   v10.8h
718        saddl           v2.4s,   v9.4h,   v11.4h
719        saddl2          v3.4s,   v9.8h,   v11.8h
720        smull           v8.8h,   v12.8b,  v27.8b
721        smull2          v9.8h,   v12.16b, v27.16b
722        smull           v10.8h,  v13.8b,  v28.8b
723        smull2          v11.8h,  v13.16b, v28.16b
724        smull           v12.8h,  v20.8b,  v25.8b
725        smull2          v19.8h,  v20.16b, v25.16b
726        add             v22.4s,  v22.4s,  v0.4s
727        add             v23.4s,  v23.4s,  v1.4s
728        add             v24.4s,  v24.4s,  v2.4s
729        add             v26.4s,  v26.4s,  v3.4s
730        saddl           v0.4s,   v8.4h,   v10.4h
731        saddl2          v1.4s,   v8.8h,   v10.8h
732        saddl           v2.4s,   v9.4h,   v11.4h
733        saddl2          v3.4s,   v9.8h,   v11.8h
734        add             v4.4s,   v4.4s,   v22.4s
735        add             v5.4s,   v5.4s,   v23.4s
736        add             v6.4s,   v6.4s,   v24.4s
737        add             v7.4s,   v7.4s,   v26.4s
738        mov             v13.16b, v14.16b
739        mov             v14.16b, v15.16b
740        add             v4.4s,   v4.4s,   v0.4s
741        add             v5.4s,   v5.4s,   v1.4s
742        add             v6.4s,   v6.4s,   v2.4s
743        add             v7.4s,   v7.4s,   v3.4s
744        mov             v16.16b, v17.16b
745        mov             v17.16b, v18.16b
746        saddw           v4.4s,   v4.4s,   v12.4h
747        saddw2          v5.4s,   v5.4s,   v12.8h
748        saddw           v6.4s,   v6.4s,   v19.4h
749        saddw2          v7.4s,   v7.4s,   v19.8h
750
751        mov             v19.16b, v20.16b
752        mov             v20.16b, v21.16b
753        ret
754endfunc
755
756.macro sum_lag3_func type, uv_layout, edge, elems=16
757function sum_\type\()_lag3_\edge\()_neon
758        AARCH64_SIGN_LINK_REGISTER
759        str             x30, [sp, #-16]!
760.ifc \edge, left
761        sub             x11, x0,  #3*GRAIN_WIDTH
762        sub             x12, x0,  #2*GRAIN_WIDTH
763        sub             x13, x0,  #1*GRAIN_WIDTH
764        ld1             {v14.16b}, [x11] // load the previous block right above
765        ld1             {v17.16b}, [x12]
766        ld1             {v20.16b}, [x13]
767.endif
768        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
769endfunc
770.endm
771
772sum_lag3_func y,      0,   left
773sum_lag3_func y,      0,   mid
774sum_lag3_func y,      0,   right, 15
775sum_lag3_func uv_444, 444, left
776sum_lag3_func uv_444, 444, mid
777sum_lag3_func uv_444, 444, right, 15
778sum_lag3_func uv_422, 422, left
779sum_lag3_func uv_422, 422, mid
780sum_lag3_func uv_422, 422, right, 9
781sum_lag3_func uv_420, 420, left
782sum_lag3_func uv_420, 420, mid
783sum_lag3_func uv_420, 420, right, 9
784
785function generate_grain_rows_neon
786        AARCH64_SIGN_LINK_REGISTER
787        str             x30, [sp, #-16]!
7881:
789        get_grain_row   v16, v17, v18, v19, v20, v21
790        subs            w1,  w1,  #1
791        store_grain_row v16, v17, v18, v19, v20, v21
792        b.gt            1b
793        ldr             x30, [sp], #16
794        AARCH64_VALIDATE_LINK_REGISTER
795        ret
796endfunc
797
798function generate_grain_rows_44_neon
799        AARCH64_SIGN_LINK_REGISTER
800        str             x30, [sp, #-16]!
8011:
802        get_grain_row_44 v16, v17, v18
803        subs            w1,  w1,  #1
804        store_grain_row_44 v16, v17, v18
805        b.gt            1b
806        ldr             x30, [sp], #16
807        AARCH64_VALIDATE_LINK_REGISTER
808        ret
809endfunc
810
811function get_grain_row_neon
812        AARCH64_SIGN_LINK_REGISTER
813        str             x30, [sp, #-16]!
814        get_grain_row   v16, v17, v18, v19, v20, v21
815        ldr             x30, [sp], #16
816        AARCH64_VALIDATE_LINK_REGISTER
817        ret
818endfunc
819
820function get_grain_row_44_neon
821        AARCH64_SIGN_LINK_REGISTER
822        str             x30, [sp, #-16]!
823        get_grain_row_44 v16, v17, v18
824        ldr             x30, [sp], #16
825        AARCH64_VALIDATE_LINK_REGISTER
826        ret
827endfunc
828
829function add_uv_444_coeff_lag0_neon
830add_coeff_lag0_start:
831        smull           v2.8h,   v0.8b,   v27.8b
832        smull2          v3.8h,   v0.16b,  v27.16b
833        srshl           v2.8h,   v2.8h,   v28.8h
834        srshl           v3.8h,   v3.8h,   v28.8h
835        saddw           v2.8h,   v2.8h,   v1.8b
836        saddw2          v3.8h,   v3.8h,   v1.16b
837        sqxtn           v2.8b,   v2.8h
838        sqxtn2          v2.16b,  v3.8h
839        ret
840endfunc
841
842function add_uv_420_coeff_lag0_neon
843        ld1             {v4.16b, v5.16b}, [x19], #32
844        ld1             {v6.16b, v7.16b}, [x12], #32
845        saddlp          v4.8h,   v4.16b
846        saddlp          v5.8h,   v5.16b
847        saddlp          v6.8h,   v6.16b
848        saddlp          v7.8h,   v7.16b
849        add             v4.8h,   v4.8h,   v6.8h
850        add             v5.8h,   v5.8h,   v7.8h
851        rshrn           v4.8b,   v4.8h,   #2
852        rshrn2          v4.16b,  v5.8h,   #2
853        and             v0.16b,  v4.16b,  v0.16b
854        b               add_coeff_lag0_start
855endfunc
856
857function add_uv_422_coeff_lag0_neon
858        ld1             {v4.16b, v5.16b}, [x19], #32
859        saddlp          v4.8h,   v4.16b
860        saddlp          v5.8h,   v5.16b
861        rshrn           v4.8b,   v4.8h,   #1
862        rshrn2          v4.16b,  v5.8h,   #1
863        and             v0.16b,  v4.16b,  v0.16b
864        b               add_coeff_lag0_start
865endfunc
866
867.macro gen_grain_82 type
868function generate_grain_\type\()_8bpc_neon, export=1
869        AARCH64_SIGN_LINK_REGISTER
870        stp             x30, x19, [sp, #-96]!
871
872.ifc \type, uv_444
873        mov             w13, w3
874        mov             w14, #28
875        add             x19, x1,  #3*GRAIN_WIDTH
876        mov             x1,  x2
877        mul             w13, w13, w14
878.endif
879        movrel          x3,  X(gaussian_sequence)
880        ldr             w2,  [x1, #FGD_SEED]
881        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
882.ifc \type, y
883        add             x4,  x1,  #FGD_AR_COEFFS_Y
884.else
885        add             x4,  x1,  #FGD_AR_COEFFS_UV
886.endif
887        adr             x16, L(gen_grain_\type\()_tbl)
888        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
889        add             w9,  w9,  #4
890        ldrh            w17, [x16, w17, uxtw #1]
891        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
892        sub             x16, x16, w17, uxtw
893        neg             v31.8h,  v31.8h
894
895.ifc \type, uv_444
896        cmp             w13, #0
897        mov             w11, #0x49d8
898        mov             w14, #0xb524
899        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
900        csel            w11, w11, w14, ne
901.endif
902
903        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
904        mov             w8,  #1
905        mov             w10, #1
906        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
907        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
908        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
909        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
910        mov             w5,  #127
911        mov             w6,  #-128
912
913.ifc \type, uv_444
914        eor             w2,  w2,  w11
915.endif
916
917        br              x16
918
919L(generate_grain_\type\()_lag0):
920        AARCH64_VALID_JUMP_TARGET
921.ifc \type, y
922        mov             w1,  #GRAIN_HEIGHT
923        bl              generate_grain_rows_neon
924.else
925        dup             v28.8h,  w7
926        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
927        movi            v0.16b,  #0
928        movi            v1.16b,  #255
929        ext             v29.16b, v0.16b,  v1.16b,  #13
930        ext             v30.16b, v1.16b,  v0.16b,  #1
931        neg             v28.8h,  v28.8h
932
933        mov             w1,  #3
934        bl              generate_grain_rows_neon
935        mov             w1,  #GRAIN_HEIGHT-3
9361:
937        ld1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64
938        bl              get_grain_row_neon
939        and             v0.16b,  v22.16b, v29.16b
940        mov             v1.16b,  v16.16b
941        bl              add_uv_444_coeff_lag0_neon
942        mov             v0.16b,  v23.16b
943        mov             v1.16b,  v17.16b
944        mov             v16.16b, v2.16b
945        bl              add_uv_444_coeff_lag0_neon
946        ld1             {v26.16b}, [x19], #16
947        mov             v0.16b,  v24.16b
948        mov             v1.16b,  v18.16b
949        mov             v17.16b, v2.16b
950        bl              add_uv_444_coeff_lag0_neon
951        add             x19, x19, #2
952        mov             v0.16b,  v25.16b
953        mov             v1.16b,  v19.16b
954        mov             v18.16b, v2.16b
955        bl              add_uv_444_coeff_lag0_neon
956        and             v0.16b,  v26.16b, v30.16b
957        mov             v1.16b,  v20.16b
958        mov             v19.16b, v2.16b
959        bl              add_uv_444_coeff_lag0_neon
960        mov             v20.16b, v2.16b
961        subs            w1,  w1,  #1
962        store_grain_row v16, v17, v18, v19, v20, v21
963        b.gt            1b
964.endif
965        ldp             x30, x19, [sp], #96
966        AARCH64_VALIDATE_LINK_REGISTER
967        ret
968
969L(generate_grain_\type\()_lag1):
970        AARCH64_VALID_JUMP_TARGET
971        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_y[0]
972        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_y[1]
973        ld1r            {v29.16b}, [x4]     // ar_coeffs_y[2]
974.ifc \type, y
975        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
976.else
977        add             x4,  x4,  #2
978.endif
979
980        mov             w1,  #3
981.ifc \type, uv_444
982        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
983        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
984.endif
985        bl              generate_grain_rows_neon
986
987        mov             w1,  #GRAIN_HEIGHT - 3
9881:
989        sum_\type\()_lag1 v22, v16, v16, v17, left
990        sum_\type\()_lag1 v23, v16, v17, v18
991        sum_\type\()_lag1 v24, v17, v18, v19
992        sum_\type\()_lag1 v25, v18, v19, v20
993        sum_\type\()_lag1 v20, v19, v20, v21, right
994        get_grain_2     v21
995        subs            w1,  w1,  #1
996.ifc \type, uv_444
997        add             x19, x19, #2
998.endif
999        store_grain_row v22, v23, v24, v25, v20, v21
1000        mov             v16.16b, v22.16b
1001        mov             v17.16b, v23.16b
1002        mov             v18.16b, v24.16b
1003        mov             v19.16b, v25.16b
1004        b.gt            1b
1005
1006        ldp             x30, x19, [sp], #96
1007        AARCH64_VALIDATE_LINK_REGISTER
1008        ret
1009
1010L(generate_grain_\type\()_lag2):
1011        AARCH64_VALID_JUMP_TARGET
1012        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
1013
1014        smov            w4,  v30.b[10]
1015        smov            w17, v30.b[11]
1016
1017        mov             w1,  #3
1018        bl              generate_grain_rows_neon
1019
1020        mov             w1,  #GRAIN_HEIGHT - 3
10211:
1022        bl              sum_\type\()_lag2_left_neon
1023        bl              sum_\type\()_lag2_mid_neon
1024        bl              sum_\type\()_lag2_mid_neon
1025        bl              sum_\type\()_lag2_mid_neon
1026        bl              sum_\type\()_lag2_right_neon
1027        get_grain_2     v16
1028        subs            w1,  w1,  #1
1029.ifc \type, uv_444
1030        add             x19, x19, #2
1031.endif
1032        st1             {v16.h}[0], [x0], #2
1033        b.gt            1b
1034
1035        ldp             x30, x19, [sp], #96
1036        AARCH64_VALIDATE_LINK_REGISTER
1037        ret
1038
1039L(generate_grain_\type\()_lag3):
1040        AARCH64_VALID_JUMP_TARGET
1041        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
1042        stp             d8,  d9,  [sp, #16]
1043        stp             d10, d11, [sp, #32]
1044        stp             d12, d13, [sp, #48]
1045        stp             d14, d15, [sp, #64]
1046        stp             x20, x21, [sp, #80]
1047
1048        smov            w4,  v30.b[5]
1049        smov            w20, v30.b[6]
1050        smov            w21, v30.b[7]
1051
1052        mov             w1,  #3
1053        bl              generate_grain_rows_neon
1054
1055        mov             w1,  #GRAIN_HEIGHT - 3
10561:
1057        bl              sum_\type\()_lag3_left_neon
1058        bl              sum_\type\()_lag3_mid_neon
1059        bl              sum_\type\()_lag3_mid_neon
1060        bl              sum_\type\()_lag3_mid_neon
1061        bl              sum_\type\()_lag3_right_neon
1062        get_grain_2     v16
1063        subs            w1,  w1,  #1
1064.ifc \type, uv_444
1065        add             x19, x19, #2
1066.endif
1067        st1             {v16.h}[0], [x0], #2
1068        b.gt            1b
1069
1070        ldp             x20, x21, [sp, #80]
1071        ldp             d14, d15, [sp, #64]
1072        ldp             d12, d13, [sp, #48]
1073        ldp             d10, d11, [sp, #32]
1074        ldp             d8,  d9,  [sp, #16]
1075        ldp             x30, x19, [sp], #96
1076        AARCH64_VALIDATE_LINK_REGISTER
1077        ret
1078
1079L(gen_grain_\type\()_tbl):
1080        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
1081        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
1082        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
1083        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
1084endfunc
1085.endm
1086
1087gen_grain_82 y
1088gen_grain_82 uv_444
1089
1090.macro set_height dst, type
1091.ifc \type, uv_420
1092        mov             \dst,  #SUB_GRAIN_HEIGHT-3
1093.else
1094        mov             \dst,  #GRAIN_HEIGHT-3
1095.endif
1096.endm
1097
1098.macro increment_y_ptr reg, type
1099.ifc \type, uv_420
1100        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
1101.else
1102        sub             \reg, \reg, #3*32-GRAIN_WIDTH
1103.endif
1104.endm
1105
1106.macro gen_grain_44 type
1107function generate_grain_\type\()_8bpc_neon, export=1
1108        AARCH64_SIGN_LINK_REGISTER
1109        stp             x30, x19, [sp, #-96]!
1110
1111        mov             w13, w3
1112        mov             w14, #28
1113        add             x19, x1,  #3*GRAIN_WIDTH-3
1114        mov             x1,  x2
1115        mul             w13, w13, w14
1116
1117        movrel          x3,  X(gaussian_sequence)
1118        ldr             w2,  [x1, #FGD_SEED]
1119        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
1120        add             x4,  x1,  #FGD_AR_COEFFS_UV
1121        adr             x16, L(gen_grain_\type\()_tbl)
1122        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
1123        add             w9,  w9,  #4
1124        ldrh            w17, [x16, w17, uxtw #1]
1125        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
1126        sub             x16, x16, w17, uxtw
1127        neg             v31.8h,  v31.8h
1128
1129        cmp             w13, #0
1130        mov             w11, #0x49d8
1131        mov             w14, #0xb524
1132        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
1133        csel            w11, w11, w14, ne
1134
1135        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
1136        mov             w8,  #1
1137        mov             w10, #1
1138        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
1139        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
1140        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
1141        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
1142        mov             w5,  #127
1143        mov             w6,  #-128
1144
1145        eor             w2,  w2,  w11
1146
1147        br              x16
1148
1149L(generate_grain_\type\()_lag0):
1150        AARCH64_VALID_JUMP_TARGET
1151        dup             v28.8h,  w7
1152        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
1153        movi            v0.16b,  #0
1154        movi            v1.16b,  #255
1155        ext             v29.16b, v0.16b,  v1.16b,  #13
1156        ext             v30.16b, v1.16b,  v0.16b,  #7
1157        neg             v28.8h,  v28.8h
1158
1159        mov             w1,  #3
1160        bl              generate_grain_rows_44_neon
1161        set_height      w1,  \type
11621:
1163        bl              get_grain_row_44_neon
1164.ifc \type, uv_420
1165        add             x12, x19, #GRAIN_WIDTH
1166.endif
1167        mov             v0.16b,  v29.16b
1168        mov             v1.16b,  v16.16b
1169        bl              add_\type\()_coeff_lag0_neon
1170        movi            v0.16b,  #255
1171        mov             v1.16b,  v17.16b
1172        mov             v16.16b, v2.16b
1173        bl              add_\type\()_coeff_lag0_neon
1174        mov             v0.16b,  v30.16b
1175        mov             v1.16b,  v18.16b
1176        mov             v17.16b, v2.16b
1177        bl              add_\type\()_coeff_lag0_neon
1178        mov             v18.16b, v2.16b
1179        subs            w1,  w1,  #1
1180        increment_y_ptr x19, \type
1181        store_grain_row_44 v16, v17, v18
1182        b.gt            1b
1183
1184        ldp             x30, x19, [sp], #96
1185        AARCH64_VALIDATE_LINK_REGISTER
1186        ret
1187
1188L(generate_grain_\type\()_lag1):
1189        AARCH64_VALID_JUMP_TARGET
1190        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
1191        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
1192        ld1r            {v29.16b}, [x4]     // ar_coeffs_uv[2]
1193        add             x4,  x4,  #2
1194
1195        mov             w1,  #3
1196        ld1r            {v30.16b}, [x4]     // ar_coeffs_u4[4]
1197        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
1198        bl              generate_grain_rows_44_neon
1199
1200        set_height      w1,  \type
12011:
1202        sum_\type\()_lag1 v20, v16, v16, v17, left
1203        sum_\type\()_lag1 v21, v16, v17, v18
1204        sum_\type\()_lag1 v18, v17, v18, v18, right
1205        subs            w1,  w1,  #1
1206        increment_y_ptr x19, \type
1207        store_grain_row_44 v20, v21, v18
1208        mov             v16.16b, v20.16b
1209        mov             v17.16b, v21.16b
1210        b.gt            1b
1211
1212        ldp             x30, x19, [sp], #96
1213        AARCH64_VALIDATE_LINK_REGISTER
1214        ret
1215
1216L(generate_grain_\type\()_lag2):
1217        AARCH64_VALID_JUMP_TARGET
1218        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
1219
1220        smov            w4,  v30.b[10]
1221        smov            w17, v30.b[11]
1222
1223        mov             w1,  #3
1224        bl              generate_grain_rows_44_neon
1225
1226        set_height      w1,  \type
12271:
1228        bl              sum_\type\()_lag2_left_neon
1229        bl              sum_\type\()_lag2_mid_neon
1230        bl              sum_\type\()_lag2_right_neon
1231        subs            w1,  w1,  #1
1232        increment_y_ptr x19, \type
1233        add             x0,  x0,  #GRAIN_WIDTH-48
1234        b.gt            1b
1235
1236        ldp             x30, x19, [sp], #96
1237        AARCH64_VALIDATE_LINK_REGISTER
1238        ret
1239
1240L(generate_grain_\type\()_lag3):
1241        AARCH64_VALID_JUMP_TARGET
1242        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
1243        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
1244        stp             d8,  d9,  [sp, #16]
1245        stp             d10, d11, [sp, #32]
1246        stp             d12, d13, [sp, #48]
1247        stp             d14, d15, [sp, #64]
1248        stp             x20, x21, [sp, #80]
1249
1250        smov            w4,  v30.b[5]
1251        smov            w20, v30.b[6]
1252        smov            w21, v30.b[7]
1253
1254        mov             w1,  #3
1255        bl              generate_grain_rows_44_neon
1256
1257        set_height      w1,  \type
12581:
1259        bl              sum_\type\()_lag3_left_neon
1260        bl              sum_\type\()_lag3_mid_neon
1261        bl              sum_\type\()_lag3_right_neon
1262        subs            w1,  w1,  #1
1263        increment_y_ptr x19, \type
1264        add             x0,  x0,  #GRAIN_WIDTH-48
1265        b.gt            1b
1266
1267        ldp             x20, x21, [sp, #80]
1268        ldp             d14, d15, [sp, #64]
1269        ldp             d12, d13, [sp, #48]
1270        ldp             d10, d11, [sp, #32]
1271        ldp             d8,  d9,  [sp, #16]
1272        ldp             x30, x19, [sp], #96
1273        AARCH64_VALIDATE_LINK_REGISTER
1274        ret
1275
1276L(gen_grain_\type\()_tbl):
1277        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
1278        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
1279        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
1280        .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
1281endfunc
1282.endm
1283
1284gen_grain_44 uv_420
1285gen_grain_44 uv_422
1286
1287.macro gather_interleaved dst1, dst2, src1, src2, off
1288        umov            w14, \src1[0+\off]
1289        umov            w15, \src2[8+\off]
1290        umov            w16, \src1[2+\off]
1291        add             x14, x14, x3
1292        umov            w17, \src2[10+\off]
1293        add             x15, x15, x3
1294        ld1             {\dst1}[0+\off],  [x14]
1295        umov            w14, \src1[4+\off]
1296        add             x16, x16, x3
1297        ld1             {\dst2}[8+\off],  [x15]
1298        umov            w15, \src2[12+\off]
1299        add             x17, x17, x3
1300        ld1             {\dst1}[2+\off],  [x16]
1301        umov            w16, \src1[6+\off]
1302        add             x14, x14, x3
1303        ld1             {\dst2}[10+\off], [x17]
1304        umov            w17, \src2[14+\off]
1305        add             x15, x15, x3
1306        ld1             {\dst1}[4+\off],  [x14]
1307        add             x16, x16, x3
1308        ld1             {\dst2}[12+\off], [x15]
1309        add             x17, x17, x3
1310        ld1             {\dst1}[6+\off],  [x16]
1311        ld1             {\dst2}[14+\off], [x17]
1312.endm
1313
1314.macro gather dst1, dst2, src1, src2
1315        gather_interleaved \dst1, \dst2, \src1, \src2, 0
1316        gather_interleaved \dst2, \dst1, \src2, \src1, 0
1317        gather_interleaved \dst1, \dst2, \src1, \src2, 1
1318        gather_interleaved \dst2, \dst1, \src2, \src1, 1
1319.endm
1320
1321function gather32_neon
1322        gather          v4.b, v5.b, v0.b, v1.b
1323        ret
1324endfunc
1325
1326function gather16_neon
1327        gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
1328        gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
1329        ins             v4.d[1], v5.d[1]
1330        ret
1331endfunc
1332
1333const overlap_coeffs_0, align=4
1334        .byte 27, 17, 0,  0,  0,  0,  0,  0
1335        .byte 17, 27, 32, 32, 32, 32, 32, 32
1336endconst
1337
1338const overlap_coeffs_1, align=4
1339        .byte 23, 0,  0,  0,  0,  0,  0,  0
1340        .byte 22, 32, 32, 32, 32, 32, 32, 32
1341endconst
1342
1343.macro calc_offset offx, offy, src, sx, sy
1344        and             \offy, \src,  #0xF     // randval & 0xF
1345        lsr             \offx, \src,  #4       // randval >> 4
1346.if \sy == 0
1347        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
1348.endif
1349.if \sx == 0
1350        add             \offx, \offx, \offx    // 2 * (randval >> 4)
1351.endif
1352.endm
1353
1354.macro add_offset dst, offx, offy, src, stride
1355        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
1356        add             \dst, \dst, \offx, uxtw // grain_lut += offx
1357.endm
1358
1359// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
1360//                                const ptrdiff_t stride,
1361//                                const uint8_t scaling[SCALING_SIZE],
1362//                                const int scaling_shift,
1363//                                const entry grain_lut[][GRAIN_WIDTH],
1364//                                const int offsets[][2],
1365//                                const int h, const ptrdiff_t clip,
1366//                                const ptrdiff_t type);
1367function fgy_32x32_8bpc_neon, export=1
1368        AARCH64_SIGN_LINK_REGISTER
1369        str             x30, [sp, #-16]!
1370        ldr             w11, [x6, #8]          // offsets[1][0]
1371        ldr             w13, [x6, #4]          // offsets[0][1]
1372        ldr             w15, [x6, #12]         // offsets[1][1]
1373        ldr             w6,  [x6]              // offsets[0][0]
1374        ldr             w8,  [sp, #16]         // clip
1375        mov             x9,  #GRAIN_WIDTH      // grain_lut stride
1376
1377        neg             w4,  w4
1378        dup             v29.8h,  w4            // -scaling_shift
1379
1380        movrel          x16, overlap_coeffs_0
1381
1382        cbz             w8,  1f
1383        // clip
1384        movi            v30.16b, #16
1385        movi            v31.16b, #235
1386        b               2f
13871:
1388        // no clip
1389        movi            v30.16b, #0
1390        movi            v31.16b, #255
13912:
1392
1393        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
1394
1395        add             x5,  x5,  #9           // grain_lut += 9
1396        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
1397        add             x5,  x5,  x9           // grain_lut += grain_stride
1398
1399        calc_offset     w11, w12, w11, 0,  0
1400        calc_offset     w13, w14, w13, 0,  0
1401        calc_offset     w15, w16, w15, 0,  0
1402        calc_offset     w6,  w10, w6,  0,  0
1403
1404        add_offset      x12, w11, x12, x5,  x9
1405        add_offset      x14, w13, x14, x5,  x9
1406        add_offset      x16, w15, x16, x5,  x9
1407        add_offset      x5,  w6,  x10, x5,  x9
1408
1409        ldr             w11, [sp, #24]         // type
1410        adr             x13, L(fgy_loop_tbl)
1411
1412        add             x4,  x12, #32          // grain_lut += FG_BLOCK_SIZE * bx
1413        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1414
1415        tst             w11, #1
1416        ldrh            w11, [x13, w11, uxtw #1]
1417
1418        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1419        add             x8,  x8,  #32          // grain_lut += FG_BLOCK_SIZE * bx
1420
1421        sub             x11, x13, w11, uxtw
1422
1423        b.eq            1f
1424        // y overlap
1425        dup             v6.16b,  v27.b[0]      // first overlap row: weight for the top grain (27)
1426        dup             v7.16b,  v27.b[1]      // first overlap row: weight for the current grain (17)
1427        mov             w10, w7                // backup actual h
1428        mov             w7,  #2                // handle the two vertically overlapping rows first
14291:
1430        br              x11
1431endfunc
1432
1433function fgy_loop_neon
1434.macro fgy ox, oy
1435L(loop_\ox\oy):
1436        AARCH64_VALID_JUMP_TARGET
14371:
1438        ld1             {v0.16b,  v1.16b},  [x1],  x2 // src
1439.if \ox
1440        ld1             {v20.8b},           [x4],  x9 // grain_lut old
1441.endif
1442.if \oy
1443        ld1             {v22.16b, v23.16b}, [x6],  x9 // grain_lut top
1444.endif
1445.if \ox && \oy
1446        ld1             {v21.8b},           [x8],  x9 // grain_lut top old
1447.endif
1448        ld1             {v18.16b, v19.16b}, [x5],  x9 // grain_lut
1449
1450        bl              gather32_neon
1451
1452.if \ox
1453        smull           v20.8h,  v20.8b,  v27.8b
1454        smlal           v20.8h,  v18.8b,  v28.8b
1455.endif
1456
1457.if \oy
1458.if \ox
1459        smull           v21.8h,  v21.8b,  v27.8b
1460        smlal           v21.8h,  v22.8b,  v28.8b
1461        sqrshrn         v20.8b,  v20.8h,  #5
1462        sqrshrn         v21.8b,  v21.8h,  #5
1463.endif
1464
1465.if \ox
1466        smull           v16.8h,  v20.8b,  v7.8b
1467.else
1468        smull           v16.8h,  v18.8b,  v7.8b
1469.endif
1470        smull2          v17.8h,  v18.16b, v7.16b
1471        smull           v18.8h,  v19.8b,  v7.8b
1472        smull2          v19.8h,  v19.16b, v7.16b
1473.if \ox
1474        smlal           v16.8h,  v21.8b,  v6.8b
1475.else
1476        smlal           v16.8h,  v22.8b,  v6.8b
1477.endif
1478        smlal2          v17.8h,  v22.16b, v6.16b
1479        smlal           v18.8h,  v23.8b,  v6.8b
1480        smlal2          v19.8h,  v23.16b, v6.16b
1481        sqrshrn         v22.8b,  v16.8h,  #5
1482        sqrshrn2        v22.16b, v17.8h,  #5
1483        sqrshrn         v23.8b,  v18.8h,  #5
1484        sqrshrn2        v23.16b, v19.8h,  #5
1485.endif
1486
1487        // sxtl of grain
1488.if \oy
1489        sxtl            v16.8h,  v22.8b
1490        sxtl2           v17.8h,  v22.16b
1491        sxtl            v18.8h,  v23.8b
1492        sxtl2           v19.8h,  v23.16b
1493.elseif \ox
1494        sqrshrn         v20.8b,  v20.8h,  #5
1495        sxtl2           v17.8h,  v18.16b
1496        sxtl            v18.8h,  v19.8b
1497        sxtl2           v19.8h,  v19.16b
1498        sxtl            v16.8h,  v20.8b
1499.else
1500        sxtl            v16.8h,  v18.8b
1501        sxtl2           v17.8h,  v18.16b
1502        sxtl            v18.8h,  v19.8b
1503        sxtl2           v19.8h,  v19.16b
1504.endif
1505
1506        uxtl            v2.8h,   v4.8b   // scaling
1507        uxtl2           v3.8h,   v4.16b
1508        uxtl            v4.8h,   v5.8b
1509        uxtl2           v5.8h,   v5.16b
1510
1511        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
1512        mul             v17.8h,  v17.8h,  v3.8h
1513        mul             v18.8h,  v18.8h,  v4.8h
1514        mul             v19.8h,  v19.8h,  v5.8h
1515
1516        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
1517        srshl           v17.8h,  v17.8h,  v29.8h
1518        srshl           v18.8h,  v18.8h,  v29.8h
1519        srshl           v19.8h,  v19.8h,  v29.8h
1520
1521        uaddw           v16.8h,  v16.8h,  v0.8b   // *src + noise
1522        uaddw2          v17.8h,  v17.8h,  v0.16b
1523        uaddw           v18.8h,  v18.8h,  v1.8b
1524        uaddw2          v19.8h,  v19.8h,  v1.16b
1525
1526        sqxtun          v0.8b,   v16.8h
1527        sqxtun2         v0.16b,  v17.8h
1528        sqxtun          v1.8b,   v18.8h
1529        sqxtun2         v1.16b,  v19.8h
1530
1531        umax            v0.16b,  v0.16b,  v30.16b
1532        umax            v1.16b,  v1.16b,  v30.16b
1533        umin            v0.16b,  v0.16b,  v31.16b
1534        umin            v1.16b,  v1.16b,  v31.16b
1535
1536        subs            w7,  w7,  #1
1537.if \oy
1538        dup             v6.16b,  v28.b[0]      // second overlap row: weight for the top grain (17)
1539        dup             v7.16b,  v28.b[1]      // second overlap row: weight for the current grain (27)
1540.endif
1541        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
1542        b.gt            1b
1543
1544.if \oy
1545        cmp             w10, #2
1546        sub             w7,  w10, #2           // restore actual remaining h
1547        b.gt            L(loop_\ox\()0)
1548.endif
1549        ldr             x30, [sp], #16
1550        AARCH64_VALIDATE_LINK_REGISTER
1551        ret
1552.endm
1553
1554        fgy             0, 0
1555        fgy             0, 1
1556        fgy             1, 0
1557        fgy             1, 1
1558
1559L(fgy_loop_tbl):
1560        .hword L(fgy_loop_tbl) - L(loop_00)
1561        .hword L(fgy_loop_tbl) - L(loop_01)
1562        .hword L(fgy_loop_tbl) - L(loop_10)
1563        .hword L(fgy_loop_tbl) - L(loop_11)
1564endfunc
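// The table above dispatches on the 'type' argument; from the table layout,
// bit 0 appears to select the vertical (top-edge) overlap variant and bit 1
// the horizontal (left-edge) one.  Hedged sketch of the equivalent dispatch:
//
//     const int oy =  type       & 1;   // blend with the grain rows above
//     const int ox = (type >> 1) & 1;   // blend with the grain columns to the left
//     // -> branch to L(loop_<ox><oy>)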
1565
1566// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
1567//                                     const pixel *const src,
1568//                                     const ptrdiff_t stride,
1569//                                     const uint8_t scaling[SCALING_SIZE],
1570//                                     const Dav1dFilmGrainData *const data,
1571//                                     const entry grain_lut[][GRAIN_WIDTH],
1572//                                     const pixel *const luma_row,
1573//                                     const ptrdiff_t luma_stride,
1574//                                     const int offsets[][2],
1575//                                     const ptrdiff_t h, const ptrdiff_t uv,
1576//                                     const ptrdiff_t is_id,
1577//                                     const ptrdiff_t type);
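//
// The chroma path differs from luma in how the scaling index is formed: with
// chroma-scaling-from-luma (csfl) the (horizontally averaged, when
// subsampled) luma pixel indexes scaling[] directly, otherwise the index is
// derived from both planes.  Hedged 8 bpc sketch of the !csfl mapping done by
// the mul/sqadd/sshr/add/sqxtun sequence below (illustrative helper name):
//
//     static int fguv_scaling_index(int luma_avg, int chroma_src,
//                                   int uv_luma_mult, int uv_mult, int uv_offset)
//     {
//         int t = ((luma_avg * uv_luma_mult + chroma_src * uv_mult) >> 6) + uv_offset;
//         return t < 0 ? 0 : t > 255 ? 255 : t;   // the sqxtun clamp
//     }
//
// uv_luma_mult, uv_mult and uv_offset come from the Dav1dFilmGrainData entry
// selected by the 'uv' argument; the add-noise and clamp steps then match the
// luma loop.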
1578.macro fguv layout, sx, sy
1579function fguv_32x32_\layout\()_8bpc_neon, export=1
1580        AARCH64_SIGN_LINK_REGISTER
1581        str             x30,      [sp, #-32]!
1582        str             d8,       [sp, #16]
1583        ldp             x8,  x9,  [sp, #32]    // offsets, h
1584        ldp             x10, x11, [sp, #48]    // uv, is_id
1585
1586        ldr             w13, [x4, #FGD_SCALING_SHIFT]
1587        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
1588        neg             w13, w13               // -scaling_shift
1589
1590        // !csfl
1591        add             x10, x4,  x10, lsl #2  // + 4*uv
1592        add             x14, x10, #FGD_UV_LUMA_MULT
1593        add             x15, x10, #FGD_UV_MULT
1594        add             x10, x10, #FGD_UV_OFFSET
1595        ld1             {v8.h}[0], [x14]       // uv_luma_mult
1596        ld1r            {v24.8h},  [x10]       // uv_offset
1597        ld1             {v8.h}[1], [x15]       // uv_mult
1598
1599        dup             v29.8h,  w13           // -scaling_shift
1600
1601        cbz             w12, 1f
1602        // clip
1603        movi            v30.16b, #16
1604        movi            v31.16b, #240
1605        cbz             w11, 2f
1606        // is_id
1607        movi            v31.16b, #235
1608        b               2f
16091:
1610        // no clip
1611        movi            v30.16b, #0
1612        movi            v31.16b, #255
16132:
1614
1615        ldr             w12, [x8, #8]          // offsets[1][0]
1616        ldr             w14, [x8, #4]          // offsets[0][1]
1617        ldr             w16, [x8, #12]         // offsets[1][1]
1618        ldr             w8,  [x8]              // offsets[0][0]
1619
1620        mov             x10, #GRAIN_WIDTH      // grain_lut stride
1621
1622        add             x5,  x5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
1623.if \sy
1624        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
1625        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
1626.else
1627        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
1628        add             x5,  x5,  x10          // grain_lut += grain_stride
1629.endif
1630
1631        calc_offset     w12, w13, w12, \sx, \sy
1632        calc_offset     w14, w15, w14, \sx, \sy
1633        calc_offset     w16, w17, w16, \sx, \sy
1634        calc_offset     w8,  w11, w8,  \sx, \sy
1635
1636        add_offset      x13, w12, x13, x5,  x10
1637        add_offset      x15, w14, x15, x5,  x10
1638        add_offset      x17, w16, x17, x5,  x10
1639        add_offset      x5,  w8,  x11, x5,  x10
1640
1641        add             x4,  x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
1642        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1643        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
1644        add             x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
1645
1646        ldr             w13, [sp, #64]         // type
1647
1648        movrel          x16, overlap_coeffs_\sx
1649        adr             x14, L(fguv_loop_sx\sx\()_tbl)
1650
1651        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
1652        tst             w13, #1
1653        ldrh            w13, [x14, w13, uxtw #1]
1654
1655        b.eq            1f
1656        // y overlap
1657        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
1658        mov             w9,  #(2 >> \sy)
1659
16601:
1661        sub             x13, x14, w13, uxtw
1662
1663.if \sy
1664        movi            v25.16b, #23           // y overlap weight for the top grain (vertically subsampled chroma)
1665        movi            v26.16b, #22           // y overlap weight for the current grain
1666.else
1667        movi            v25.16b, #27           // same y overlap weights as luma
1668        movi            v26.16b, #17
1669.endif
1670
1671.if \sy
1672        add             x7,  x7,  x7           // luma_stride *= 2
1673.endif
1674
1675        br              x13
1676endfunc
1677.endm
1678
1679fguv 420, 1, 1
1680fguv 422, 1, 0
1681fguv 444, 0, 0
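// \sx/\sy are the chroma subsampling shifts (420: 1,1; 422: 1,0; 444: 0,0).
// They scale the block dimensions (32 >> sx wide, 32 >> sy tall) and the
// constant grain_lut origin adjustment made in the prologue above:
//
//     x: 3 + (2 >> sx) * 3  ->  9 pixels when sx == 0, 6 when sx == 1
//     y: 3 + (2 >> sy) * 3  ->  9 rows   when sy == 0, 6 when sy == 1
//                               (added as 8 + 1 or 4 + 2 strides above)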
1682
1683function fguv_loop_sx0_neon
1684.macro fguv_loop_sx0 csfl, ox, oy
1685L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
1686        AARCH64_VALID_JUMP_TARGET
16871:
1688        ld1             {v0.16b,  v1.16b},  [x6],  x7  // luma
1689        ld1             {v6.16b,  v7.16b},  [x1],  x2  // src
1690.if \ox
1691        ld1             {v20.8b},           [x4],  x10 // grain_lut old
1692.endif
1693.if \oy
1694        ld1             {v22.16b, v23.16b}, [x8],  x10 // grain_lut top
1695.endif
1696.if \ox && \oy
1697        ld1             {v21.8b},           [x11], x10 // grain_lut top old
1698.endif
1699        ld1             {v18.16b, v19.16b}, [x5],  x10 // grain_lut
1700
1701.if !\csfl
1702        uxtl            v2.8h,   v0.8b
1703        uxtl2           v3.8h,   v0.16b
1704        uxtl            v4.8h,   v1.8b
1705        uxtl2           v5.8h,   v1.16b
1706        uxtl            v0.8h,   v6.8b
1707        uxtl2           v1.8h,   v6.16b
1708        uxtl            v16.8h,  v7.8b
1709        uxtl2           v17.8h,  v7.16b
1710        mul             v2.8h,   v2.8h,   v8.h[0]
1711        mul             v3.8h,   v3.8h,   v8.h[0]
1712        mul             v4.8h,   v4.8h,   v8.h[0]
1713        mul             v5.8h,   v5.8h,   v8.h[0]
1714        mul             v0.8h,   v0.8h,   v8.h[1]
1715        mul             v1.8h,   v1.8h,   v8.h[1]
1716        mul             v16.8h,  v16.8h,  v8.h[1]
1717        mul             v17.8h,  v17.8h,  v8.h[1]
1718        sqadd           v2.8h,   v2.8h,   v0.8h
1719        sqadd           v3.8h,   v3.8h,   v1.8h
1720        sqadd           v4.8h,   v4.8h,   v16.8h
1721        sqadd           v5.8h,   v5.8h,   v17.8h
1722        sshr            v2.8h,   v2.8h,   #6
1723        sshr            v3.8h,   v3.8h,   #6
1724        sshr            v4.8h,   v4.8h,   #6
1725        sshr            v5.8h,   v5.8h,   #6
1726        add             v2.8h,   v2.8h,   v24.8h
1727        add             v3.8h,   v3.8h,   v24.8h
1728        add             v4.8h,   v4.8h,   v24.8h
1729        add             v5.8h,   v5.8h,   v24.8h
1730        sqxtun          v0.8b,   v2.8h
1731        sqxtun2         v0.16b,  v3.8h
1732        sqxtun          v1.8b,   v4.8h
1733        sqxtun2         v1.16b,  v5.8h
1734.endif
1735
1736        bl              gather32_neon
1737
1738.if \ox
1739        smull           v20.8h,  v20.8b,  v27.8b
1740        smlal           v20.8h,  v18.8b,  v28.8b
1741.endif
1742
1743.if \oy
1744.if \ox
1745        smull           v21.8h,  v21.8b,  v27.8b
1746        smlal           v21.8h,  v22.8b,  v28.8b
1747        sqrshrn         v20.8b,  v20.8h,  #5
1748        sqrshrn         v21.8b,  v21.8h,  #5
1749.endif
1750
1751.if \ox
1752        smull           v16.8h,  v20.8b,  v26.8b
1753.else
1754        smull           v16.8h,  v18.8b,  v26.8b
1755.endif
1756        smull2          v17.8h,  v18.16b, v26.16b
1757        smull           v18.8h,  v19.8b,  v26.8b
1758        smull2          v19.8h,  v19.16b, v26.16b
1759.if \ox
1760        smlal           v16.8h,  v21.8b,  v25.8b
1761.else
1762        smlal           v16.8h,  v22.8b,  v25.8b
1763.endif
1764        smlal2          v17.8h,  v22.16b, v25.16b
1765        smlal           v18.8h,  v23.8b,  v25.8b
1766        smlal2          v19.8h,  v23.16b, v25.16b
1767        sqrshrn         v22.8b,  v16.8h,  #5
1768        sqrshrn2        v22.16b, v17.8h,  #5
1769        sqrshrn         v23.8b,  v18.8h,  #5
1770        sqrshrn2        v23.16b, v19.8h,  #5
1771.endif
1772
1773        // sxtl of grain
1774.if \oy
1775        sxtl            v16.8h,  v22.8b
1776        sxtl2           v17.8h,  v22.16b
1777        sxtl            v18.8h,  v23.8b
1778        sxtl2           v19.8h,  v23.16b
1779.elseif \ox
1780        sqrshrn         v20.8b,  v20.8h,  #5
1781        sxtl2           v17.8h,  v18.16b
1782        sxtl            v18.8h,  v19.8b
1783        sxtl2           v19.8h,  v19.16b
1784        sxtl            v16.8h,  v20.8b
1785.else
1786        sxtl            v16.8h,  v18.8b
1787        sxtl2           v17.8h,  v18.16b
1788        sxtl            v18.8h,  v19.8b
1789        sxtl2           v19.8h,  v19.16b
1790.endif
1791
1792        uxtl            v2.8h,   v4.8b   // scaling
1793        uxtl2           v3.8h,   v4.16b
1794        uxtl            v4.8h,   v5.8b
1795        uxtl2           v5.8h,   v5.16b
1796
1797        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
1798        mul             v17.8h,  v17.8h,  v3.8h
1799        mul             v18.8h,  v18.8h,  v4.8h
1800        mul             v19.8h,  v19.8h,  v5.8h
1801
1802        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
1803        srshl           v17.8h,  v17.8h,  v29.8h
1804        srshl           v18.8h,  v18.8h,  v29.8h
1805        srshl           v19.8h,  v19.8h,  v29.8h
1806
1807        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
1808        uaddw2          v17.8h,  v17.8h,  v6.16b
1809        uaddw           v18.8h,  v18.8h,  v7.8b
1810        uaddw2          v19.8h,  v19.8h,  v7.16b
1811
1812        sqxtun          v0.8b,   v16.8h
1813        sqxtun2         v0.16b,  v17.8h
1814        sqxtun          v1.8b,   v18.8h
1815        sqxtun2         v1.16b,  v19.8h
1816
1817        umax            v0.16b,  v0.16b,  v30.16b
1818        umax            v1.16b,  v1.16b,  v30.16b
1819        umin            v0.16b,  v0.16b,  v31.16b
1820        umin            v1.16b,  v1.16b,  v31.16b
1821
1822        subs            w9,  w9,  #1
1823.if \oy
1824        dup             v25.16b, v28.b[0]      // second overlap row: weight for the top grain (17)
1825        dup             v26.16b, v28.b[1]      // second overlap row: weight for the current grain (27)
1826.endif
1827        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
1828        b.gt            1b
1829
1830.if \oy
1831        cmp             w12, #0
1832        mov             w9,  w12               // restore actual remaining h
1833        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
1834.endif
1835        b               9f
1836.endm
1837        fguv_loop_sx0   0, 0, 0
1838        fguv_loop_sx0   0, 0, 1
1839        fguv_loop_sx0   0, 1, 0
1840        fguv_loop_sx0   0, 1, 1
1841        fguv_loop_sx0   1, 0, 0
1842        fguv_loop_sx0   1, 0, 1
1843        fguv_loop_sx0   1, 1, 0
1844        fguv_loop_sx0   1, 1, 1
1845
18469:
1847        ldr             d8,       [sp, #16]
1848        ldr             x30,      [sp], #32
1849        AARCH64_VALIDATE_LINK_REGISTER
1850        ret
1851
1852L(fguv_loop_sx0_tbl):
1853        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
1854        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
1855        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
1856        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
1857        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
1858        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
1859        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
1860        .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
1861endfunc
1862
1863function fguv_loop_sx1_neon
1864.macro fguv_loop_sx1 csfl, ox, oy
1865L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
1866        AARCH64_VALID_JUMP_TARGET
18671:
1868        ld1             {v0.16b, v1.16b},  [x6],  x7  // luma
1869        ld1             {v6.16b},          [x1],  x2  // src
1870.if \ox
1871        ld1             {v20.8b},          [x4],  x10 // grain_lut old
1872.endif
1873.if \oy
1874        ld1             {v22.16b},         [x8],  x10 // grain_lut top
1875.endif
1876.if \ox && \oy
1877        ld1             {v21.8b},          [x11], x10 // grain_lut top old
1878.endif
1879        ld1             {v18.16b},         [x5],  x10 // grain_lut
1880
1881        uaddlp          v2.8h,   v0.16b
1882        uaddlp          v3.8h,   v1.16b
1883.if \csfl
1884        rshrn           v0.8b,   v2.8h,   #1
1885        rshrn2          v0.16b,  v3.8h,   #1
1886.else
1887        urshr           v2.8h,   v2.8h,   #1
1888        urshr           v3.8h,   v3.8h,   #1
1889        uxtl            v0.8h,   v6.8b
1890        uxtl2           v1.8h,   v6.16b
1891        mul             v2.8h,   v2.8h,   v8.h[0]
1892        mul             v3.8h,   v3.8h,   v8.h[0]
1893        mul             v0.8h,   v0.8h,   v8.h[1]
1894        mul             v1.8h,   v1.8h,   v8.h[1]
1895        sqadd           v2.8h,   v2.8h,   v0.8h
1896        sqadd           v3.8h,   v3.8h,   v1.8h
1897        sshr            v2.8h,   v2.8h,   #6
1898        sshr            v3.8h,   v3.8h,   #6
1899        add             v2.8h,   v2.8h,   v24.8h
1900        add             v3.8h,   v3.8h,   v24.8h
1901        sqxtun          v0.8b,   v2.8h
1902        sqxtun2         v0.16b,  v3.8h
1903.endif
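//
// For horizontally subsampled chroma, the two luma pixels covering each
// chroma sample are first combined with a rounded average; the uaddlp +
// rshrn/urshr #1 pairs above compute
//
//     luma_avg = (luma[2*x] + luma[2*x + 1] + 1) >> 1
//
// which either indexes scaling[] directly (csfl) or feeds the
// uv_luma_mult/uv_mult mapping (!csfl).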
1904
1905        bl              gather16_neon
1906
1907.if \ox
1908        smull           v20.8h,  v20.8b,  v27.8b
1909        smlal           v20.8h,  v18.8b,  v28.8b
1910.endif
1911
1912.if \oy
1913.if \ox
1914        smull           v21.8h,  v21.8b,  v27.8b
1915        smlal           v21.8h,  v22.8b,  v28.8b
1916        sqrshrn         v20.8b,  v20.8h,  #5
1917        sqrshrn         v21.8b,  v21.8h,  #5
1918.endif
1919
1920.if \ox
1921        smull           v16.8h,  v20.8b,  v26.8b
1922.else
1923        smull           v16.8h,  v18.8b,  v26.8b
1924.endif
1925        smull2          v17.8h,  v18.16b, v26.16b
1926.if \ox
1927        smlal           v16.8h,  v21.8b,  v25.8b
1928.else
1929        smlal           v16.8h,  v22.8b,  v25.8b
1930.endif
1931        smlal2          v17.8h,  v22.16b, v25.16b
1932        sqrshrn         v22.8b,  v16.8h,  #5
1933        sqrshrn2        v22.16b, v17.8h,  #5
1934.endif
1935
1936        // sxtl of grain
1937.if \oy
1938        sxtl            v16.8h,  v22.8b
1939        sxtl2           v17.8h,  v22.16b
1940.elseif \ox
1941        sqrshrn         v20.8b,  v20.8h,  #5
1942        sxtl2           v17.8h,  v18.16b
1943        sxtl            v16.8h,  v20.8b
1944.else
1945        sxtl            v16.8h,  v18.8b
1946        sxtl2           v17.8h,  v18.16b
1947.endif
1948
1949        uxtl            v2.8h,   v4.8b   // scaling
1950        uxtl2           v3.8h,   v4.16b
1951
1952        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
1953        mul             v17.8h,  v17.8h,  v3.8h
1954
1955        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
1956        srshl           v17.8h,  v17.8h,  v29.8h
1957
1958        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
1959        uaddw2          v17.8h,  v17.8h,  v6.16b
1960
1961        sqxtun          v0.8b,   v16.8h
1962        sqxtun2         v0.16b,  v17.8h
1963
1964        umax            v0.16b,  v0.16b,  v30.16b
1965        umin            v0.16b,  v0.16b,  v31.16b
1966
1967.if \oy
1968        mov             v16.16b, v25.16b       // stash v25 so the overlap weights can be swapped
1969.endif
1970        subs            w9,  w9,  #1
1971.if \oy
1972        mov             v25.16b, v26.16b       // swap the weights so a second overlap row (422 only)
1973        mov             v26.16b, v16.16b       // weights the top/current grain 17/27
1974.endif
1975        st1             {v0.16b},  [x0], x2 // dst
1976        b.gt            1b
1977
1978.if \oy
1979        cmp             w12, #0
1980        mov             w9,  w12               // restore actual remaining h
1981        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
1982.endif
1983
1984        b               9f
1985.endm
1986        fguv_loop_sx1   0, 0, 0
1987        fguv_loop_sx1   0, 0, 1
1988        fguv_loop_sx1   0, 1, 0
1989        fguv_loop_sx1   0, 1, 1
1990        fguv_loop_sx1   1, 0, 0
1991        fguv_loop_sx1   1, 0, 1
1992        fguv_loop_sx1   1, 1, 0
1993        fguv_loop_sx1   1, 1, 1
1994
19959:
1996        ldr             d8,       [sp, #16]
1997        ldr             x30,      [sp], #32
1998        AARCH64_VALIDATE_LINK_REGISTER
1999        ret
2000
2001L(fguv_loop_sx1_tbl):
2002        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
2003        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
2004        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
2005        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
2006        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
2007        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
2008        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
2009        .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
2010endfunc
2011