• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright © 2021, VideoLAN and dav1d authors
3 * Copyright © 2021, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30#include "src/arm/asm-offsets.h"
31
32#define GRAIN_WIDTH 82
33#define GRAIN_HEIGHT 73
34
35#define SUB_GRAIN_WIDTH 44
36#define SUB_GRAIN_HEIGHT 38
37
// Advance the 16-bit LFSR grain state held in r2 by \steps bits at once.
// The per-bit feedback is (r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12),
// computed here for \steps bits in parallel. With shift=0 the state is
// not shifted down; the new feedback bits are inserted above bit 16 and
// the caller compensates for the missing shift afterwards.
// Clobbers: r11, r12, lr.
38.macro increment_seed steps, shift=1
39        lsr             r11, r2,  #3
40        lsr             r12, r2,  #12
41        lsr             lr,  r2,  #1
42        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
43        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
44        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
45.if \shift
46        lsr             r2,  r2,  #\steps
47.endif
48        and             r11, r11, #((1 << \steps) - 1)    // bit
49.if \shift
50        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
51.else
52        orr             r2,  r2,  r11, lsl #16            // *state
53.endif
54.endm
55
// Extract a \bits-bit random value from the state in r2 without changing it.
// \age selects how many future increment steps this value corresponds to,
// so several values can be pulled out of one batched increment_seed.
56.macro read_rand dest, bits, age
57        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
58.endm
59
// Extract a \bits-bit random value from the state in r2 and consume one
// state bit (the caller has already inserted the feedback bits above).
60.macro read_shift_rand dest, bits
61        ubfx            \dest,  r2,   #17 - \bits, #\bits
62        lsr             r2,  r2,  #1
63.endm
64
// Produce 8 gaussian grain values: draw eight 11-bit random indices
// (two batches of increment_seed 4) and gather int16 entries from the
// gaussian sequence table into d0-d1. The scatter of loads between
// index computations hides load latency.
65// special calling convention:
66// r2 holds seed
67// r3 holds dav1d_gaussian_sequence
68// clobbers r11-r12
69// returns in d0-d1
70function get_gaussian_neon
71        push            {r5-r6,lr}
72        increment_seed  4
73        read_rand       r5,  11,  3
74        read_rand       r6,  11,  2
75        add             r5,  r3,  r5,  lsl #1
76        add             r6,  r3,  r6,  lsl #1
77        vld1.16         {d0[0]}, [r5]
78        read_rand       r5,  11,  1
79        vld1.16         {d0[1]}, [r6]
80        add             r5,  r3,  r5,  lsl #1
81        read_rand       r6, 11,  0
82        increment_seed  4
83        add             r6,  r3,  r6,  lsl #1
84        vld1.16         {d0[2]}, [r5]
85        read_rand       r5,  11,  3
86        vld1.16         {d0[3]}, [r6]
87        add             r5,  r3,  r5,  lsl #1
88        read_rand       r6,  11,  2
89        vld1.16         {d1[0]}, [r5]
90        add             r6,  r3,  r6,  lsl #1
91        read_rand       r5,  11,  1
92        vld1.16         {d1[1]}, [r6]
93        read_rand       r6,  11,  0
94        add             r5,  r3,  r5,  lsl #1
95        add             r6,  r3,  r6,  lsl #1
96        vld1.16         {d1[2]}, [r5]
97        vld1.16         {d1[3]}, [r6]
98        pop             {r5-r6,pc}
99endfunc
100
// Produce 2 grain values in d0[0..1]: two 11-bit table lookups followed
// by a rounding shift. d30 is expected to hold the (negated) grain scale
// shift so vrshl performs a rounding right shift — set up by the caller,
// see the q15 setup in the generate_grain entry code.
// Clobbers: r11, r12 (r11 is preserved via the stack).
101function get_grain_2_neon
102        push            {r11,lr}
103        increment_seed  2
104        read_rand       r11, 11,  1
105        read_rand       r12, 11,  0
106        add             r11, r3,  r11, lsl #1
107        add             r12, r3,  r12, lsl #1
108        vld1.16         {d0[0]}, [r11]
109        vld1.16         {d0[1]}, [r12]
110        vrshl.s16       d0,  d0,  d30
111        pop             {r11,pc}
112endfunc
113
// Wrapper: call get_grain_2_neon and move the result into \dst
// (the vmov is skipped when \dst is already d0).
114.macro get_grain_2 dst
115        bl              get_grain_2_neon
116.ifnc \dst, d0
117        vmov            \dst, d0
118.endif
119.endm
120
// Produce 4 grain values in d0: four 11-bit table lookups followed by a
// rounding shift via d30 (negated shift amount, as in get_grain_2_neon).
// Clobbers: r11, r12 (r11 is preserved via the stack).
121function get_grain_4_neon
122        push            {r11,lr}
123        increment_seed  4
124        read_rand       r11, 11,  3
125        read_rand       r12, 11,  2
126        add             r11, r3,  r11, lsl #1
127        add             r12, r3,  r12, lsl #1
128        vld1.16         {d0[0]}, [r11]
129        read_rand       r11, 11,  1
130        vld1.16         {d0[1]}, [r12]
131        read_rand       r12, 11,  0
132        add             r11, r3,  r11, lsl #1
133        add             r12, r3,  r12, lsl #1
134        vld1.16         {d0[2]}, [r11]
135        vld1.16         {d0[3]}, [r12]
136        vrshl.s16       d0,  d0,  d30
137        pop             {r11,pc}
138endfunc
139
// Wrapper: call get_grain_4_neon and move the result into \dst
// (the vmov is skipped when \dst is already d0).
140.macro get_grain_4 dst
141        bl              get_grain_4_neon
142.ifnc \dst, d0
143        vmov            \dst, d0
144.endif
145.endm
146
// Scalar AR filter tail for lag 1/2/3: for each of the r1 entries, add
// coeff * previous-output terms to the precomputed "above" sum (rotated
// through d2[0] from q1), add the rounded/shifted gaussian noise term,
// clamp to [grain_min, grain_max] = [~r5, r5], and shift the result into
// q0. For n >= 2 the rounding constants are rebuilt locally in r0/lr; for
// n == 3 the three coefficients are unpacked from the packed bytes in r4.
147// r1 holds the number of entries to produce
148// r6, r8 and r10 hold the previous output entries
149// q0 holds the vector of produced entries
150// q1 holds the input vector of sums from above
151.macro output_lag n
152function output_lag\n\()_neon
153        push            {r0, lr}
154.if \n == 1
155        mvn             lr,  r5                   // grain_min = ~grain_max
156.else
157        mov             r0,  #1
158        mov             lr,  #1
159        sub             r7,  r7,  #1
160        sub             r9,  r9,  #1
161        lsl             r0,  r0,  r7
162        lsl             lr,  lr,  r9
163        add             r7,  r7,  #1
164        add             r9,  r9,  #1
165.endif
1661:
167        read_shift_rand r12, 11
168        vmov.32         r11, d2[0]
169        lsl             r12, r12, #1
170        vext.8          q0,  q0,  q0,  #2
171        ldrsh           r12, [r3, r12]
172.if \n == 1
173        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
174        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
175        add             r12, r12, r10
176        asr             r6,  r6,  r7              // >> ar_coeff_shift
177        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
178        add             r6,  r6,  r12
179        cmp             r6,  r5
180.elseif \n == 2
181        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
182        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
183        mov             r8,  r6
184        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
185        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
186        asr             r6,  r6,  r7              // >> ar_coeff_shift
187        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
188        add             r6,  r6,  r12
189        push            {lr}
190        cmp             r6,  r5
191        mvn             lr,  r5                   // grain_min = ~grain_max
192.else
193        push            {r1-r3}
194        sbfx            r1,  r4,  #0,  #8
195        sbfx            r2,  r4,  #8,  #8
196        sbfx            r3,  r4,  #16, #8
197        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
198        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
199        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
200        pop             {r1-r3}
201        mov             r10, r8
202        mov             r8,  r6
203
204        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
205        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
206        asr             r6,  r6,  r7              // >> ar_coeff_shift
207        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
208        add             r6,  r6,  r12
209        push            {lr}
210        cmp             r6,  r5
211        mvn             lr,  r5                   // grain_min = ~grain_max
212.endif
        // Clamp r6 to [grain_min, grain_max] (armv8-safe single-insn it blocks).
213        it              gt
214        movgt           r6,  r5
215        cmp             r6,  lr
216        it              lt
217        movlt           r6,  lr
218.if \n >= 2
219        pop             {lr}
220.endif
221        subs            r1,  r1,  #1
222        vext.8          q1,  q1,  q1,  #4
223        vmov.16         d1[3], r6
224        bgt             1b
225        pop             {r0, pc}
226endfunc
227.endm
228
// Instantiate output_lag1_neon / output_lag2_neon / output_lag3_neon.
229output_lag 1
230output_lag 2
231output_lag 3
232
233
// Lag-1 "above" contribution for 8 pixels: with q8/q9 holding a sliding
// window of the row above (r0 points at the current output position),
// load the next 16 bytes to the upper-right, form the top-left/top-mid/
// top-right neighbors with vext, and accumulate coeff products into the
// 32-bit sums q2 (low 4 lanes) and q3 (high 4 lanes). Coefficients are
// expected widened in d27/d28/d29. Advances the q8/q9 window.
234function sum_lag1_above_neon
235        sub             r12, r0,  #1*GRAIN_WIDTH*2 - 16
236        vld1.16         {q10}, [r12] // load top right
237
238        vext.8          q0,  q8,  q9,  #14 // top left, top mid
239        vext.8          q1,  q9,  q10, #2  // top left, top mid
240
241        vmull.s16       q2,  d18, d28
242        vmlal.s16       q2,  d0,  d27
243        vmlal.s16       q2,  d2,  d29
244        vmull.s16       q3,  d19, d28
245        vmlal.s16       q3,  d1,  d27
246        vmlal.s16       q3,  d3,  d29
247
        // Slide the window: old mid becomes left, newly loaded becomes mid.
248        vmov            q8,  q9
249        vmov            q9,  q10
250
251        bx              lr
252endfunc
253
// Shared body for all sum_*_lag{1,2,3}_{left,mid,right}_neon functions.
// Flow: 1) call the lag-specific "above" helper to fill q2/q3 with the
// 32-bit sums of coeff * above-neighbor products; 2) for uv types, load
// the corresponding luma row(s) from r11 (downsampling 2x2 for 420, 2x1
// for 422) and accumulate the luma coefficient; 3) jump to a shared
// "_start" tail where it structurally matches another variant, otherwise
// emit the tail: run the scalar AR recursion via output_\lag\()_neon over
// \elems entries (with per-edge handling for left/right columns) and
// store the 8 finished pixels to [r0].
254.macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
255.ifc \lag\()_\edge, lag3_left
256        bl              sum_lag3_left_above_neon
257.else
258        bl              sum_\lag\()_above_neon
259.endif
260.ifc \type, uv_420
        // 420: average a 2x2 luma block per chroma pixel (rounding >> 2).
261        vpush           {q6-q7}
262        add             r12, r11, #GRAIN_WIDTH*2
263        vld1.16         {q0, q1}, [r11]!
264        vld1.16         {q6, q7}, [r12]!
265        vpadd.i16       d0,  d0,  d1
266        vpadd.i16       d1,  d2,  d3
267        vpadd.i16       d12, d12, d13
268        vpadd.i16       d13, d14, d15
269        vadd.i16        q0,  q0,  q6
270        vpop            {q6-q7}
271        vrshr.s16       q0,  q0,  #2
272.endif
273.ifc \type, uv_422
        // 422: average horizontal luma pairs (rounding >> 1).
274        vld1.16         {q0, q1}, [r11]!
275        vpadd.i16       d0,  d0,  d1
276        vpadd.i16       d1,  d2,  d3
277        vrshr.s16       q0,  q0,  #1
278.endif
279.ifc \type, uv_444
280        vld1.16         {q0}, [r11]!
281.endif
282.if \uv_layout
283.ifnb \uv_coeff
        // Widen the caller-provided s8 luma coefficient into q6 (d13).
284        vdup.8          d13, \uv_coeff
285        vmovl.s8        q6,  d13
286.endif
287        vmlal.s16       q2,  d0,  d13
288        vmlal.s16       q3,  d1,  d13
289.endif
        // Share the tail with a structurally identical variant where possible.
290.if \uv_layout && \elems == 8
291        b               sum_\lag\()_y_\edge\()_start
292.elseif \uv_layout == 444 && \elems == 7
293        b               sum_\lag\()_y_\edge\()_start
294.elseif \uv_layout == 422 && \elems == 1
295        b               sum_\lag\()_uv_420_\edge\()_start
296.else
297sum_\lag\()_\type\()_\edge\()_start:
298        push            {r11}
299.if \elems > 4
300.ifc \edge, left
        // Left edge: seed the first 3 previous-output values from fresh
        // random grain, then produce only 1 entry from the first batch.
301        increment_seed  4
302        read_rand       r11, 11,  3
303        read_rand       r12, 11,  2
304        add             r11, r3,  r11, lsl #1
305        add             r12, r3,  r12, lsl #1
306        vld1.16         {d1[1]}, [r11]
307        read_rand       r11, 11,  1
308        vld1.16         {d1[2]}, [r12]
309        add             r11, r3,  r11, lsl #1
310        vld1.16         {d1[3]}, [r11]
311        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
312        vrshl.s16       d1,  d1,  d30
313        vext.8          q2,  q2,  q2,  #12
314.ifc \lag, lag3
315        vmov.s16        r10, d1[1]
316.endif
317.ifnc \lag, lag1
318        vmov.s16        r8,  d1[2]
319.endif
320        vmov.s16        r6,  d1[3]
321
322        vmov            q1,  q2
323        mov             r1,  #1
324        bl              output_\lag\()_neon
325.else
326        increment_seed  4, shift=0
327        vmov            q1,  q2
328        mov             r1,  #4
329        bl              output_\lag\()_neon
330.endif
331
332        increment_seed  4, shift=0
333        vmov            q1,  q3
334.ifc \edge, right
        // Right edge: only 3 AR entries; the final pixel is plain grain.
335        mov             r1,  #3
336        bl              output_\lag\()_neon
337        read_shift_rand r12, 11
338        add             r12, r3,  r12, lsl #1
339        vld1.16         {d2[0]}, [r12]
340        vrshl.s16       d2,  d2,  d30
341        vext.8          q0,  q0,  q1,  #2
342.else
343        mov             r1,  #4
344        bl              output_\lag\()_neon
345.endif
346.else
347        // elems == 1
348        increment_seed  4, shift=0
349        vmov            q1,  q2
350        mov             r1,  #1
351        bl              output_\lag\()_neon
352        lsr             r2,  r2,  #3
353
        // Fill the remaining 3 positions with plain (non-AR) grain.
354        read_rand       r11, 11,  2
355        read_rand       r12, 11,  1
356        add             r11, r3,  r11, lsl #1
357        add             r12, r3,  r12, lsl #1
358        vld1.16         {d2[0]}, [r11]
359        read_rand       r11, 11,  0
360        vld1.16         {d2[1]}, [r12]
361        add             r11, r3,  r11, lsl #1
362        vld1.16         {d2[2]}, [r11]
363        vrshl.s16       d2,  d2,  d30
364        vext.8          q0,  q0,  q1,  #14
365.endif
366        vst1.16         {q0}, [r0]!
367        pop             {r11}
368        pop             {r1, pc}
369.endif
370.endm
371
// Emit one sum_<type>_lag1_<edge>_neon function. For the left edge the
// q9 window (row above) is (re)loaded from memory before running the
// shared sum_lag_n_body; \elems < 8 is used for the right-edge variants.
372.macro sum_lag1_func type, uv_layout, edge, elems=8
373function sum_\type\()_lag1_\edge\()_neon
374        push            {r1, lr}
375.ifc \edge, left
376        sub             r12, r0,  #1*GRAIN_WIDTH*2
377        vld1.8          {q9},  [r12] // load the previous block right above
378.endif
379        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
380endfunc
381.endm
382
// Instantiate all lag-1 variants: y/uv_444/uv_422/uv_420 x left/mid/right.
// Right-edge variants produce fewer AR entries (7, or 1 for subsampled uv).
383sum_lag1_func y,      0,   left
384sum_lag1_func y,      0,   mid
385sum_lag1_func y,      0,   right, 7
386sum_lag1_func uv_444, 444, left
387sum_lag1_func uv_444, 444, mid
388sum_lag1_func uv_444, 444, right, 7
389sum_lag1_func uv_422, 422, left
390sum_lag1_func uv_422, 422, mid
391sum_lag1_func uv_422, 422, right, 1
392sum_lag1_func uv_420, 420, left
393sum_lag1_func uv_420, 420, mid
394sum_lag1_func uv_420, 420, right, 1
395
396
// Lag-2 "above" contribution for 8 pixels: q8/q9 and q11/q12 hold sliding
// windows of the two rows above; the right-hand continuations (q10, q13)
// are reloaded from memory. The packed s8 AR coefficients live in d28/d29
// and are widened on the fly (vdup + vmovl) into d10/d12/d14/d8 before
// each group of multiply-accumulates into the 32-bit sums q2/q3.
// Advances both row windows. Clobbers q0/q1/q4-q7, r12.
397function sum_lag2_above_neon
398        push            {lr}
399        sub             r12, r0,  #2*GRAIN_WIDTH*2 - 16
400        sub             lr,  r0,  #1*GRAIN_WIDTH*2 - 16
401        vld1.16         {q10}, [r12] // load top right
402        vld1.16         {q13}, [lr]
403
404        vdup.8          d10, d28[0]
405        vext.8          q0,  q8,  q9,  #12 // top left, top mid
406        vdup.8          d12, d28[1]
407        vext.8          q1,  q8,  q9,  #14
408        vdup.8          d14, d28[3]
409        vext.8          q4,  q9,  q10, #2  // top mid, top right
410        vmovl.s8        q5,  d10
411        vmovl.s8        q6,  d12
412        vmovl.s8        q7,  d14
413
414        vmull.s16       q2,  d0,  d10
415        vmlal.s16       q2,  d2,  d12
416        vmlal.s16       q2,  d8,  d14
417        vmull.s16       q3,  d1,  d10
418        vmlal.s16       q3,  d3,  d12
419        vmlal.s16       q3,  d9,  d14
420
421        vdup.8          d10, d28[4]
422        vext.8          q0,  q9,  q10, #4  // top mid, top right
423        vdup.8          d12, d28[5]
424        vext.8          q1,  q11, q12, #12 // top left, top mid
425        vdup.8          d14, d28[6]
426        vext.8          q4,  q11, q12, #14
427        vmovl.s8        q5,  d10
428        vmovl.s8        q6,  d12
429        vmovl.s8        q7,  d14
430
431        vmlal.s16       q2,  d0,  d10
432        vmlal.s16       q2,  d2,  d12
433        vmlal.s16       q2,  d8,  d14
434        vmlal.s16       q3,  d1,  d10
435        vmlal.s16       q3,  d3,  d12
436        vmlal.s16       q3,  d9,  d14
437
438        vdup.8          d10, d29[0]
439        vext.8          q0,  q12, q13, #2  // top mid, top right
440        vdup.8          d12, d29[1]
441        vext.8          q1,  q12, q13, #4
442
443        vdup.8          d14, d28[2]
444        vdup.8          d8,  d28[7]
445
446        vmovl.s8        q5,  d10
447        vmovl.s8        q6,  d12
448        vmovl.s8        q7,  d14
449        vmovl.s8        q4,  d8
450
451        vmlal.s16       q2,  d0,  d10
452        vmlal.s16       q2,  d2,  d12
453        vmlal.s16       q2,  d18, d14
454        vmlal.s16       q2,  d24, d8
455        vmlal.s16       q3,  d1,  d10
456        vmlal.s16       q3,  d3,  d12
457        vmlal.s16       q3,  d19, d14
458        vmlal.s16       q3,  d25, d8
459
        // Slide both row windows one block to the right.
460        vmov            q8,  q9
461        vmov            q9,  q10
462
463        vmov            q11, q12
464        vmov            q12, q13
465
466        pop             {pc}
467endfunc
468
// Emit one sum_<type>_lag2_<edge>_neon function. For the left edge both
// row-above windows (q9, q12) are (re)loaded before running the shared
// body; the packed luma coefficient for uv types sits in d29[4].
469.macro sum_lag2_func type, uv_layout, edge, elems=8
470function sum_\type\()_lag2_\edge\()_neon
471        push            {r1, lr}
472.ifc \edge, left
473        sub             r12, r0,  #2*GRAIN_WIDTH*2
474        sub             lr,  r0,  #1*GRAIN_WIDTH*2
475        vld1.16         {q9},  [r12] // load the previous block right above
476        vld1.16         {q12}, [lr]
477.endif
478        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
479endfunc
480.endm
481
// Instantiate all lag-2 variants: y/uv_444/uv_422/uv_420 x left/mid/right.
482sum_lag2_func y,      0,   left
483sum_lag2_func y,      0,   mid
484sum_lag2_func y,      0,   right, 7
485sum_lag2_func uv_444, 444, left
486sum_lag2_func uv_444, 444, mid
487sum_lag2_func uv_444, 444, right, 7
488sum_lag2_func uv_422, 422, left
489sum_lag2_func uv_422, 422, mid
490sum_lag2_func uv_422, 422, right, 1
491sum_lag2_func uv_420, 420, left
492sum_lag2_func uv_420, 420, mid
493sum_lag2_func uv_420, 420, right, 1
494
495
// Lag-3 "above" entry for the left edge: load from the row start (no
// columns to the left exist) and shuffle q11/q12 so they line up with the
// layout sum_lag3_above_start expects, duplicating the edge data instead
// of reading before the start of the buffer.
496function sum_lag3_left_above_neon
497        // A separate codepath for the left edge, to avoid reading outside
498        // of the edge of the buffer.
499        sub             r12, r0,  #3*GRAIN_WIDTH*2
500        vld1.8          {q11, q12}, [r12]
501        vext.8          q12, q11, q12, #10
502        vext.8          q11, q11, q11, #10
503        b               sum_lag3_above_start
504endfunc
505
// Lag-3 "above" contribution for 8 pixels: walks the three rows above
// (rows -3, -2, -1, each read starting 3 columns to the left of the
// current position), reloading q11/q12 per row. The 24 packed s8 AR
// coefficients in d26/d27/d28 are widened four at a time (vdup + vmovl)
// and multiplied against the 7 shifted neighbor positions of each row,
// accumulating into the 32-bit sums q2 (low lanes) and q3 (high lanes).
// Clobbers q0/q1/q4-q9, r12. Returns with bx lr.
506function sum_lag3_above_neon
507        movw            r12, #(3*GRAIN_WIDTH + 3)*2
508        sub             r12, r0,  r12
509        vld1.8          {q11, q12}, [r12]
510
511sum_lag3_above_start:
        // Row -3, columns -3..0 (coefficients d26[0..3]).
512        vdup.8          d12, d26[0]
513        vext.8          q1,  q11, q12, #2
514        vdup.8          d14, d26[1]
515        vext.8          q4,  q11, q12, #4
516        vdup.8          d16, d26[2]
517        vext.8          q5,  q11, q12, #6
518        vdup.8          d18, d26[3]
519        vmovl.s8        q6,  d12
520        vmovl.s8        q7,  d14
521        vmovl.s8        q8,  d16
522        vmovl.s8        q9,  d18
523
524        movw            r12, #(2*GRAIN_WIDTH + 3)*2
525        sub             r12, r0,  r12
526
527        vmull.s16       q2,  d22, d12
528        vmlal.s16       q2,  d2,  d14
529        vmlal.s16       q2,  d8,  d16
530        vmlal.s16       q2,  d10, d18
531        vmull.s16       q3,  d23, d12
532        vmlal.s16       q3,  d3,  d14
533        vmlal.s16       q3,  d9,  d16
534        vmlal.s16       q3,  d11, d18
535
        // Row -3, columns +1..+3, then load row -2 (coefficients d26[4..7]).
536        vdup.8          d12, d26[4]
537        vext.8          q0,  q11, q12, #8
538        vdup.8          d14, d26[5]
539        vext.8          q1,  q11, q12, #10
540        vdup.8          d16, d26[6]
541        vext.8          q4,  q11, q12, #12
542        vld1.8          {q11, q12}, [r12]
543        vdup.8          d18, d26[7]
544        vmovl.s8        q6,  d12
545        vmovl.s8        q7,  d14
546        vmovl.s8        q8,  d16
547        vmovl.s8        q9,  d18
548
549        vmlal.s16       q2,  d0,  d12
550        vmlal.s16       q2,  d2,  d14
551        vmlal.s16       q2,  d8,  d16
552        vmlal.s16       q2,  d22, d18
553        vmlal.s16       q3,  d1,  d12
554        vmlal.s16       q3,  d3,  d14
555        vmlal.s16       q3,  d9,  d16
556        vmlal.s16       q3,  d23, d18
557
        // Row -2, columns -2..+1 (coefficients d27[0..3]).
558        vdup.8          d12, d27[0]
559        vext.8          q0,  q11, q12, #2
560        vdup.8          d14, d27[1]
561        vext.8          q1,  q11, q12, #4
562        vdup.8          d16, d27[2]
563        vext.8          q4,  q11, q12, #6
564        vdup.8          d18, d27[3]
565        vext.8          q5,  q11, q12, #8
566        vmovl.s8        q6,  d12
567        vmovl.s8        q7,  d14
568        vmovl.s8        q8,  d16
569        vmovl.s8        q9,  d18
570
571        sub             r12, r0,  #(1*GRAIN_WIDTH + 3)*2
572
573        vmlal.s16       q2,  d0,  d12
574        vmlal.s16       q2,  d2,  d14
575        vmlal.s16       q2,  d8,  d16
576        vmlal.s16       q2,  d10, d18
577        vmlal.s16       q3,  d1,  d12
578        vmlal.s16       q3,  d3,  d14
579        vmlal.s16       q3,  d9,  d16
580        vmlal.s16       q3,  d11, d18
581
        // Row -2, columns +2..+3, then load row -1 (coefficients d27[4..7]).
582        vdup.8          d12, d27[4]
583        vext.8          q0,  q11, q12, #10
584        vdup.8          d14, d27[5]
585        vext.8          q1,  q11, q12, #12
586        vld1.8          {q11, q12}, [r12]
587        vdup.8          d16, d27[6]
588        vdup.8          d18, d27[7]
589        vmovl.s8        q6,  d12
590        vmovl.s8        q7,  d14
591        vext.8          q5,  q11, q12, #2
592        vmovl.s8        q8,  d16
593        vmovl.s8        q9,  d18
594
595        vmlal.s16       q2,  d0,  d12
596        vmlal.s16       q2,  d2,  d14
597        vmlal.s16       q2,  d22, d16
598        vmlal.s16       q2,  d10, d18
599        vmlal.s16       q3,  d1,  d12
600        vmlal.s16       q3,  d3,  d14
601        vmlal.s16       q3,  d23, d16
602        vmlal.s16       q3,  d11, d18
603
        // Row -1, columns -1..+2 (coefficients d28[0..3]).
604        vdup.8          d12, d28[0]
605        vext.8          q0,  q11, q12, #4
606        vdup.8          d14, d28[1]
607        vext.8          q1,  q11, q12, #6
608        vdup.8          d16, d28[2]
609        vext.8          q4,  q11, q12, #8
610        vdup.8          d18, d28[3]
611        vext.8          q5,  q11, q12, #10
612        vmovl.s8        q6,  d12
613        vmovl.s8        q7,  d14
614        vmovl.s8        q8,  d16
615        vmovl.s8        q9,  d18
616
617        vmlal.s16       q2,  d0,  d12
618        vmlal.s16       q2,  d2,  d14
619        vmlal.s16       q2,  d8,  d16
620        vmlal.s16       q2,  d10, d18
621        vmlal.s16       q3,  d1,  d12
622        vmlal.s16       q3,  d3,  d14
623        vmlal.s16       q3,  d9,  d16
624        vmlal.s16       q3,  d11, d18
625
        // Row -1, final column +3 (coefficient d28[4]).
626        vdup.8          d12, d28[4]
627        vext.8          q0,  q11, q12, #12
628        vmovl.s8        q6,  d12
629
630        vmlal.s16       q2,  d0,  d12
631        vmlal.s16       q3,  d1,  d12
632
633        bx              lr
endfunc
635
// Emit one sum_<type>_lag3_<edge>_neon function; no window preload is
// needed here since the lag-3 above helpers load their rows themselves.
// The packed luma coefficient for uv types sits in d29[0].
636.macro sum_lag3_func type, uv_layout, edge, elems=8
637function sum_\type\()_lag3_\edge\()_neon
638        push            {r1, lr}
639        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
640endfunc
641.endm
642
// Instantiate all lag-3 variants: y/uv_444/uv_422/uv_420 x left/mid/right.
643sum_lag3_func y,      0,   left
644sum_lag3_func y,      0,   mid
645sum_lag3_func y,      0,   right, 7
646sum_lag3_func uv_444, 444, left
647sum_lag3_func uv_444, 444, mid
648sum_lag3_func uv_444, 444, right, 7
649sum_lag3_func uv_422, 422, left
650sum_lag3_func uv_422, 422, mid
651sum_lag3_func uv_422, 422, right, 1
652sum_lag3_func uv_420, 420, left
653sum_lag3_func uv_420, 420, mid
654sum_lag3_func uv_420, 420, right, 1
655
// Fill r1 rows of GRAIN_WIDTH (82) int16 grain values at [r0]: 80 values
// in batches of 8 via get_gaussian_neon (rounded/shifted by q15), plus 2
// final values per row via get_grain_2. r2/r3 carry the seed/table as in
// the helpers above.
656function generate_grain_rows_neon
657        push            {r10-r11,lr}
6581:
659        mov             r10, #80
6602:
661        bl              get_gaussian_neon
662        vrshl.s16       q0,  q0,  q15
663        subs            r10, r10, #8
664        vst1.16         {q0}, [r0]!
665        bgt             2b
666        get_grain_2     d0
667        subs            r1,  r1,  #1
668        vst1.32         {d0[0]}, [r0]!
669        bgt             1b
670        pop             {r10-r11,pc}
671endfunc
672
// Same as generate_grain_rows_neon but for the 44-wide subsampled grain:
// 40 values in batches of 8, then 4 more via get_grain_4, then skip the
// rest of the GRAIN_WIDTH*2-byte row stride.
673function generate_grain_rows_44_neon
674        push            {r10-r11,lr}
6751:
676        mov             r10, #40
6772:
678        bl              get_gaussian_neon
679        vrshl.s16       q0,  q0,  q15
680        subs            r10, r10, #8
681        vst1.16         {q0}, [r0]!
682        bgt             2b
683        get_grain_4     d0
684        subs            r1,  r1,  #1
685        vst1.16         {d0}, [r0]
686        add             r0,  r0,  #GRAIN_WIDTH*2-80
687        bgt             1b
688        pop             {r10-r11,pc}
689endfunc
690
// Lag-0 chroma: load 8 luma grain values from [r11] into q3, mask them
// with q1 (edge mask for the first/last block of a row), scale by the
// widened ar_coeffs_uv[0] in d22, apply the rounding shift q12 (negated
// ar_coeff_shift), saturating-narrow back to s16, add fresh gaussian
// noise, and clamp to [grain_min, grain_max] held in q10/q9 before
// storing. Entry points below are shared by the 420/422 variants.
691function gen_grain_uv_444_lag0_neon
692        vld1.16         {q3}, [r11]!
693gen_grain_uv_lag0_8_start:
694        push            {r11,lr}
695        bl              get_gaussian_neon
696        vrshl.s16       q0,  q0,  q15
697gen_grain_uv_lag0_8_add:
698        vand            q3,  q3,  q1
699        vmull.s16       q2,  d6,  d22
700        vmull.s16       q3,  d7,  d22
701        vrshl.s32       q2,  q2,  q12
702        vrshl.s32       q3,  q3,  q12
703        vqmovn.s32      d4,  q2
704        vqmovn.s32      d5,  q3
705        vqadd.s16       q2,  q2,  q0
706        vmin.s16        q2,  q2,  q9
707        vmax.s16        q2,  q2,  q10
708        vst1.16         {q2}, [r0]!
709        pop             {r11,pc}
710endfunc
711
// Lag-0 chroma, 420 layout, 8 outputs: average 2x2 luma blocks from two
// rows at [r11] (rounding >> 2) into q3, then continue through the shared
// gen_grain_uv_lag0_8_start path above.
712function gen_grain_uv_420_lag0_8_neon
713        add             r12, r11, #GRAIN_WIDTH*2
714        vld1.16         {q2,q3}, [r11]!
715        vld1.16         {q4,q5}, [r12]
716        vpadd.i16       d4,  d4,  d5
717        vpadd.i16       d5,  d6,  d7
718        vpadd.i16       d8,  d8,  d9
719        vpadd.i16       d9,  d10, d11
720        vadd.i16        q2,  q2,  q4
721        vrshr.s16       q3,  q2,  #2
722        b               gen_grain_uv_lag0_8_start
723endfunc
724
// Lag-0 chroma, 422 layout, 8 outputs: average horizontal luma pairs
// (rounding >> 1) into q3, then continue via the shared path.
725function gen_grain_uv_422_lag0_8_neon
726        vld1.16         {q2,q3}, [r11]!
727        vpadd.i16       d4,  d4,  d5
728        vpadd.i16       d5,  d6,  d7
729        vrshr.s16       q3,  q2,  #1
730        b               gen_grain_uv_lag0_8_start
731endfunc
732
// Lag-0 chroma, 420 layout, 4-output row tail: downsample a 2x2 luma
// block into d6, fetch 4 noise values, and jump straight to the add/
// clamp/store tail (gen_grain_uv_lag0_8_add).
733function gen_grain_uv_420_lag0_4_neon
734        add             r12, r11, #GRAIN_WIDTH*2
735        vld1.16         {q2}, [r11]
736        vld1.16         {q0}, [r12]
737        add             r11, r11, #32
738        vpadd.i16       d4,  d4,  d5
739        vpadd.i16       d0,  d0,  d1
740        vadd.i16        d4,  d4,  d0
741        vrshr.s16       d6,  d4,  #2
742        push            {r11,lr}
743        get_grain_4     d0
744        b               gen_grain_uv_lag0_8_add
745endfunc
746
// Lag-0 chroma, 422 layout, 4-output row tail: horizontal pair average
// into d6, fetch 4 noise values, jump to the shared add/clamp/store tail.
747function gen_grain_uv_422_lag0_4_neon
748        vld1.16         {q2}, [r11]
749        add             r11, r11, #32
750        vpadd.i16       d4,  d4,  d5
751        vrshr.s16       d6,  d4,  #1
752        push            {r11,lr}
753        get_grain_4     d0
754        b               gen_grain_uv_lag0_8_add
755endfunc
756
757.macro gen_grain_82 type
758function generate_grain_\type\()_16bpc_neon, export=1
759        push            {r4-r11,lr}
760
761.ifc \type, uv_444
762        ldr             r4,  [sp, #36]
763        mov             r12, r3
764        mov             lr,  #28
765        add             r11, r1,  #3*GRAIN_WIDTH*2
766        mov             r1,  r2
767        mul             r12, r12, lr
768        clz             lr,  r4
769.else
770        clz             lr,  r2
771.endif
772        movrel          r3,  X(gaussian_sequence)
773        sub             lr,  lr,  #24 // -bitdepth_min_8
774        ldr             r2,  [r1, #FGD_SEED]
775        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
776.ifc \type, y
777        add             r4,  r1,  #FGD_AR_COEFFS_Y
778.else
779        add             r4,  r1,  #FGD_AR_COEFFS_UV
780.endif
781        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
782        adr             r5,  L(gen_grain_\type\()_tbl)
783        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
784        add             r9,  r9,  #4
785        ldr             r6,  [r5, r6, lsl #2]
786        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
787        add             r5,  r5,  r6
788        vneg.s16        q15, q15
789
790.ifc \type, uv_444
791        push            {lr}
792        cmp             r12, #0
793        movw            r10, #0x49d8
794        movw            lr,  #0xb524
795        // Intentionally using a separate register instead of moveq with an
796        // immediate constant, to avoid armv8 deprecated it instruction forms.
797        it              eq
798        moveq           r10, lr
799        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
800        eor             r2,  r2,  r10
801        pop             {lr}
802.endif
803
804        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
805        neg             lr,  lr             // bitdepth_min_8
806        mov             r8,  #1
807        mov             r10, #1
808        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
809        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
810        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
811        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
812
813        bx              r5
814
815        .align 2
816L(gen_grain_\type\()_tbl):
817        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
818        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
819        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
820        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
821
822L(generate_grain_\type\()_lag0):
823.ifc \type, y
824        mov             r1,  #GRAIN_HEIGHT
825        bl              generate_grain_rows_neon
826.else
827        mov             r5,  #128
828        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
829        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
830        mvn             r6,  r5             // grain_min = ~grain_max
831
832        mov             r1,  #3
833        bl              generate_grain_rows_neon
834        mov             r1,  #GRAIN_HEIGHT-3
835
836        vdup.32         q12, r7
837        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
838        vmov.i8         q0,  #0
839        vmov.i8         q1,  #255
840        vdup.16         q9,  r5
841        vdup.16         q10, r6
842        vext.8          q13, q0,  q1,  #10
843        vext.8          q14, q1,  q0,  #2
844        vneg.s32        q12, q12
845        vmovl.s8        q11, d22
846
8471:
848        vmov            q1,  q13
849        bl              gen_grain_uv_444_lag0_neon // 8
850        vmov.i8         q1,  #255
851        bl              gen_grain_uv_444_lag0_neon // 16
852        bl              gen_grain_uv_444_lag0_neon // 24
853        bl              gen_grain_uv_444_lag0_neon // 32
854        bl              gen_grain_uv_444_lag0_neon // 40
855        bl              gen_grain_uv_444_lag0_neon // 48
856        bl              gen_grain_uv_444_lag0_neon // 56
857        bl              gen_grain_uv_444_lag0_neon // 64
858        bl              gen_grain_uv_444_lag0_neon // 72
859        vmov            q1,  q14
860        bl              gen_grain_uv_444_lag0_neon // 80
861        get_grain_2     d16
862        subs            r1,  r1,  #1
863        add             r11, r11, #4
864        vst1.32         {d16[0]}, [r0]!
865        bgt             1b
866.endif
867        pop             {r4-r11,pc}
868
869L(generate_grain_\type\()_lag1):
870        vpush           {q4-q7}
871        mov             r5,  #128
872        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
873        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
874        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
875        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
876        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
877.ifc \type, y
878        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
879.else
880        add             r4,  r4,  #2
881.endif
882
883        mov             r1,  #3
884.ifc \type, uv_444
885        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
886        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
887.endif
888        bl              generate_grain_rows_neon
889        vmovl.s8        q13, d27
890        vmovl.s8        q12, d29
891        vmovl.s8        q14, d28
892        vmov            d29, d24
893.ifc \type, uv_444
894        vmovl.s8        q6,  d13
895.endif
896
897        mov             r1,  #GRAIN_HEIGHT - 3
8981:
899        bl              sum_\type\()_lag1_left_neon  // 8
900        bl              sum_\type\()_lag1_mid_neon   // 16
901        bl              sum_\type\()_lag1_mid_neon   // 24
902        bl              sum_\type\()_lag1_mid_neon   // 32
903        bl              sum_\type\()_lag1_mid_neon   // 40
904        bl              sum_\type\()_lag1_mid_neon   // 48
905        bl              sum_\type\()_lag1_mid_neon   // 56
906        bl              sum_\type\()_lag1_mid_neon   // 64
907        bl              sum_\type\()_lag1_mid_neon   // 72
908        bl              sum_\type\()_lag1_right_neon // 80
909        get_grain_2     d16
910        subs            r1,  r1,  #1
911.ifc \type, uv_444
912        add             r11, r11, #4
913.endif
914        vst1.32         {d16[0]}, [r0]!
915        bgt             1b
916
917        vpop            {q4-q7}
918        pop             {r4-r11,pc}
919
920L(generate_grain_\type\()_lag2):
921        vpush           {q4-q7}
922        mov             r5,  #128
923        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
924        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
925        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
926
927        vmov.s8         r4,  d29[2]
928        vmov.s8         r10, d29[3]
929
930        mov             r1,  #3
931        bl              generate_grain_rows_neon
932
933        mov             r1,  #GRAIN_HEIGHT - 3
9341:
935        bl              sum_\type\()_lag2_left_neon  // 8
936        bl              sum_\type\()_lag2_mid_neon   // 16
937        bl              sum_\type\()_lag2_mid_neon   // 24
938        bl              sum_\type\()_lag2_mid_neon   // 32
939        bl              sum_\type\()_lag2_mid_neon   // 40
940        bl              sum_\type\()_lag2_mid_neon   // 48
941        bl              sum_\type\()_lag2_mid_neon   // 56
942        bl              sum_\type\()_lag2_mid_neon   // 64
943        bl              sum_\type\()_lag2_mid_neon   // 72
944        bl              sum_\type\()_lag2_right_neon // 80
945        get_grain_2     d16
946        subs            r1,  r1,  #1
947.ifc \type, uv_444
948        add             r11, r11, #4
949.endif
950        vst1.32         {d16[0]}, [r0]!
951        bgt             1b
952
953        vpop            {q4-q7}
954        pop             {r4-r11,pc}
955
956L(generate_grain_\type\()_lag3):
957        vpush           {q4-q7}
958        mov             r5,  #128
959        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
960        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
961        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
962
963        vmov.u8         r4,  d28[5]
964        vmov.u8         r10, d28[6]
965        vmov.u8         r12, d28[7]
966
967        orr             r4,  r4,  r10, lsl #8
968        orr             r4,  r4,  r12, lsl #16
969
970        mov             r1,  #3
971        vpush           {d26}
972        bl              generate_grain_rows_neon
973        vpop            {d26}
974
975        mov             r1,  #GRAIN_HEIGHT - 3
9761:
977        bl              sum_\type\()_lag3_left_neon  // 8
978        bl              sum_\type\()_lag3_mid_neon   // 16
979        bl              sum_\type\()_lag3_mid_neon   // 24
980        bl              sum_\type\()_lag3_mid_neon   // 32
981        bl              sum_\type\()_lag3_mid_neon   // 40
982        bl              sum_\type\()_lag3_mid_neon   // 48
983        bl              sum_\type\()_lag3_mid_neon   // 56
984        bl              sum_\type\()_lag3_mid_neon   // 64
985        bl              sum_\type\()_lag3_mid_neon   // 72
986        bl              sum_\type\()_lag3_right_neon // 80
987        get_grain_2     d16
988        subs            r1,  r1,  #1
989.ifc \type, uv_444
990        add             r11, r11, #4
991.endif
992        vst1.32         {d16[0]}, [r0]!
993        bgt             1b
994
995        vpop            {q4-q7}
996        pop             {r4-r11,pc}
997endfunc
998.endm
999
// Instantiate the 82-wide (GRAIN_WIDTH x GRAIN_HEIGHT) grain generators
// for luma and for 4:4:4 chroma.
gen_grain_82 y
gen_grain_82 uv_444
1002
// Load into \dst the number of grain rows left to generate after the
// three seeded rows: the vertically subsampled 4:2:0 layout uses the
// sub-grain image height, everything else the full grain height.
.macro set_height dst, type
.ifnc \type, uv_420
        mov             \dst,  #GRAIN_HEIGHT-3
.else
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.endif
.endm
1010
// Advance the luma-grain read pointer \reg at the end of one generated
// chroma row.  Assuming the six per-row helper calls advanced it by
// 6*32 bytes, the net effect is +2 luma rows (2*GRAIN_WIDTH*2 bytes) for
// uv_420 (vertical subsampling) and +1 luma row (GRAIN_WIDTH*2 bytes)
// for uv_422.
.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
.else
        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
.endif
.endm
1018
// Generate one 44-wide sub-sampled chroma grain image, 16 bpc.
// Instantiated per layout (uv_420, uv_422); the AR filtering itself lives
// in the shared sum_* / gen_grain_* helpers (defined elsewhere in this
// file).
//
// Register setup (inferred from the loads below — confirm against the C
// declaration): r0 = output grain buf, r1 = luma grain buf_y, r2 = data
// (Dav1dFilmGrainData), r3 = uv plane index, [sp, #36] = bitdepth_max.
.macro gen_grain_44 type
function generate_grain_\type\()_16bpc_neon, export=1
        push            {r4-r11,lr}

        // r4 = bitdepth_max, r12 = uv * 28 (byte offset into the per-plane
        // AR-coefficient storage), r11 = buf_y + (3*GRAIN_WIDTH-3)*2
        // (luma grain read pointer), r1 = data, lr = clz(bitdepth_max)
        ldr             r4,  [sp, #36]
        mov             r12, r3
        movw            r11, #(3*GRAIN_WIDTH-3)*2
        mov             lr,  #28
        add             r11, r1,  r11
        mov             r1,  r2
        mul             r12, r12, lr
        clz             lr,  r4

        movrel          r3,  X(gaussian_sequence)
        sub             lr,  lr,  #24 // -bitdepth_min_8
        ldr             r2,  [r1, #FGD_SEED]
        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
        add             r4,  r1,  #FGD_AR_COEFFS_UV
        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
        adr             r5,  L(gen_grain_\type\()_tbl)
        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
        add             r9,  r9,  #4
        ldr             r6,  [r5, r6, lsl #2]
        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
        add             r5,  r5,  r6
        vneg.s16        q15, q15

        // seed = data->seed ^ (uv ? 0x49d8 : 0xb524)
        push            {lr}
        cmp             r12, #0
        movw            r10, #0x49d8
        movw            lr,  #0xb524
        // Intentionally using a separate register instead of moveq with an
        // immediate constant, to avoid armv8 deprecated it instruction forms.
        it              eq
        moveq           r10, lr
        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
        eor             r2,  r2,  r10
        pop             {lr}

        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
        neg             lr,  lr             // bitdepth_min_8
        mov             r8,  #1
        mov             r10, #1
        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
        // Dispatch on data->ar_coeff_lag via the offset table below.
        bx              r5

        .align 2
L(gen_grain_\type\()_tbl):
        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB

// AR lag 0: only ar_coeffs_uv[0] is loaded (widened into q11); q9/q10
// hold grain_max/grain_min for the lag0 helpers.
L(generate_grain_\type\()_lag0):
.ifc \type, uv_420
        vpush           {q4-q5}
.endif
        mov             r5,  #128
        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
        mvn             r6,  r5             // grain_min = ~grain_max

        mov             r1,  #3
        bl              generate_grain_rows_44_neon
        set_height      r1,  \type

        vdup.32         q12, r7
        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
        vmov.i8         q0,  #0
        vmov.i8         q1,  #255
        vdup.16         q9,  r5
        vdup.16         q10, r6
        // q13/q14 appear to be lane masks for the partial first/last
        // column groups, handed to the helpers via q1.
        vext.8          q13, q0,  q1,  #10
        vext.8          q14, q1,  q0,  #14
        vneg.s32        q12, q12
        vmovl.s8        q11, d22

1:
        // One row: five groups of 8 columns plus a final group of 4 (= 44).
        vmov            q1,  q13
        bl              gen_grain_\type\()_lag0_8_neon // 8
        vmov.i8         q1,  #255
        bl              gen_grain_\type\()_lag0_8_neon // 16
        bl              gen_grain_\type\()_lag0_8_neon // 24
        bl              gen_grain_\type\()_lag0_8_neon // 32
        bl              gen_grain_\type\()_lag0_8_neon // 40
        vmov            q1,  q14
        bl              gen_grain_\type\()_lag0_4_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

.ifc \type, uv_420
        vpop            {q4-q5}
.endif
        pop             {r4-r11,pc}

// AR lag 1: ar_coeffs_uv[0..2] widened into q13/q14/q12 (d29 = d24),
// ar_coeffs_uv[4] widened into q6, ar_coeffs_uv[3] as a scalar in r4.
L(generate_grain_\type\()_lag1):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
        add             r4,  r4,  #2

        mov             r1,  #3
        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon
        vmovl.s8        q13, d27
        vmovl.s8        q12, d29
        vmovl.s8        q14, d28
        vmov            d29, d24
        vmovl.s8        q6,  d13

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag1_left_neon  // 8
        bl              sum_\type\()_lag1_mid_neon   // 16
        bl              sum_\type\()_lag1_mid_neon   // 24
        bl              sum_\type\()_lag1_mid_neon   // 32
        bl              sum_\type\()_lag1_mid_neon   // 40
        bl              sum_\type\()_lag1_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

// AR lag 2: ar_coeffs_uv[0-12] kept as bytes in q14; coefficients [10]
// and [11] (d29 lanes 2/3) also sign-extended into r4/r10 as scalars.
L(generate_grain_\type\()_lag2):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]

        vmov.s8         r4,  d29[2]
        vmov.s8         r10, d29[3]

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag2_left_neon  // 8
        bl              sum_\type\()_lag2_mid_neon   // 16
        bl              sum_\type\()_lag2_mid_neon   // 24
        bl              sum_\type\()_lag2_mid_neon   // 32
        bl              sum_\type\()_lag2_mid_neon   // 40
        bl              sum_\type\()_lag2_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}

// AR lag 3: ar_coeffs_uv[0-24] kept as bytes in q13/q14; coefficients
// [21..23] (d28 lanes 5-7) packed into r4 as three little-endian bytes.
L(generate_grain_\type\()_lag3):
        vpush           {q4-q7}
        mov             r5,  #128
        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]

        vmov.u8         r4,  d28[5]
        vmov.u8         r10, d28[6]
        vmov.u8         r12, d28[7]

        orr             r4,  r4,  r10, lsl #8
        orr             r4,  r4,  r12, lsl #16

        mov             r1,  #3
        bl              generate_grain_rows_44_neon

        set_height      r1,  \type
1:
        bl              sum_\type\()_lag3_left_neon  // 8
        bl              sum_\type\()_lag3_mid_neon   // 16
        bl              sum_\type\()_lag3_mid_neon   // 24
        bl              sum_\type\()_lag3_mid_neon   // 32
        bl              sum_\type\()_lag3_mid_neon   // 40
        bl              sum_\type\()_lag3_right_neon // 44
        subs            r1,  r1,  #1
        increment_y_ptr r11, \type
        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
        bgt             1b

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
.endm
1218
// Instantiate the 44-wide grain generators for the subsampled chroma
// layouts.
gen_grain_44 uv_420
gen_grain_44 uv_422
1221
// Scaling-LUT gather for 16 pixels: each 16-bit value in \src1/\src2 and
// \src3/\src4 (lanes \off and 2+\off) is used as a byte offset from the
// LUT base in r3, and the loaded byte is inserted into the corresponding
// lane of \dst1 (fed from \src1/\src2) or \dst2 (fed from \src3/\src4).
// The scalar lane extracts, address adds and single-lane loads are
// interleaved to hide the NEON->ARM transfer and load latencies.
// Clobbers r11, r12, lr.
.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
        vmov.u16        r11, \src1[0+\off]
        vmov.u16        r12, \src3[0+\off]
        add             r11, r11, r3
        vmov.u16        lr,  \src1[2+\off]
        add             r12, r12, r3
        vld1.8          {\dst1[0+\off]}, [r11]
        vmov.u16        r11, \src3[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst2[0+\off]}, [r12]
        vmov.u16        r12, \src2[0+\off]
        add             r11, r11, r3
        vld1.8          {\dst1[2+\off]}, [lr]
        vmov.u16        lr,  \src4[0+\off]
        add             r12, r12, r3
        vld1.8          {\dst2[2+\off]}, [r11]
        vmov.u16        r11, \src2[2+\off]
        add             lr,  lr,  r3
        vld1.8          {\dst1[4+\off]}, [r12]
        vmov.u16        r12, \src4[2+\off]
        add             r11, r11, r3
        vld1.8          {\dst2[4+\off]}, [lr]
        add             r12, r12, r3
        vld1.8          {\dst1[6+\off]}, [r11]
        vld1.8          {\dst2[6+\off]}, [r12]
.endm
1248
// Scaling-LUT gather for 32 pixels: \dst1 is filled from \src1/\src2,
// \dst2 from \src3/\src4, \dst3 from \src5/\src6 and \dst4 from
// \src7/\src8.  The two 16-pixel halves and the even/odd lane passes are
// fully independent (disjoint destination lanes, same r11/r12/lr
// scratch), so their order is arbitrary.
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
.endm
1255
// d8-d11 = scaling[x] for the 32 pixel values held in q0-q3 (d0-d7);
// r3 = scaling LUT base.  Scratch r11/r12/lr are saved and restored here.
function gather32_neon
        push            {r11-r12,lr}
        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7
        pop             {r11-r12,pc}
endfunc
1261
// d8-d9 = scaling[x] for the 16 pixel values held in q0-q1 (d0-d3);
// r3 = scaling LUT base.  Scratch r11/r12/lr are saved and restored here.
function gather16_neon
        push            {r11-r12,lr}
        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  0
        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  1
        pop             {r11-r12,pc}
endfunc
1268
// Overlap blend weights for an unsubsampled dimension: the first row
// scales the old (neighbouring) grain, the second row the current grain
// (see the d24/d25 usage in the fgy loop).
const overlap_coeffs_0, align=4
        .short 27, 17, 0,  0
        .short 17, 27, 32, 32
endconst
1273
// Overlap blend weights for a subsampled dimension (sx == 1), same
// old-grain/current-grain row layout as overlap_coeffs_0.
const overlap_coeffs_1, align=4
        .short 23, 0,  0,  0
        .short 22, 32, 32, 32
endconst
1278
// Split a grain random value \src into grain_lut offsets:
//   \offy = \src & 0xF, \offx = \src >> 4,
// each doubled when the corresponding dimension is not subsampled.
// NOTE: callers may pass the same register for \offx and \src, so \offy
// must be extracted before \offx overwrites \src.
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF     // randval & 0xF
        lsr             \offx, \src,  #4       // randval >> 4
.if \sx == 0
        lsl             \offx, \offx, #1       // 2 * (randval >> 4)
.endif
.if \sy == 0
        lsl             \offy, \offy, #1       // 2 * (randval & 0xF)
.endif
.endm
1289
// \dst = \src + \offy * \stride + 2 * \offx: turn a (offx, offy) grain
// offset into a grain_lut pointer (entries are 16-bit, hence lsl #1).
.macro add_offset dst, offx, offy, src, stride
        mla             \dst, \offy, \stride, \src // grain_lut += offy * grain_stride
        add             \dst, \dst, \offx, lsl #1  // grain_lut += offx entries
.endm
1294
1295// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
1296//                                 const ptrdiff_t stride,
1297//                                 const uint8_t scaling[SCALING_SIZE],
1298//                                 const int scaling_shift,
1299//                                 const entry grain_lut[][GRAIN_WIDTH],
1300//                                 const int offsets[][2],
1301//                                 const int h, const ptrdiff_t clip,
1302//                                 const ptrdiff_t type,
1303//                                 const int bitdepth_max);
// See the C prototype above: r0 = dst, r1 = src, r2 = stride,
// r3 = scaling; the remaining arguments are on the stack, offset by the
// r4-r11/lr + q4-q7 save area pushed below (9*4 + 4*16 = 100 bytes).
// Sets up clip limits, overlap weights and the four grain_lut pointers,
// then tail-dispatches into fgy_loop_neon.
function fgy_32x32_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
        ldrd            r6,  r7,  [sp, #108]   // offsets, h
        ldr             r8,       [sp, #116]   // clip
        mov             r9,  #GRAIN_WIDTH*2    // grain_lut stride
        ldr             r10,      [sp, #124]   // bitdepth_max

        // eor with 15 computes 15 - scaling_shift for values in 0..15
        eor             r4,  r4,  #15          // 15 - scaling_shift
        vdup.16         q6,  r10               // bitdepth_max
        clz             r10, r10
        vdup.16         q13, r4                // 15 - scaling_shift
        rsb             r10, r10, #24          // bitdepth_min_8
        cmp             r8,  #0
        vdup.16         q12, r10               // bitdepth_min_8

        movrel_local    r12, overlap_coeffs_0

        beq             1f
        // clip to restricted range: [16 << bd_min_8, 235 << bd_min_8]
        vmov.i16        q14, #16
        vmov.i16        q15, #235
        vshl.s16        q14, q14, q12
        vshl.s16        q15, q15, q12
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        vmov.i16        q14, #0
        vmov            q15, q6
2:
        vshr.u16        q6,  q6,  #1           // grain_max

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        // Move grain_lut to its interior origin (entries are 16-bit,
        // hence the doubled byte offsets).
        add             r5,  r5,  #18          // grain_lut += 9
        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r9           // grain_lut += grain_stride

        // Derive the four grain_lut pointers from the random offsets:
        // r5 = current block, r4 = left (offsets[1][0]),
        // r6 = top (offsets[0][1]), r8 = top-left (offsets[1][1]).
        ldr             r10, [r6, #8]          // offsets[1][0]
        calc_offset     r10, r4,  r10, 0,   0
        add_offset      r4,  r10, r4,  r5,  r9
        ldr             r10, [r6, #4]          // offsets[0][1]
        calc_offset     r10, r11, r10, 0,   0
        add_offset      r11, r10, r11, r5,  r9
        ldr             r10, [r6, #12]         // offsets[1][1]
        calc_offset     r10, r8,  r10, 0,   0
        add_offset      r8,  r10, r8,  r5,  r9
        ldr             r6,  [r6]              // offsets[0][0]
        calc_offset     r6,  lr,  r6,  0,   0
        add_offset      r5,  r6,  lr,  r5,  r9

        add             r4,  r4,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        ldr             r10, [sp, #120]        // type
        adr             r11, L(fgy_loop_tbl)

        // Bit 0 of type = vertical (y) overlap; flags are consumed by the
        // beq below, after the table load.
        tst             r10, #1
        ldr             r10, [r11, r10, lsl #2]

        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r8,  r8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx

        add             r11, r11, r10

        beq             1f
        // y overlap: first process 2 rows with the first-row blend
        // weights in d14/d15, keeping the full height in r10.
        vdup.16         d14, d24[0]
        vdup.16         d15, d24[1]
        mov             r10, r7                // backup actual h
        mov             r7,  #2
1:
        // The loops read each 32-pixel row as two 32-byte halves with a
        // post-increment on the first, so pre-shrink both strides by 32.
        sub             r2,  r2,  #32          // src_stride   -= 32
        sub             r9,  r9,  #32          // grain_stride -= 32
        bx              r11
endfunc
1381
// Row loops for fgy_32x32_16bpc_neon, entered via bx with:
//   r0 = dst, r1 = src, r2 = src_stride - 32, r3 = scaling LUT,
//   r5/r4/r6/r8 = grain_lut cur/left/top/top-left pointers,
//   r9 = grain_stride - 32, r7 = rows to process (2 if y overlap),
//   r10 = full h (y overlap only), q12 = overlap coeffs, q13 =
//   15 - scaling_shift, q6 = grain_max, q14/q15 = clip min/max,
//   d14/d15 = current y-overlap weights.
function fgy_loop_neon
L(fgy_loop_tbl):
        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB

// One loop specialization per overlap mode: \ox = blend with the left
// neighbour's grain, \oy = blend with the top neighbour's grain.
.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
        vld1.16         {d0},       [r4],       r9 // grain_lut old
.endif
.if \oy
        vld1.16         {q2,  q3},  [r6]!          // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},       [r8],       r9 // grain_lut top old
.endif
.if \oy
        vld1.16         {q4,  q5},  [r6],       r9 // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0,  q1},  [r1, :128]!    // src
.endif
        vld1.16         {q8,  q9},  [r5]!          // grain_lut
.if !\ox && !\oy
        vld1.16         {q2,  q3},  [r1, :128], r2 // src
.endif
.if !\oy
        vmvn.i16        q5,  #0xf000               // 0x0fff
.endif
        vld1.16         {q10, q11}, [r5],       r9 // grain_lut

.if \ox
        // Horizontal blend of the leftmost 4 grain values with the old
        // column: round2(old*coeff[0] + cur*coeff[1], 5) below.
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vmlal.s16       q0,  d16, d25
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top row's leftmost 4 values,
        // both clamped to [grain_min, grain_max].
        add             r8,  r8,  #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vmvn            d0,  d12                   // grain_min
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d16, d16, d12
        vmin.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d0
        vmax.s16        d4,  d4,  d0
.endif

        // Vertical blend: noise = round2(top*d14 + cur*d15, 5), then
        // clamp to [grain_min, grain_max].
        vmull.s16       q0,  d4,  d14
        vmull.s16       q1,  d5,  d14
        vmull.s16       q2,  d6,  d14
        vmull.s16       q3,  d7,  d14
        vmlal.s16       q0,  d16, d15
        vmlal.s16       q1,  d17, d15
        vmlal.s16       q2,  d18, d15
        vmlal.s16       q3,  d19, d15
        vmull.s16       q8,  d20, d15
        vmull.s16       q9,  d21, d15
        vmull.s16       q10, d22, d15
        vmull.s16       q11, d23, d15
        vmlal.s16       q8,  d8,  d14
        vmlal.s16       q9,  d9,  d14
        vmlal.s16       q10, d10, d14
        vmlal.s16       q11, d11, d14
        vmvn            q4,  q6                   // grain_min
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q6
        vmin.s16        q9,  q1,  q6
        vld1.16         {q0,  q1},  [r1, :128]!    // src
        vmin.s16        q10, q2,  q6
        vmin.s16        q11, q3,  q6
        vmax.s16        q8,  q8,  q4
        vmax.s16        q9,  q9,  q4
        vld1.16         {q2,  q3},  [r1, :128], r2 // src
        vmvn.i16        q5,  #0xf000               // 0x0fff
        vmax.s16        q10, q10, q4
        vmax.s16        q11, q11, q4
.elseif \ox
        vmvn            d4,  d12                   // grain_min
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0,  q1},  [r1, :128]!    // src
        vmin.s16        d16, d16, d12
        vmax.s16        d16, d16, d4
        vld1.16         {q2,  q3},  [r1, :128], r2 // src
.endif

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q5
        vand            q1,  q1,  q5
        vand            q2,  q2,  q5
        vand            q3,  q3,  q5

        bl              gather32_neon

.if \ox || \oy
        // q4-q7 are needed as scratch below; preserve grain_max (q6) and
        // the overlap weights (q7).
        vpush           {q6-q7}
.endif

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5

.if \ox || \oy
        vpop            {q6-q7}
.endif

        vqadd.s16       q0,  q0,  q8   // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

        vmax.s16        q0,  q0,  q14
        vmax.s16        q1,  q1,  q14
        vmax.s16        q2,  q2,  q14
        vmax.s16        q3,  q3,  q14
        vmin.s16        q0,  q0,  q15
        vmin.s16        q1,  q1,  q15
        vmin.s16        q2,  q2,  q15
        vmin.s16        q3,  q3,  q15

        vst1.16         {q0, q1}, [r0, :128]!    // dst
        subs            r7,  r7,  #1
.if \oy
        // After the first row, switch to the second-row overlap weights.
        vdup.16         d14, d25[0]
        vdup.16         d15, d25[1]
.endif
        vst1.16         {q2, q3}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // The 2 y-overlap rows are done; continue with the non-\oy
        // variant for the rest of the block, if any rows remain.
        cmp             r10, #2
        sub             r7,  r10, #2           // restore actual remaining h
        bgt             L(loop_\ox\()0)
.endif
        vpop            {q4-q7}
        pop             {r4-r11,pc}
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc
1550
1551// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
1552//                                      const pixel *const src,
1553//                                      const ptrdiff_t stride,
1554//                                      const uint8_t scaling[SCALING_SIZE],
1555//                                      const Dav1dFilmGrainData *const data,
1556//                                      const entry grain_lut[][GRAIN_WIDTH],
1557//                                      const pixel *const luma_row,
1558//                                      const ptrdiff_t luma_stride,
1559//                                      const int offsets[][2],
1560//                                      const ptrdiff_t h, const ptrdiff_t uv,
1561//                                      const ptrdiff_t is_id,
1562//                                      const ptrdiff_t type,
1563//                                      const int bitdepth_max);
// Common setup for the fguv_32x32_* entry points: \sx/\sy are the
// horizontal/vertical chroma subsampling flags (1 = subsampled) for
// \layout.  See the C prototype above for the argument order; r0 = dst,
// r1 = src, r2 = stride, r3 = scaling, rest on the stack (offset by the
// 100-byte save area pushed below).  Tail-dispatches into the
// fguv_loop_sx* loops (defined past this chunk).
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
        ldrd            r10, r11, [sp, #124]   // uv, is_id
        ldr             r6,       [sp, #136]   // bitdepth_max

        clz             r7,  r6
        rsb             r7,  r7,  #24          // bitdepth_min_8

        // !csfl: load the chroma mapping parameters for this plane into
        // d30: {uv_luma_mult, uv_mult, uv_offset << bd_min_8, bitdepth_max}
        add             r10, r4,  r10, lsl #2  // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        ldrh            r10, [r10, #FGD_UV_OFFSET] // uv_offset
        vld1.16         {d30[]},  [r12]        // uv_luma_mult
        lsl             r10, r10, r7           // uv_offset << bitdepth_min_8
        vld1.16         {d30[1]}, [lr]         // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        eor             lr,  lr,  #15          // 15 - scaling_shift

        vmov.16         d30[2], r10            // uv_offset << bitdepth_min_8

        cmp             r12, #0
        vdup.16         q13, lr                // 15 - scaling_shift

        beq             1f
        // clip to restricted range; chroma max is 240 << bd_min_8, or
        // 235 << bd_min_8 when is_id is set
        cmp             r11, #0
        mov             r8,  #16
        mov             r9,  #240
        lsl             r8,  r8,  r7
        lsl             r9,  r9,  r7
        beq             2f
        // is_id
        mov             r9,  #235
        lsl             r9,  r9,  r7
        b               2f
1:
        // no clip: full range [0, bitdepth_max]
        mov             r8,  #0
        mov             r9,  r6                // bitdepth_max
2:
        vmov.16         d30[3], r6             // bitdepth_max
        vdup.16         d31, r8                // clip_min

        mov             r10, #GRAIN_WIDTH*2    // grain_lut stride

        // y-overlap blend weights: 23/22 when vertically subsampled,
        // 27/17 otherwise
.if \sy
        mov             r6,  #23
        mov             r7,  #22
.else
        mov             r6,  #27
        mov             r7,  #17
.endif
        vmov.16         d31[1], r9             // clip_max

        ldrd            r8,  r9,  [sp, #116]   // offsets, h

        // Move grain_lut to its interior origin, scaled by subsampling
        add             r5,  r5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10          // grain_lut += grain_stride
.endif
        vmov.16         d31[2], r6             // overlap y [0]

        // Derive the four grain_lut pointers from the random offsets:
        // r5 = current block, r4 = left, r8 = top, r11 = top-left
        ldr             r12, [r8, #8]          // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5,  r10

        ldr             r12, [r8, #4]          // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5,  r10

        ldr             r12, [r8, #12]         // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5,  r10

        ldr             r8,  [r8]              // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5,  r10

        vmov.16         d31[3], r7             // overlap y [1]

        add             r4,  r4,  #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx
        ldr             lr,       [sp, #132]   // type
        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        // Bit 0 of type = vertical (y) overlap; flags consumed by the
        // beq below, after the table lookup.
        tst             lr,  #1
        ldr             lr,  [r12, lr,  lsl #2]

        add             r12, r12, lr

        beq             1f
        // y overlap: process 2 rows (1 when vertically subsampled) first,
        // keeping the remaining height in lr
        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
        mov             r9,  #(2 >> \sy)

1:
.if \sy
        add             r7,  r7,  r7           // luma_stride *= 2
.endif
        // The loops read each luma row as 32-byte halves with a
        // post-increment on the first, so pre-shrink the stride by 32.
        sub             r7,  r7,  #32          // luma_stride -= 32

        bx              r12
endfunc
1692.endm
1693
// Instantiate the generic chroma film-grain entry point once per layout:
// arguments are (layout suffix, sx, sy), where sx/sy are the horizontal/
// vertical chroma subsampling shifts used throughout the setup code above.
fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
1697
// Jump-table-entered inner loops applying film grain noise to 32-pixel-wide
// chroma rows at 16 bpc, with no horizontal chroma subsampling
// (4:4:4 layout, sx == 0).  Control arrives here from the fguv setup macro
// jumping through L(fguv_loop_sx0_tbl); the shared exit at label 9 below
// pops the registers that the setup code pushed.
//
// Register roles as used below (set up by the caller):
//   r0 = dst, r1 = src, r2 = stride (reduced by 32 here because each row
//        is transferred as two 32-byte halves, the first post-incremented)
//   r4 = grain_lut left-neighbour column (x overlap)
//   r5 = grain_lut for this block, r10 = grain_lut stride (also -32)
//   r8 = grain_lut top neighbour, r11 = grain_lut top-left neighbour
//   r6 = luma_row, r7 = luma_stride, r9 = rows left in current segment
//   lr = remaining h after the y-overlap rows (when y overlap is active)
//   q12 = overlap coeffs, q13 = per-lane shift of 15 - scaling_shift
//   d30/d31 = packed scalars (uv mults/offset, bitdepth_max,
//        clip min/max, y overlap coeffs)
// NOTE(review): roles inferred from the loads and existing comments visible
// in this section — confirm against the fguv setup macro above.
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        // Jump table indexed by "type"; entry order matches the macro
        // instantiations below, i.e. presumably (csfl << 2)|(ox << 1)|oy.
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

// One loop specialization per combination of:
//   csfl: chroma_scaling_from_luma — the scaling lookup input is raw luma
//         (csfl == 1) or a blend of luma and the chroma src (csfl == 0)
//   ox:   blend grain with the left neighbour block (x overlap)
//   oy:   blend grain with the top neighbour block (y overlap)
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        sub             r2,  r2,  #32          // src_stride   -= 32
        sub             r10, r10, #32          // grain_stride -= 32
.if \oy
        mov             r12, lr                // save remaining h; lr is clobbered inside the loop
.endif
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
1:
.if \ox
        vld1.16         {d0},       [r4],       r10 // grain_lut old
.endif
.if \oy
        vld1.16         {q2,  q3},  [r8]!           // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},       [r11],      r10 // grain_lut top old
.endif
.if !\ox && !\oy
        vld1.16         {q0,  q1},  [r6, :128]!     // luma
.endif
        vld1.16         {q8,  q9},  [r5]!           // grain_lut
.if \oy
        vld1.16         {q4,  q5},  [r8],       r10 // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
.endif
.if \oy
        vdup.16         d28, d31[2]                 // overlap y coeff
        vdup.16         d29, d31[3]                 // overlap y coeff
.endif
        vld1.16         {q10, q11}, [r5],       r10 // grain_lut

.if \ox
        // x overlap: old*coeff + new*coeff (d24/d25), rounded >> 5,
        // then clamped to [grain_min, grain_max].
        vdup.16         q7,  d30[3]                // bitdepth_max
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1               // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                    // grain_min
.endif

.if \oy
.if \ox
        // Also x-blend the top row's grain before the y blend below.
        add             r11, r11, #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        // y overlap: top*coeff (d28) + current*coeff (d29), rounded >> 5,
        // then clamped to [grain_min, grain_max].
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]                // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1               // grain_max
.endif
        vmull.s16       q8,  d20, d29
        vmull.s16       q9,  d21, d29
        vmull.s16       q10, d22, d29
        vmull.s16       q11, d23, d29
.if !\ox
        vmvn            q6,  q7                    // grain_min
.endif
        vmlal.s16       q8,  d8,  d28
        vmlal.s16       q9,  d9,  d28
        vmlal.s16       q10, d10, d28
        vmlal.s16       q11, d11, d28
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q7
        vmin.s16        q9,  q1,  q7
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        q10, q2,  q7
        vmin.s16        q11, q3,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        q10, q10, q6
        vmax.s16        q11, q11, q6
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

.if !\csfl
        // !csfl: scaling input = clamp((luma*uv_luma_mult +
        // src*uv_mult) >> 6 + uv_offset, 0, bitdepth_max).
        vdup.16         d28, d30[0]   // uv_luma_mult
        vld1.16         {q4,  q5},  [r1, :128]! // src
        vdup.16         d29, d30[1]   // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d8,  d29
        vmlal.s16       q7,  d9,  d29
        vmlal.s16       q0,  d10, d29
        vmlal.s16       q1,  d11, d29
        vld1.16         {q4,  q5},  [r1, :128]  // src
        sub             r1,  r1,  #32
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
        vmlal.s16       q0,  d8,  d29
        vmlal.s16       q1,  d9,  d29
        vmlal.s16       q2,  d10, d29
        vmlal.s16       q3,  d11, d29
        vdup.16         q14, d30[2]   // uv_offset
        vshrn.s32       d0,  q0,  #6
        vshrn.s32       d1,  q1,  #6
        vshrn.s32       d2,  q2,  #6
        vshrn.s32       d3,  q3,  #6
        vdup.16         q4,  d30[3]   // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vadd.i16        q2,  q0,  q14
        vadd.i16        q3,  q1,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmin.s16        q2,  q2,  q4
        vmin.s16        q3,  q3,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
        vmax.s16        q2,  q2,  q5
        vmax.s16        q3,  q3,  q5
.else
        // csfl: use the luma values directly as the scaling input.
        vdup.16         q14, d30[3]  // bitdepth_max
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
        vand            q2,  q2,  q14
        vand            q3,  q3,  q14
.endif

        // Look up per-pixel scaling values for the 32 pixels in q0-q3;
        // results come back as bytes in d8-d11 (see "scaling" below).
        bl              gather32_neon

        vld1.16         {q0,  q1},  [r1, :128]!    // src

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vld1.16         {q2,  q3},  [r1, :128], r2 // src

        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5


        vdup.16         q4,  d31[0]    // clip_min
        vdup.16         q5,  d31[1]    // clip_max

        vqadd.s16       q0,  q0,  q8   // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

.if \oy
        // Grab the first two overlap x coeffs; they become the y overlap
        // coeffs for the next row (written into d31[1] below).
        vmov.32         lr,  d25[0] // 2 first 16 bit coeffs from overlap x
.endif

        // Clamp the result to [clip_min, clip_max].
        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmax.s16        q2,  q2,  q4
        vmax.s16        q3,  q3,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5
        vmin.s16        q2,  q2,  q5
        vmin.s16        q3,  q3,  q5

        vst1.16         {q0, q1}, [r0, :128]! // dst

        subs            r9,  r9,  #1
.if \oy
        vmov.32         d31[1], lr  // new coeffs for overlap y
.endif

        vst1.16         {q2, q3}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Done with the initial y-overlap row(s): continue with the
        // corresponding non-y-overlap variant for the remaining height.
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

// Common exit for all fguv_loop_sx0 variants: undo the register saves
// done by the fguv setup code and return to its caller.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
1944
// Jump-table-entered inner loops applying film grain noise to 16-pixel-wide
// chroma rows at 16 bpc, for layouts with horizontal chroma subsampling
// (4:2:0 / 4:2:2, sx == 1).  The 32 luma pixels covering each chroma row
// are pair-averaged horizontally before the scaling lookup.  Entered by the
// fguv setup macro jumping through L(fguv_loop_sx1_tbl); the shared exit at
// label 9 below pops the registers pushed by that setup code.
//
// Register roles as used below (set up by the caller):
//   r0 = dst, r1 = src, r2 = stride
//   r4 = grain_lut left-neighbour column (x overlap)
//   r5 = grain_lut for this block, r10 = grain_lut stride
//   r8 = grain_lut top neighbour, r11 = grain_lut top-left neighbour
//   r6 = luma_row, r7 = luma_stride, r9 = rows left in current segment
//   lr = remaining h after the y-overlap rows (when y overlap is active)
//   q12 = overlap coeffs, q13 = per-lane shift of 15 - scaling_shift
//   d30/d31 = packed scalars (uv mults/offset, bitdepth_max,
//        clip min/max, y overlap coeffs)
// NOTE(review): roles inferred from the loads and existing comments visible
// in this section — confirm against the fguv setup macro above.
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        // Jump table indexed by "type"; entry order matches the macro
        // instantiations below, i.e. presumably (csfl << 2)|(ox << 1)|oy.
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

// One loop specialization per combination of:
//   csfl: chroma_scaling_from_luma — the scaling lookup input is averaged
//         luma (csfl == 1) or a blend of luma and the chroma src (csfl == 0)
//   ox:   blend grain with the left neighbour block (x overlap)
//   oy:   blend grain with the top neighbour block (y overlap)
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr                // save remaining h; lr is clobbered inside the loop
.endif
1:
.if \ox
        vld1.16         {d0},       [r4],       r10 // grain_lut old
.endif
.if \ox && \oy
        vld1.16         {d2},       [r11],      r10 // grain_lut top old
.endif
.if \oy
        vld1.16         {q2,  q3},  [r8],       r10 // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0,  q1},  [r6, :128]!     // luma
.endif
        vld1.16         {q8,  q9},  [r5],       r10 // grain_lut
.if \oy
        vdup.16         d28, d31[2]                 // overlap y coeff
        vdup.16         d29, d31[3]                 // overlap y coeff
.endif
.if !\ox && !\oy
        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
.endif

.if \ox
        // x overlap: old*coeff + new*coeff (d24/d25), rounded >> 5,
        // then clamped to [grain_min, grain_max].
        vdup.16         q7,  d30[3]                // bitdepth_max
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1               // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                    // grain_min
.endif

.if \oy
.if \ox
        // Also x-blend the top row's grain before the y blend below.
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        // y overlap: top*coeff (d28) + current*coeff (d29), rounded >> 5,
        // then clamped to [grain_min, grain_max].
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]                // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1               // grain_max
.endif
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d17, q1,  #5
        vqrshrn.s32     d18, q2,  #5
        vqrshrn.s32     d19, q3,  #5
.if !\ox
        vmvn            q6,  q7                    // grain_min
.endif
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        q8,  q8,  q7
        vmin.s16        q9,  q9,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
.elseif \ox
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

        // Pair-average the 32 luma pixels horizontally down to the 16
        // chroma positions (rounded >> 1), since sx == 1.
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d2,  d4,  d5
        vpadd.i16       d3,  d6,  d7
        vrshr.u16       q0,  q0,  #1
        vrshr.u16       q1,  q1,  #1
.if !\csfl
        // !csfl: scaling input = clamp((luma*uv_luma_mult +
        // src*uv_mult) >> 6 + uv_offset, 0, bitdepth_max).
        vdup.16         d28, d30[0]   // uv_luma_mult
        vld1.16         {q2,  q3},  [r1, :128], r2 // src
        vdup.16         d29, d30[1]   // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d4,  d29
        vmlal.s16       q7,  d5,  d29
        vmlal.s16       q0,  d6,  d29
        vmlal.s16       q1,  d7,  d29
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vdup.16         q14, d30[2]   // uv_offset
        vdup.16         q4,  d30[3]   // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
.else
        // csfl: use the averaged luma values directly as the scaling input.
        vdup.16         q14, d30[3]  // bitdepth_max
        vld1.16         {q2,  q3},  [r1, :128], r2 // src

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
.endif

        // Look up per-pixel scaling values for the 16 pixels in q0/q1;
        // results come back as bytes in d8/d9 (see "scaling" below).
        bl              gather16_neon

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9

        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13

        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7


        vdup.16         q4,  d31[0]    // clip_min
        vdup.16         q5,  d31[1]    // clip_max

        vqadd.s16       q0,  q2,  q8   // *src + noise
        vqadd.s16       q1,  q3,  q9

.if \oy
        // Swap the two last coefficients of d31, place them first in d28
        vrev64.16       d28, d31
.endif

        // Clamp the result to [clip_min, clip_max].
        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5

        subs            r9,  r9,  #1
.if \oy
        // Take the first two 16 bit coefficients of d28 and place them at the
        // end of d31
        vtrn.32         d31, d28
.endif

        vst1.16         {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Done with the initial y-overlap row(s): continue with the
        // corresponding non-y-overlap variant for the remaining height.
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

// Common exit for all fguv_loop_sx1 variants: undo the register saves
// done by the fguv setup code and return to its caller.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
2138