/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define SUM_STRIDE (384+16)

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
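//
// Vertical pass of the 3x3 box sums for self-guided restoration: each output
// row of sumsq (int32) and sum (int16) is the sum of three consecutive input
// rows, with missing top/bottom rows effectively replicated from the nearest
// edge row according to the LrEdgeFlags. The buffers are assumed to be laid
// out with SUM_STRIDE elements per row; the sums are written back into the
// same buffers, with rows that are still needed kept live in registers.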
function sgr_box3_v_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        add             r12, r3,  #2 // Number of output rows to move back
        mov             lr,  r3      // Number of input rows to move back
        add             r2,  r2,  #2 // Actual summed width
        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
        mov             r8,       #(2*SUM_STRIDE) // sum stride
        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride

        tst             r4,  #4 // LR_HAVE_TOP
        beq             0f
        // If we have top, read from row -2.
        sub             r5,  r0,  #(4*SUM_STRIDE)
        sub             r6,  r1,  #(2*SUM_STRIDE)
        add             lr,  lr,  #2
        b               1f
0:
        // !LR_HAVE_TOP
        // If we don't have top, read from row 0 even if
        // we start writing to row -1.
        add             r5,  r0,  #(4*SUM_STRIDE)
        add             r6,  r1,  #(2*SUM_STRIDE)
1:

        tst             r4,  #8 // LR_HAVE_BOTTOM
        beq             1f
        // LR_HAVE_BOTTOM
        add             r3,  r3,  #2  // Sum all h+2 lines with the main loop
        add             lr,  lr,  #2
1:
        mov             r9,  r3       // Backup of h for next loops

1:
        // Start of horizontal loop; start one vertical filter slice.
        // Start loading rows into q8-q13 and q0-q2 taking top
        // padding into consideration.
        tst             r4,  #4 // LR_HAVE_TOP
        vld1.32         {q8,  q9},  [r5, :128], r7
        vld1.16         {q0},       [r6, :128], r8
        beq             2f
        // LR_HAVE_TOP
        vld1.32         {q10, q11}, [r5, :128], r7
        vld1.16         {q1},       [r6, :128], r8
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q2},       [r6, :128], r8
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q10, q8
        vmov            q11, q9
        vmov            q1,  q0
        vmov            q12, q8
        vmov            q13, q9
        vmov            q2,  q0

3:
        subs            r3,  r3,  #1
.macro add3
        vadd.i32        q8,  q8,  q10
        vadd.i32        q9,  q9,  q11
        vadd.i16        q0,  q0,  q1
        vadd.i32        q8,  q8,  q12
        vadd.i32        q9,  q9,  q13
        vadd.i16        q0,  q0,  q2
        vst1.32         {q8, q9}, [r0, :128], r7
        vst1.16         {q0},     [r1, :128], r8
.endm
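        // add3 sums the three rows currently held in q8-q9/q0, q10-q11/q1
        // and q12-q13/q2 and stores one row of sumsq and sum. The vmov
        // sequence below then slides this three-row window down by one row,
        // so only one new row pair has to be loaded per output row.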
        add3
        vmov            q8,  q10
        vmov            q9,  q11
        vmov            q0,  q1
        vmov            q10, q12
        vmov            q11, q13
        vmov            q1,  q2
        ble             4f
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q2},       [r6, :128], r8
        b               3b

4:
        tst             r4,  #8 // LR_HAVE_BOTTOM
        bne             5f
        // !LR_HAVE_BOTTOM
        // Produce two more rows, extending the already loaded rows.
        add3
        vmov            q8,  q10
        vmov            q9,  q11
        vmov            q0,  q1
        add3

5:      // End of one vertical slice.
        subs            r2,  r2,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        // Input pointers
        mls             r5,  r7,  lr,  r5
        mls             r6,  r8,  lr,  r6
        // Output pointers
        mls             r0,  r7,  r12, r0
        mls             r1,  r8,  r12, r1
        add             r0,  r0,  #32
        add             r1,  r1,  #16
        add             r5,  r5,  #32
        add             r6,  r6,  #16
        mov             r3,  r9
        b               1b

0:
        pop             {r4-r9,pc}
.purgem add3
endfunc

// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
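//
// Vertical pass of the 5x5 box sums, analogous to sgr_box3_v_neon above but
// summing five rows per output. The box5 sums are only produced for every
// second row: each pass of the vertical loop loads two new input rows,
// stores one output row and then advances the output pointers past the
// skipped row.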
function sgr_box5_v_neon, export=1
        push            {r4-r9,lr}
        vpush           {q5-q7}
        ldr             r4,  [sp, #76]
        add             r12, r3,  #2 // Number of output rows to move back
        mov             lr,  r3      // Number of input rows to move back
        add             r2,  r2,  #8 // Actual summed width
        mov             r7,       #(4*SUM_STRIDE) // sumsq stride
        mov             r8,       #(2*SUM_STRIDE) // sum stride
        sub             r0,  r0,  #(4*SUM_STRIDE) // sumsq -= stride
        sub             r1,  r1,  #(2*SUM_STRIDE) // sum   -= stride

        tst             r4,  #4 // LR_HAVE_TOP
        beq             0f
        // If we have top, read from row -2.
        sub             r5,  r0,  #(4*SUM_STRIDE)
        sub             r6,  r1,  #(2*SUM_STRIDE)
        add             lr,  lr,  #2
        b               1f
0:
        // !LR_HAVE_TOP
        // If we don't have top, read from row 0 even if
        // we start writing to row -1.
        add             r5,  r0,  #(4*SUM_STRIDE)
        add             r6,  r1,  #(2*SUM_STRIDE)
1:

        tst             r4,  #8 // LR_HAVE_BOTTOM
        beq             0f
        // LR_HAVE_BOTTOM
        add             r3,  r3,  #2  // Handle h+2 lines with the main loop
        add             lr,  lr,  #2
        b               1f
0:
        // !LR_HAVE_BOTTOM
        sub             r3,  r3,  #1  // Handle h-1 lines with the main loop
1:
        mov             r9,  r3       // Backup of h for next loops

1:
        // Start of horizontal loop; start one vertical filter slice.
        // Start loading rows into q6-q15 and q0-q3,q5 taking top
        // padding into consideration.
        tst             r4,  #4 // LR_HAVE_TOP
        vld1.32         {q6,  q7},  [r5, :128], r7
        vld1.16         {q0},       [r6, :128], r8
        beq             2f
        // LR_HAVE_TOP
        vld1.32         {q10, q11}, [r5, :128], r7
        vld1.16         {q2},       [r6, :128], r8
        vmov            q8,  q6
        vmov            q9,  q7
        vmov            q1,  q0
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3},       [r6, :128], r8
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q8,  q6
        vmov            q9,  q7
        vmov            q1,  q0
        vmov            q10, q6
        vmov            q11, q7
        vmov            q2,  q0
        vmov            q12, q6
        vmov            q13, q7
        vmov            q3,  q0

3:
        cmp             r3,  #0
        beq             4f
        vld1.32         {q14, q15}, [r5, :128], r7
        vld1.16         {q5},       [r6, :128], r8

3:
        // Start of vertical loop
        subs            r3,  r3,  #2
.macro add5
        vadd.i32        q6,  q6,  q8
        vadd.i32        q7,  q7,  q9
        vadd.i16        q0,  q0,  q1
        vadd.i32        q6,  q6,  q10
        vadd.i32        q7,  q7,  q11
        vadd.i16        q0,  q0,  q2
        vadd.i32        q6,  q6,  q12
        vadd.i32        q7,  q7,  q13
        vadd.i16        q0,  q0,  q3
        vadd.i32        q6,  q6,  q14
        vadd.i32        q7,  q7,  q15
        vadd.i16        q0,  q0,  q5
        vst1.32         {q6, q7}, [r0, :128], r7
        vst1.16         {q0},     [r1, :128], r8
.endm
        add5
.macro shift2
        vmov            q6,  q10
        vmov            q7,  q11
        vmov            q0,  q2
        vmov            q8,  q12
        vmov            q9,  q13
        vmov            q1,  q3
        vmov            q10, q14
        vmov            q11, q15
        vmov            q2,  q5
.endm
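        // shift2 slides the five-row register window down by two rows; the
        // two newest rows (q12-q15 and q3, q5) are then reloaded below
        // before looping.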
        shift2
        add             r0,  r0,  r7
        add             r1,  r1,  r8
        ble             5f
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3},       [r6, :128], r8
        vld1.32         {q14, q15}, [r5, :128], r7
        vld1.16         {q5},       [r6, :128], r8
        b               3b

4:
        // h == 1, !LR_HAVE_BOTTOM.
        // Pad the last row with the only content row, and add.
        vmov            q14, q12
        vmov            q15, q13
        vmov            q5,  q3
        add5
        shift2
        add             r0,  r0,  r7
        add             r1,  r1,  r8
        add5
        b               6f

5:
        tst             r4,  #8 // LR_HAVE_BOTTOM
        bne             6f
        // !LR_HAVE_BOTTOM
        cmp             r3,  #0
        bne             5f
        // Three edge rows are left; output the one at h-2 and the
        // past-the-edge one at h.
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3},       [r6, :128], r8
        // Pad the past-edge row from the last content row.
        vmov            q14, q12
        vmov            q15, q13
        vmov            q5,  q3
        add5
        shift2
        add             r0,  r0,  r7
        add             r1,  r1,  r8
        // The last two rows are already padded properly here.
        add5
        b               6f

5:
        // r3 == -1, two rows left, output one.
        // Pad the last two rows from the mid one.
        vmov            q12, q10
        vmov            q13, q11
        vmov            q3,  q2
        vmov            q14, q10
        vmov            q15, q11
        vmov            q5,  q2
        add5
        add             r0,  r0,  r7
        add             r1,  r1,  r8
        b               6f

6:      // End of one vertical slice.
        subs            r2,  r2,  #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        // Input pointers
        mls             r5,  r7,  lr,  r5
        mls             r6,  r8,  lr,  r6
        // Output pointers
        mls             r0,  r7,  r12, r0
        mls             r1,  r8,  r12, r1
        add             r0,  r0,  #32
        add             r1,  r1,  #16
        add             r5,  r5,  #32
        add             r6,  r6,  #16
        mov             r3,  r9
        b               1b

0:
        vpop            {q5-q7}
        pop             {r4-r9,pc}
.purgem add5
endfunc

// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
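//
// sgr_calc_ab1_neon handles the 3x3 box (n = 9) and processes every row;
// sgr_calc_ab2_neon handles the 5x5 box (n = 25) and processes every second
// row, hence the halved height and doubled row stride it sets up before
// continuing into the shared sgr_calc_ab_neon below.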
function sgr_calc_ab1_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #84]
        add             r3,  r3,  #2   // h += 2
        clz             r6,  r5
        vmov.i32        q15, #9        // n
        movw            r5,  #455
        mov             lr,  #SUM_STRIDE
        b               sgr_calc_ab_neon
endfunc

function sgr_calc_ab2_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #84]
        add             r3,  r3,  #3   // h += 3
        clz             r6,  r5
        asr             r3,  r3,  #1   // h /= 2
        vmov.i32        q15, #25       // n
        mov             r5,  #164
        mov             lr,  #(2*SUM_STRIDE)
endfunc

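// Shared tail of the two functions above. For each position it takes
// a = box sum of squares and b = box sum (both prescaled towards 8-bit range
// for higher bitdepths) and computes
//     p = imax(a * n - b * b, 0)
//     z = imin((p * s) >> 20, 255)   (rounded, saturating; s = strength)
//     x = sgr_x_by_x[z]
// then stores (x * b * one_by_x + (1 << 11)) >> 12 back into the a buffer
// and 256 - x into the b buffer, where one_by_x is 455 for n == 9 and 164
// for n == 25 (roughly (1 << 12) / n).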
function sgr_calc_ab_neon
        movrel          r12, X(sgr_x_by_x)
        sub             r6,  r6,  #24  // -bitdepth_min_8
        vld1.8          {q8, q9}, [r12, :128]!
        add             r7,  r6,  r6   // -2*bitdepth_min_8
        vmov.i8         q11, #5
        vmov.i8         d10, #55       // idx of last 5
        vld1.8          {q10},    [r12, :128]
        vmov.i8         d11, #72       // idx of last 4
        vmov.i8         d12, #101      // idx of last 3
        vmov.i8         d13, #169      // idx of last 2
        vmov.i8         d14, #254      // idx of last 1
        vmov.i8         d15, #32       // elements consumed in first vtbl
        add             r2,  r2,  #2   // w += 2
        add             r12, r2,  #7
        bic             r12, r12, #7   // aligned w
        sub             r12, lr,  r12  // increment between rows
        vdup.32         q12, r4
        sub             r0,  r0,  #(4*(SUM_STRIDE))
        sub             r1,  r1,  #(2*(SUM_STRIDE))
        mov             r4,  r2        // backup of w
        vsub.i8         q8,  q8,  q11
        vsub.i8         q9,  q9,  q11
        vsub.i8         q10, q10, q11
1:
        vld1.32         {q0, q1}, [r0, :128] // a
        vld1.16         {q2},     [r1, :128] // b
        vdup.32         q13, r7        // -2*bitdepth_min_8
        vdup.16         q14, r6        // -bitdepth_min_8
        subs            r2,  r2,  #8
        vrshl.s32       q0,  q0,  q13
        vrshl.s32       q1,  q1,  q13
        vrshl.s16       q4,  q2,  q14
        vmul.i32        q0,  q0,  q15  // a * n
        vmul.i32        q1,  q1,  q15  // a * n
        vmull.u16       q3,  d8,  d8   // b * b
        vmull.u16       q4,  d9,  d9   // b * b
        vqsub.u32       q0,  q0,  q3   // imax(a * n - b * b, 0)
        vqsub.u32       q1,  q1,  q4   // imax(a * n - b * b, 0)
        vmul.i32        q0,  q0,  q12  // p * s
        vmul.i32        q1,  q1,  q12  // p * s
        vqshrn.u32      d0,  q0,  #16
        vqshrn.u32      d1,  q1,  #16
        vqrshrn.u16     d0,  q0,  #4   // imin(z, 255)

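        // Table lookup of x = sgr_x_by_x[z]: only the first 48 table entries
        // were loaded above, with 5 subtracted from each. For z >= 48 the
        // table value is at most 5, so it is reconstructed instead as 5
        // (added via d22) minus one for each "idx of last N" threshold that
        // z exceeds; for smaller z the added 5 cancels the subtracted 5.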
        vcgt.u8         d2,  d0,  d10  // = -1 if sgr_x_by_x[d0] < 5
        vcgt.u8         d3,  d0,  d11  // = -1 if sgr_x_by_x[d0] < 4
        vtbl.8          d1,  {q8, q9}, d0
        vcgt.u8         d6,  d0,  d12  // = -1 if sgr_x_by_x[d0] < 3
        vsub.i8         d9,  d0,  d15  // indices for vtbx
        vcgt.u8         d7,  d0,  d13  // = -1 if sgr_x_by_x[d0] < 2
        vadd.i8         d2,  d2,  d3
        vtbx.8          d1,  {q10}, d9
        vcgt.u8         d8,  d0,  d14  // = -1 if sgr_x_by_x[d0] < 1
        vadd.i8         d6,  d6,  d7
        vadd.i8         d8,  d8,  d22
        vadd.i8         d2,  d2,  d6
        vadd.i8         d1,  d1,  d8
        vadd.i8         d1,  d1,  d2
        vmovl.u8        q0,  d1        // x

        vmov.i16        q13, #256
        vdup.32         q14, r5        // one_by_x

        vmull.u16       q1,  d0,  d4   // x * BB[i]
        vmull.u16       q2,  d1,  d5   // x * BB[i]
        vmul.i32        q1,  q1,  q14  // x * BB[i] * sgr_one_by_x
        vmul.i32        q2,  q2,  q14  // x * BB[i] * sgr_one_by_x
        vrshr.s32       q1,  q1,  #12  // AA[i]
        vrshr.s32       q2,  q2,  #12  // AA[i]
        vsub.i16        q0,  q13, q0   // 256 - x

        vst1.32         {q1, q2}, [r0, :128]!
        vst1.16         {q0},     [r1, :128]!
        bgt             1b

        subs            r3,  r3,  #1
        ble             0f
        add             r0,  r0,  r12, lsl #2
        add             r1,  r1,  r12, lsl #1
        mov             r2,  r4
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r7,pc}
endfunc