/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box3_vert_neon, export=1
        // Vertical step of the SGR 3x3 box filter: sums three rows of
        // sumsq (32-bit) and sum (16-bit), then derives the per-pixel
        // a coefficient (stored to AA/x2) and 256-x (stored to BB/x3),
        // 8 pixels per loop iteration.
        // d8-d13 are the low halves of v8-v13, callee-saved per AAPCS64.
        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]

        add             w4,  w4,  #2   // w4 = w + 2 (edge columns included)
        clz             w9,  w6        // bitdepth_max
        dup             v28.4s,   w5   // strength

        // Fetch the three row pointers from *sumsq (x5, x6, x0) and
        // from *sum (x7, x8, x1).
        ldp             x5,  x6,  [x0]
        ldr             x0,       [x0, #16]
        ldp             x7,  x8,  [x1]
        ldr             x1,       [x1, #16]

        movi            v31.4s,   #9   // n

        sub             w9,  w9,  #24  // -bitdepth_min_8
        movrel          x12, X(sgr_x_by_x)
        mov             w13, #455      // one_by_x
        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
        dup             v6.8h,    w9   // -bitdepth_min_8
        movi            v19.16b,  #5
        movi            v20.8b,   #55  // idx of last 5
        movi            v21.8b,   #72  // idx of last 4
        movi            v22.8b,   #101 // idx of last 3
        movi            v23.8b,   #169 // idx of last 2
        movi            v24.8b,   #254 // idx of last 1
        saddl           v7.4s,    v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,   #1, lsl #8      // 256
        dup             v30.4s,   w13  // one_by_x

        // Bias the first 48 table entries down by 5; the tail values
        // are reconstructed from the cmhi comparisons below.
        sub             v16.16b, v16.16b, v19.16b
        sub             v17.16b, v17.16b, v19.16b
        sub             v18.16b, v18.16b, v19.16b

        // Prime the software pipeline with the first 8 columns.
        ld1             {v8.4s,  v9.4s},  [x5], #32
        ld1             {v10.4s, v11.4s}, [x6], #32
        ld1             {v12.8h},         [x7], #16
        ld1             {v13.8h},         [x8], #16
        ld1             {v0.4s, v1.4s},   [x0], #32
        ld1             {v2.8h},          [x1], #16
1:

        // Vertical sum of the three rows of sumsq (v0/v1) and sum (v2).
        add             v8.4s,   v8.4s,   v10.4s
        add             v9.4s,   v9.4s,   v11.4s

        add             v12.8h,  v12.8h,  v13.8h

        subs            w4,  w4,  #8   // 8 columns consumed per iteration
        add             v0.4s,   v0.4s,   v8.4s
        add             v1.4s,   v1.4s,   v9.4s
        add             v2.8h,   v2.8h,   v12.8h

        // Normalize to 8-bit depth, then p = imax(a*n - b*b, 0), z = p*s.
        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v1.4s,   v1.4s,   v7.4s
        srshl           v4.8h,   v2.8h,   v6.8h
        mul             v0.4s,   v0.4s,   v31.4s // a * n
        mul             v1.4s,   v1.4s,   v31.4s // a * n
        umull           v3.4s,   v4.4h,   v4.4h  // b * b
        umull2          v4.4s,   v4.8h,   v4.8h  // b * b
        uqsub           v0.4s,   v0.4s,   v3.4s  // imax(a * n - b * b, 0)
        uqsub           v1.4s,   v1.4s,   v4.4s  // imax(a * n - b * b, 0)
        mul             v0.4s,   v0.4s,   v28.4s // p * s
        mul             v1.4s,   v1.4s,   v28.4s // p * s
        ld1             {v8.4s,  v9.4s},  [x5], #32   // preload next iteration
        uqshrn          v0.4h,   v0.4s,   #16
        uqshrn2         v0.8h,   v1.4s,   #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,   v0.8h,   #4     // imin(z, 255)

        ld1             {v12.8h},         [x7], #16

        // Table lookup of x = sgr_x_by_x[z]; the tbl only covers the
        // first 48 (biased) entries, the cmhi results add back the
        // plateau values for larger indices.
        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
        add             v25.8b,  v25.8b,  v26.8b
        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
        add             v27.8b,  v27.8b,  v4.8b
        add             v5.8b,   v5.8b,   v19.8b
        add             v25.8b,  v25.8b,  v27.8b
        add             v5.8b,   v1.8b,   v5.8b
        ld1             {v13.8h},         [x8], #16
        add             v5.8b,   v5.8b,   v25.8b
        ld1             {v0.4s, v1.4s},   [x0], #32
        uxtl            v5.8h,   v5.8b           // x

        // AA[i] = (x * BB[i] * one_by_x + (1 << 11)) >> 12
        umull           v3.4s,   v5.4h,   v2.4h  // x * BB[i]
        umull2          v4.4s,   v5.8h,   v2.8h  // x * BB[i]
        mul             v3.4s,   v3.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,   v3.4s,   #12    // AA[i]
        srshr           v4.4s,   v4.4s,   #12    // AA[i]
        sub             v5.8h,   v29.8h,  v5.8h  // 256 - x
        ld1             {v2.8h},          [x1], #16

        st1             {v3.4s, v4.4s}, [x2], #32
        st1             {v5.8h}, [x3], #16
        b.gt            1b

        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
function sgr_box5_vert_neon, export=1
        // Vertical step of the SGR 5x5 box filter: sums five rows of
        // sumsq (32-bit) and sum (16-bit), then derives the per-pixel
        // a coefficient (stored to AA/x2) and 256-x (stored to BB/x3),
        // 8 pixels per loop iteration. Same scheme as the box3 variant
        // but with n = 25 and one_by_x = 164.
        // d8-d15 are the low halves of v8-v15, callee-saved per AAPCS64.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2   // w4 = w + 2 (edge columns included)
        clz             w15, w6        // bitdepth_max
        dup             v28.4s,   w5   // strength

        // Fetch the five row pointers from *sumsq (x5-x8, x0) and
        // from *sum (x9-x12, x1).
        ldp             x5,  x6,  [x0]
        ldp             x7,  x8,  [x0, #16]
        ldr             x0,       [x0, #32]
        ldp             x9,  x10, [x1]
        ldp             x11, x12, [x1, #16]
        ldr             x1,       [x1, #32]

        movi            v31.4s,   #25   // n

        sub             w15, w15, #24  // -bitdepth_min_8
        movrel          x13, X(sgr_x_by_x)
        mov             w14, #164      // one_by_x
        ld1             {v16.16b, v17.16b, v18.16b}, [x13]
        dup             v6.8h,   w15  // -bitdepth_min_8
        movi            v19.16b, #5
        movi            v24.8b,  #254 // idx of last 1
        saddl           v7.4s,   v6.4h,   v6.4h  // -2*bitdepth_min_8
        movi            v29.8h,  #1, lsl #8      // 256
        dup             v30.4s,  w14  // one_by_x

        // Bias the first 48 table entries down by 5; the tail values
        // are reconstructed from the cmhi comparisons below.
        sub             v16.16b, v16.16b, v19.16b
        sub             v17.16b, v17.16b, v19.16b
        sub             v18.16b, v18.16b, v19.16b

        // Prime the software pipeline with the first 8 columns.
        ld1             {v8.4s,  v9.4s},  [x5], #32
        ld1             {v10.4s, v11.4s}, [x6], #32
        ld1             {v12.4s, v13.4s}, [x7], #32
        ld1             {v14.4s, v15.4s}, [x8], #32
        ld1             {v20.8h},         [x9], #16
        ld1             {v21.8h},         [x10], #16
        ld1             {v22.8h},         [x11], #16
        ld1             {v23.8h},         [x12], #16
        ld1             {v0.4s,  v1.4s},  [x0], #32
        ld1             {v2.8h},          [x1], #16

1:
        // Vertical sum of the five rows of sumsq (v0/v1) and sum (v2).
        add             v8.4s,   v8.4s,   v10.4s
        add             v9.4s,   v9.4s,   v11.4s
        add             v12.4s,  v12.4s,  v14.4s
        add             v13.4s,  v13.4s,  v15.4s

        add             v20.8h,  v20.8h,  v21.8h
        add             v22.8h,  v22.8h,  v23.8h

        add             v0.4s,   v0.4s,   v8.4s
        add             v1.4s,   v1.4s,   v9.4s
        add             v2.8h,   v2.8h,   v20.8h

        add             v0.4s,   v0.4s,   v12.4s
        add             v1.4s,   v1.4s,   v13.4s
        add             v2.8h,   v2.8h,   v22.8h

        subs            w4,  w4,  #8   // 8 columns consumed per iteration

        // v20-v23 doubled as row accumulators above, so the index
        // constants must be rematerialized every iteration.
        movi            v20.8b,  #55  // idx of last 5
        movi            v21.8b,  #72  // idx of last 4
        movi            v22.8b,  #101 // idx of last 3
        movi            v23.8b,  #169 // idx of last 2

        // Normalize to 8-bit depth, then p = imax(a*n - b*b, 0), z = p*s.
        srshl           v0.4s,   v0.4s,   v7.4s
        srshl           v1.4s,   v1.4s,   v7.4s
        srshl           v4.8h,   v2.8h,   v6.8h
        mul             v0.4s,   v0.4s,   v31.4s // a * n
        mul             v1.4s,   v1.4s,   v31.4s // a * n
        umull           v3.4s,   v4.4h,   v4.4h  // b * b
        umull2          v4.4s,   v4.8h,   v4.8h  // b * b
        uqsub           v0.4s,   v0.4s,   v3.4s  // imax(a * n - b * b, 0)
        uqsub           v1.4s,   v1.4s,   v4.4s  // imax(a * n - b * b, 0)
        mul             v0.4s,   v0.4s,   v28.4s // p * s
        mul             v1.4s,   v1.4s,   v28.4s // p * s
        ld1             {v8.4s,  v9.4s},  [x5], #32   // preload next iteration
        uqshrn          v0.4h,   v0.4s,   #16
        uqshrn2         v0.8h,   v1.4s,   #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,   v0.8h,   #4     // imin(z, 255)

        ld1             {v12.4s, v13.4s}, [x7], #32

        // Table lookup of x = sgr_x_by_x[z]; the tbl only covers the
        // first 48 (biased) entries, the cmhi results add back the
        // plateau values for larger indices.
        cmhi            v25.8b,  v0.8b,   v20.8b // = -1 if sgr_x_by_x[v0] < 5
        cmhi            v26.8b,  v0.8b,   v21.8b // = -1 if sgr_x_by_x[v0] < 4
        tbl             v1.8b, {v16.16b,  v17.16b, v18.16b}, v0.8b
        cmhi            v27.8b,  v0.8b,   v22.8b // = -1 if sgr_x_by_x[v0] < 3
        cmhi            v4.8b,   v0.8b,   v23.8b // = -1 if sgr_x_by_x[v0] < 2
        ld1             {v14.4s, v15.4s}, [x8], #32
        add             v25.8b,  v25.8b,  v26.8b
        cmhi            v5.8b,   v0.8b,   v24.8b // = -1 if sgr_x_by_x[v0] < 1
        add             v27.8b,  v27.8b,  v4.8b
        ld1             {v20.8h},         [x9], #16
        add             v5.8b,   v5.8b,   v19.8b
        add             v25.8b,  v25.8b,  v27.8b
        ld1             {v21.8h},         [x10], #16
        add             v5.8b,   v1.8b,   v5.8b
        ld1             {v22.8h},         [x11], #16
        add             v5.8b,   v5.8b,   v25.8b
        ld1             {v23.8h},         [x12], #16
        uxtl            v5.8h,   v5.8b           // x

        // AA[i] = (x * BB[i] * one_by_x + (1 << 11)) >> 12
        ld1             {v0.4s,  v1.4s},  [x0], #32
        umull           v3.4s,   v5.4h,   v2.4h  // x * BB[i]
        umull2          v4.4s,   v5.8h,   v2.8h  // x * BB[i]
        mul             v3.4s,   v3.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,   v30.4s // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,   v3.4s,   #12    // AA[i]
        srshr           v4.4s,   v4.4s,   #12    // AA[i]
        sub             v5.8h,   v29.8h,  v5.8h  // 256 - x
        ld1             {v2.8h},          [x1], #16

        st1             {v3.4s, v4.4s}, [x2], #32
        st1             {v5.8h}, [x3], #16
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc