/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
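//
// A rough C equivalent of this routine (a sketch, not the actual dav1d C
// reference; it assumes refmvs_block is a packed 12-byte struct, which is why
// bx4 is scaled by 12 below):
//
//     static void splat_mv(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
//     {
//         do {
//             refmvs_block *r = *rr++ + bx4;  // this row, starting at column bx4
//             for (int x = 0; x < bw4; x++)
//                 r[x] = *rmv;                // replicate the 12-byte block bw4 times
//         } while (--bh4);
//     }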

function splat_mv_neon, export=1
        ld1             {v3.16b},  [x1]
        clz             w3,  w3
        adr             x5,  L(splat_tbl)
        sub             w3,  w3,  #26
        ext             v2.16b,  v3.16b,  v3.16b,  #12
        ldrh            w3,  [x5, w3, uxtw #1]
        add             w2,  w2,  w2,  lsl #1
        ext             v0.16b,  v2.16b,  v3.16b,  #4
        sub             x3,  x5,  w3, uxtw
        ext             v1.16b,  v2.16b,  v3.16b,  #8
        lsl             w2,  w2,  #2
        ext             v2.16b,  v2.16b,  v3.16b,  #12
1:
        ldr             x1,  [x0],  #8
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        br              x3

10:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2,  [x1, #8]
        b.gt            1b
        ret
20:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1,  [x1, #16]
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret

L(splat_tbl):
        .hword L(splat_tbl) -  320b
        .hword L(splat_tbl) -  160b
        .hword L(splat_tbl) -   80b
        .hword L(splat_tbl) -   40b
        .hword L(splat_tbl) -   20b
        .hword L(splat_tbl) -   10b
endfunc

const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst
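
// How the two constants above are used by save_tmvs_neon below: for each
// candidate block, ref_sign[ref] is multiplied by mask_mult ({1, 2}) and
// masked with the motion vector range check, and the two products are summed
// into a case value of 0-3. That value selects a 16-byte row of mv_tbls used
// as a TBL permutation to build the packed 5-byte output entries: row 0 (all
// 255) yields zeros for an invalid candidate, row 1 picks mv[0]/ref[0]
// (source bytes 0-3 and 8), and rows 2-3 pick mv[1]/ref[1] (source bytes 4-7
// and 9), so mv[1] takes precedence when both references qualify.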

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
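//
// A rough C sketch of the operation (inferred from the code below; the struct
// layouts and the helpers is_valid(), block_width8(), make_entry() and
// zero_entry() are illustrative assumptions, not the actual dav1d API). rp
// points to packed 5-byte { int16_t mv[2]; uint8_t ref; } entries and stride
// is counted in such entries; each input refmvs_block is 12 bytes with mv at
// offset 0, ref at offset 8 and bs at offset 10.
//
//     for (int y = row_start8; y < row_end8; y++, rp += stride) {
//         const refmvs_block *cand_b = &rr[(y & 15) * 2][col_start8 * 2 + 1];
//         for (int x = col_start8; x < col_end8;) {
//             int bw8 = block_width8(cand_b->bs);          // from save_tmvs_tbl
//             int i = is_valid(cand_b, 1, ref_sign) ? 1 :  // prefer mv[1]
//                     is_valid(cand_b, 0, ref_sign) ? 0 : -1;
//             for (int n = 0; n < bw8; n++, x++)
//                 rp[x] = i < 0 ? zero_entry() : make_entry(cand_b, i);
//             cand_b += bw8 * 2;
//         }
//     }
//
// where is_valid() checks ref[i] > 0, ref_sign[ref[i] - 1] != 0 and
// |mv[i].x|, |mv[i].y| < 4096.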
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b,  #0
        ld1             {v31.8b}, [x3]
        adr             x8,  L(save_tmvs_tbl)
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4,  w4
        sxtw            x6,  w6
        mul             w1,  w1,  w15             // stride *= 5
        sub             w5,  w5,  w7              // h = row_end8 - row_start8
        lsl             w7,  w7,  #1              // row_start8 <<= 1
1:
        mov             w15, #5
        and             w9,  w7,  #30             // (y & 15) * 2
        ldr             x9,  [x2, w9, uxtw #3]    // b = rr[(y & 15) * 2]
        add             x9,  x9,  #12             // &b[... + 1]
        madd            x10, x4,  x14,  x9        // end_cand_b = &b[col_end8*2 + 1]
        madd            x9,  x6,  x14,  x9        // cand_b = &b[x*2 + 1]

        madd            x3,  x6,  x15,  x0        // &rp[x]

2:
        ldrb            w11, [x9, #10]            // cand_b->bs
        ld1             {v0.16b}, [x9]            // cand_b->mv
        add             x11, x8,  w11, uxtw #2
        ldr             h1,  [x9, #8]             // cand_b->ref
        ldrh            w12, [x11]                // bw8
        mov             x15, x8
        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
        cmp             x9,  x10
        mov             v2.8b,   v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]            // cand_b->bs
        add             x16, x9,  #8
        ld1             {v4.16b}, [x9]            // cand_b->mv
        add             x15, x8,  w15, uxtw #2
        ld1             {v1.h}[1], [x16]          // cand_b->ref
        ldrh            w12, [x15]                // bw8
        add             x9,  x9,  w12, uxtw #1    // cand_b += bw8*2
        trn1            v2.2d,   v0.2d,   v4.2d

3:
        abs             v2.8h,   v2.8h            // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b   // ref_sign[ref]
        ushr            v2.8h,   v2.8h,   #12     // abs(mv[].xy) >> 12
        umull           v1.8h,   v1.8b,   v29.8b  // ref_sign[ref] * {1, 2}
        cmeq            v2.4s,   v2.4s,   #0      // abs(mv[].xy) < 4096
        xtn             v2.4h,   v2.4s            // abs() condition to 16 bit
        and             v1.8b,   v1.8b,   v2.8b   // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h,   v1.4h,   v1.4h   // Combine condition for [1] and [0]
        umov            w16, v1.h[0]              // Extract case for first block
        umov            w17, v1.h[1]
        ldrh            w11, [x11, #2]            // Fetch jump table entry
        ldrh            w15, [x15, #2]
        ldr             q1, [x13, w16, uxtw #4]   // Load permutation table based on case
        ldr             q5, [x13, w17, uxtw #4]
        sub             x11, x8,  w11, uxtw       // Find jump table target
        sub             x15, x8,  w15, uxtw
        tbl             v0.16b, {v0.16b}, v1.16b  // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b,  v0.16b,  v0.16b,  #1
        ext             v5.16b,  v4.16b,  v4.16b,  #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v4.16b,  v5.16b,  #4

        blr             x11
        b.ge            4f  // if (cand_b >= end)
        mov             v0.16b,  v4.16b
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        cmp             x9,  x10
        blr             x15
        b.lt            2b  // if (cand_b < end)

4:
        subs            w5,  w5,  #1              // h--
        add             w7,  w7,  #2              // y += 2
        add             x0,  x0,  x1              // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

10:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
20:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
40:
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1, [x3, #16]
        add             x3,  x3,  #4*5
        ret
80:
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
        stur            q2, [x3, #(8*5-16)]
        add             x3,  x3,  #8*5
        ret
160:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2, [x3, #(16*5-16)]
        add             x3,  x3,  #16*5
        ret

L(save_tmvs_tbl):
        .hword 16 * 12
        .hword L(save_tmvs_tbl) - 160b
        .hword 16 * 12
        .hword L(save_tmvs_tbl) - 160b
        .hword 8 * 12
        .hword L(save_tmvs_tbl) -  80b
        .hword 8 * 12
        .hword L(save_tmvs_tbl) -  80b
        .hword 8 * 12
        .hword L(save_tmvs_tbl) -  80b
        .hword 8 * 12
        .hword L(save_tmvs_tbl) -  80b
        .hword 4 * 12
        .hword L(save_tmvs_tbl) -  40b
        .hword 4 * 12
        .hword L(save_tmvs_tbl) -  40b
        .hword 4 * 12
        .hword L(save_tmvs_tbl) -  40b
        .hword 4 * 12
        .hword L(save_tmvs_tbl) -  40b
        .hword 2 * 12
        .hword L(save_tmvs_tbl) -  20b
        .hword 2 * 12
        .hword L(save_tmvs_tbl) -  20b
        .hword 2 * 12
        .hword L(save_tmvs_tbl) -  20b
        .hword 2 * 12
        .hword L(save_tmvs_tbl) -  20b
        .hword 2 * 12
        .hword L(save_tmvs_tbl) -  20b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
        .hword 1 * 12
        .hword L(save_tmvs_tbl) -  10b
endfunc