/*
 * Copyright © 2021, VideoLAN and dav1d authors
 * Copyright © 2021, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_splat_mv_neon(refmvs_block **rr, const refmvs_block *rmv,
//                          int bx4, int bw4, int bh4)
// (A rough C model of this routine is sketched at the end of this file.)

function splat_mv_neon, export=1
        ld1             {v3.16b}, [x1]
        clz             w3,  w3
        adr             x5,  L(splat_tbl)
        sub             w3,  w3,  #26
        ext             v2.16b,  v3.16b,  v3.16b,  #12
        ldrh            w3,  [x5, w3, uxtw #1]
        add             w2,  w2,  w2,  lsl #1          // bx4 * 3
        ext             v0.16b,  v2.16b,  v3.16b,  #4
        sub             x3,  x5,  w3,  uxtw
        ext             v1.16b,  v2.16b,  v3.16b,  #8
        lsl             w2,  w2,  #2                   // bx4 * 12
        ext             v2.16b,  v2.16b,  v3.16b,  #12
1:
        ldr             x1,  [x0],  #8
        subs            w4,  w4,  #1
        add             x1,  x1,  x2
        br              x3

10:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b}, [x1]
        str             s2,  [x1, #8]
        b.gt            1b
        ret
20:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x1]
        str             d1,  [x1, #16]
        b.gt            1b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
160:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
80:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1], #48
40:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b, v1.16b, v2.16b}, [x1]
        b.gt            1b
        ret

L(splat_tbl):
        .hword L(splat_tbl) - 320b
        .hword L(splat_tbl) - 160b
        .hword L(splat_tbl) -  80b
        .hword L(splat_tbl) -  40b
        .hword L(splat_tbl) -  20b
        .hword L(splat_tbl) -  10b
endfunc

// Each 16-byte row is a tbl permutation producing the packed 5-byte
// (mv, ref) output pattern: row 0 zeroes everything (invalid case),
// row 1 picks mv[0]/ref[0], rows 2 and 3 pick mv[1]/ref[1].
const mv_tbls, align=4
        .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
        .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
        .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst

// Weights giving mv[0] the value 1 and mv[1] the value 2, so the two
// per-mv conditions combine into a 0-3 case index into mv_tbls.
const mask_mult, align=4
        .byte           1, 2, 1, 2, 0, 0, 0, 0
endconst

// void dav1d_save_tmvs_neon(refmvs_temporal_block *rp, ptrdiff_t stride,
//                           refmvs_block **rr, const uint8_t *ref_sign,
//                           int col_end8, int row_end8,
//                           int col_start8, int row_start8)
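//
// For reference, a rough C model of the routine below. This is a hedged
// sketch, not dav1d's exact reference implementation: usable() and
// bw8_from_bs() are hypothetical helpers, and the field names are
// assumptions matching the 12-byte refmvs_block and packed 5-byte
// refmvs_temporal_block layouts that the assembly relies on.
//
//     for (int y = row_start8; y < row_end8; y++) {
//         const refmvs_block *b = rr[(y & 15) * 2];
//         for (int x = col_start8; x < col_end8;) {
//             const refmvs_block *cand_b = &b[x * 2 + 1];
//             int bw8 = bw8_from_bs(cand_b->bs); // block width in 8px
//                                                // units, L(save_tmvs_tbl)
//             // usable(cand_b, n, ref_sign): ref.ref[n] selects a nonzero
//             // ref_sign entry and both components of mv.mv[n] have
//             // abs(v) < 4096. mv[1] is preferred over mv[0].
//             mv tmv; int ref;
//             if (usable(cand_b, 1, ref_sign))
//                 tmv = cand_b->mv.mv[1], ref = cand_b->ref.ref[1];
//             else if (usable(cand_b, 0, ref_sign))
//                 tmv = cand_b->mv.mv[0], ref = cand_b->ref.ref[0];
//             else
//                 tmv.n = 0, ref = 0; // invalid; mv_tbls row 0
//             for (int n = 0; n < bw8; n++, x++)
//                 rp[x] = (refmvs_temporal_block) { .mv = tmv, .ref = ref };
//         }
//         rp += stride; // stride is in 5-byte temporal block units
//     }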
function save_tmvs_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        movi            v30.8b,  #0
        ld1             {v31.8b}, [x3]
        adr             x8,  L(save_tmvs_tbl)
        movrel          x16, mask_mult
        movrel          x13, mv_tbls
        ld1             {v29.8b}, [x16]
        ext             v31.8b,  v30.8b,  v31.8b,  #7 // [0, ref_sign]
        mov             w15, #5
        mov             w14, #12*2
        sxtw            x4,  w4
        sxtw            x6,  w6
        mul             w1,  w1,  w15          // stride *= 5
        sub             w5,  w5,  w7           // h = row_end8 - row_start8
        lsl             w7,  w7,  #1           // row_start8 <<= 1
1:
        mov             w15, #5
        and             w9,  w7,  #30          // (y & 15) * 2
        ldr             x9,  [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
        add             x9,  x9,  #12          // &b[... + 1]
        madd            x10, x4,  x14, x9      // end_cand_b = &b[col_end8*2 + 1]
        madd            x9,  x6,  x14, x9      // cand_b = &b[x*2 + 1]

        madd            x3,  x6,  x15, x0      // &rp[x]

2:
        ldrb            w11, [x9, #10]         // cand_b->bs
        ld1             {v0.16b}, [x9]         // cand_b->mv
        add             x11, x8,  w11, uxtw #2
        ldr             h1,  [x9, #8]          // cand_b->ref
        ldrh            w12, [x11]             // bw8
        mov             x15, x8                // Dummy table pointer, in case no second block is loaded
        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
        cmp             x9,  x10
        mov             v2.8b,  v0.8b
        b.ge            3f

        ldrb            w15, [x9, #10]         // cand_b->bs
        add             x16, x9,  #8
        ld1             {v4.16b}, [x9]         // cand_b->mv
        add             x15, x8,  w15, uxtw #2
        ld1             {v1.h}[1], [x16]       // cand_b->ref
        ldrh            w12, [x15]             // bw8
        add             x9,  x9,  w12, uxtw #1 // cand_b += bw8*2
        trn1            v2.2d,  v0.2d,  v4.2d

3:
        abs             v2.8h,  v2.8h          // abs(mv[].xy)
        tbl             v1.8b, {v31.16b}, v1.8b // ref_sign[ref]
        ushr            v2.8h,  v2.8h,  #12    // abs(mv[].xy) >> 12
        umull           v1.8h,  v1.8b,  v29.8b // ref_sign[ref] * {1, 2}
        cmeq            v2.4s,  v2.4s,  #0     // abs(mv[].xy) < 4096
        xtn             v2.4h,  v2.4s          // abs() condition to 16 bit
        and             v1.8b,  v1.8b,  v2.8b  // h[0-3] contains conditions for mv[0-1]
        addp            v1.4h,  v1.4h,  v1.4h  // Combine the conditions for mv[1] and mv[0]
        umov            w16, v1.h[0]           // Extract case for first block
        umov            w17, v1.h[1]
        ldrh            w11, [x11, #2]         // Fetch jump table entry
        ldrh            w15, [x15, #2]
        ldr             q1,  [x13, w16, uxtw #4] // Load permutation table based on case
        ldr             q5,  [x13, w17, uxtw #4]
        sub             x11, x8,  w11, uxtw    // Find jump table target
        sub             x15, x8,  w15, uxtw
        tbl             v0.16b, {v0.16b}, v1.16b // Permute cand_b to output refmvs_temporal_block
        tbl             v4.16b, {v4.16b}, v5.16b

        // v1 follows on v0, with another 3 full repetitions of the pattern.
        ext             v1.16b,  v0.16b,  v0.16b,  #1
        ext             v5.16b,  v4.16b,  v4.16b,  #1
        // v2 ends with 3 complete repetitions of the pattern.
        ext             v2.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v4.16b,  v5.16b,  #4

        blr             x11
        b.ge            4f                     // if (cand_b >= end)
        mov             v0.16b,  v4.16b
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        cmp             x9,  x10
        blr             x15
        b.lt            2b                     // if (cand_b < end)

4:
        subs            w5,  w5,  #1           // h--
        add             w7,  w7,  #2           // y += 2
        add             x0,  x0,  x1           // rp += stride
        b.gt            1b

        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

10:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
20:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
40:
        AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1,  [x3, #16]
        add             x3,  x3,  #4*5
        ret
80:
        AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
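        // 8 entries * 5 bytes = 40 bytes in total. The two st1 above
        // covered bytes 0-31; this 16-byte store covers bytes 24-39.
        // The overlap is harmless: 24 % 5 == 4 matches where v2 sits in
        // the repeating 5-byte pattern, so bytes 24-31 are rewritten
        // with identical values.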
        stur            q2,  [x3, #(8*5-16)]
        add             x3,  x3,  #8*5
        ret
160:
        AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write another 6 full entries, slightly overlapping with the first set
        st1             {v0.16b, v1.16b}, [x16]
        // Write 8 bytes (one full entry) after the first 12
        st1             {v0.8b}, [x17]
        // Write the last 3 entries
        str             q2,  [x3, #(16*5-16)]
        add             x3,  x3,  #16*5
        ret

L(save_tmvs_tbl):
        .hword          16 * 12
        .hword          L(save_tmvs_tbl) - 160b
        .hword          16 * 12
        .hword          L(save_tmvs_tbl) - 160b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          8 * 12
        .hword          L(save_tmvs_tbl) -  80b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          4 * 12
        .hword          L(save_tmvs_tbl) -  40b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          2 * 12
        .hword          L(save_tmvs_tbl) -  20b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
        .hword          1 * 12
        .hword          L(save_tmvs_tbl) -  10b
endfunc
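
// For reference, a rough C model of splat_mv_neon at the top of this file.
// This is a hedged sketch rather than dav1d's exact reference code; it
// assumes refmvs_block is the 12-byte structure that the bx4*12 addressing
// in the assembly implies.
//
//     void splat_mv(refmvs_block **rr, const refmvs_block *rmv,
//                   int bx4, int bw4, int bh4)
//     {
//         do {
//             refmvs_block *r = *rr++ + bx4;
//             for (int x = 0; x < bw4; x++)
//                 r[x] = *rmv;
//         } while (--bh4);
//     }
//
// The NEON version replicates the 12-byte block across v0-v2 (48 bytes,
// the least common multiple of 12 and 16) and dispatches on bw4 through
// L(splat_tbl) to a store sequence that writes bw4*12 bytes per row.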