/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define SUM_STRIDE (384+16)

// void dav1d_sgr_box3_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box3_v_neon, export=1
        push            {r4-r9,lr}
        ldr             r4, [sp, #28]
        add             r12, r3, #2             // Number of output rows to move back
        mov             lr, r3                  // Number of input rows to move back
        add             r2, r2, #2              // Actual summed width
        mov             r7, #(4*SUM_STRIDE)     // sumsq stride
        mov             r8, #(2*SUM_STRIDE)     // sum stride
        sub             r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
        sub             r1, r1, #(2*SUM_STRIDE) // sum -= stride

        tst             r4, #4                  // LR_HAVE_TOP
        beq             0f
        // If we have top, read from row -2.
        sub             r5, r0, #(4*SUM_STRIDE)
        sub             r6, r1, #(2*SUM_STRIDE)
        add             lr, lr, #2
        b               1f
0:
        // !LR_HAVE_TOP
        // If we don't have top, read from row 0 even if
        // we start writing to row -1.
        add             r5, r0, #(4*SUM_STRIDE)
        add             r6, r1, #(2*SUM_STRIDE)
1:

        tst             r4, #8                  // LR_HAVE_BOTTOM
        beq             1f
        // LR_HAVE_BOTTOM
        add             r3, r3, #2              // Sum all h+2 lines with the main loop
        add             lr, lr, #2
1:
        mov             r9, r3                  // Backup of h for next loops

1:
        // Start of horizontal loop; start one vertical filter slice.
        // Start loading rows into q8-q13 and q0-q2 taking top
        // padding into consideration.
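        // For orientation, a rough C sketch of what one 8-column slice of
        // this vertical pass computes (illustrative only; the array names
        // are ad hoc, the real code works in place on the sumsq/sum buffers,
        // and the LR_HAVE_TOP/LR_HAVE_BOTTOM edge padding is omitted here):
        //     for (int y = -1; y < h + 1; y++)
        //         for (int x = 0; x < 8; x++) {
        //             sumsq_out[y][x] = sumsq_in[y-1][x] + sumsq_in[y][x] + sumsq_in[y+1][x];
        //             sum_out[y][x]   = sum_in[y-1][x]   + sum_in[y][x]   + sum_in[y+1][x];
        //         }
        // Three consecutive sumsq rows live in q8-q13 (two q registers per
        // row) and the matching sum rows in q0-q2; the vertical loop below
        // rotates them down by one row per iteration.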
        tst             r4, #4                  // LR_HAVE_TOP
        vld1.32         {q8, q9}, [r5, :128], r7
        vld1.16         {q0}, [r6, :128], r8
        beq             2f
        // LR_HAVE_TOP
        vld1.32         {q10, q11}, [r5, :128], r7
        vld1.16         {q1}, [r6, :128], r8
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q2}, [r6, :128], r8
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q10, q8
        vmov            q11, q9
        vmov            q1, q0
        vmov            q12, q8
        vmov            q13, q9
        vmov            q2, q0

3:
        subs            r3, r3, #1
.macro add3
        vadd.i32        q8, q8, q10
        vadd.i32        q9, q9, q11
        vadd.i16        q0, q0, q1
        vadd.i32        q8, q8, q12
        vadd.i32        q9, q9, q13
        vadd.i16        q0, q0, q2
        vst1.32         {q8, q9}, [r0, :128], r7
        vst1.16         {q0}, [r1, :128], r8
.endm
        add3
        vmov            q8, q10
        vmov            q9, q11
        vmov            q0, q1
        vmov            q10, q12
        vmov            q11, q13
        vmov            q1, q2
        ble             4f
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q2}, [r6, :128], r8
        b               3b

4:
        tst             r4, #8                  // LR_HAVE_BOTTOM
        bne             5f
        // !LR_HAVE_BOTTOM
        // Produce two more rows, extending the already loaded rows.
        add3
        vmov            q8, q10
        vmov            q9, q11
        vmov            q0, q1
        add3

5:      // End of one vertical slice.
        subs            r2, r2, #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
        // Input pointers
        mls             r5, r7, lr, r5
        mls             r6, r8, lr, r6
        // Output pointers
        mls             r0, r7, r12, r0
        mls             r1, r8, r12, r1
        add             r0, r0, #32
        add             r1, r1, #16
        add             r5, r5, #32
        add             r6, r6, #16
        mov             r3, r9
        b               1b

0:
        pop             {r4-r9,pc}
.purgem add3
endfunc

// void dav1d_sgr_box5_v_neon(int32_t *sumsq, int16_t *sum,
//                            const int w, const int h,
//                            const enum LrEdgeFlags edges);
function sgr_box5_v_neon, export=1
        push            {r4-r9,lr}
        vpush           {q5-q7}
        ldr             r4, [sp, #76]
        add             r12, r3, #2             // Number of output rows to move back
        mov             lr, r3                  // Number of input rows to move back
        add             r2, r2, #8              // Actual summed width
        mov             r7, #(4*SUM_STRIDE)     // sumsq stride
        mov             r8, #(2*SUM_STRIDE)     // sum stride
        sub             r0, r0, #(4*SUM_STRIDE) // sumsq -= stride
        sub             r1, r1, #(2*SUM_STRIDE) // sum -= stride

        tst             r4, #4                  // LR_HAVE_TOP
        beq             0f
        // If we have top, read from row -2.
        sub             r5, r0, #(4*SUM_STRIDE)
        sub             r6, r1, #(2*SUM_STRIDE)
        add             lr, lr, #2
        b               1f
0:
        // !LR_HAVE_TOP
        // If we don't have top, read from row 0 even if
        // we start writing to row -1.
        add             r5, r0, #(4*SUM_STRIDE)
        add             r6, r1, #(2*SUM_STRIDE)
1:

        tst             r4, #8                  // LR_HAVE_BOTTOM
        beq             0f
        // LR_HAVE_BOTTOM
        add             r3, r3, #2              // Handle h+2 lines with the main loop
        add             lr, lr, #2
        b               1f
0:
        // !LR_HAVE_BOTTOM
        sub             r3, r3, #1              // Handle h-1 lines with the main loop
1:
        mov             r9, r3                  // Backup of h for next loops

1:
        // Start of horizontal loop; start one vertical filter slice.
        // Start loading rows into q6-q15 and q0-q3,q5 taking top
        // padding into consideration.
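        // For orientation, a rough C sketch of one 8-column slice of this
        // vertical pass (illustrative only; ad hoc array names, in-place
        // operation and edge padding omitted). Each stored row is the sum of
        // five consecutive input rows, and the window steps down two rows
        // per stored row:
        //     for (int y = -1; y < h + 1; y += 2)
        //         for (int x = 0; x < 8; x++) {
        //             sumsq_out[y][x] = sumsq_in[y-2][x] + sumsq_in[y-1][x] +
        //                               sumsq_in[y][x]   + sumsq_in[y+1][x] +
        //                               sumsq_in[y+2][x];
        //             sum_out[y][x]   = sum_in[y-2][x] + sum_in[y-1][x] +
        //                               sum_in[y][x]   + sum_in[y+1][x] +
        //                               sum_in[y+2][x];
        //         }
        // Five consecutive sumsq rows live in q6-q15 (two q registers per
        // row) and the matching sum rows in q0-q3 and q5; the shift2 macro
        // below rotates the window down by two rows.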
        tst             r4, #4                  // LR_HAVE_TOP
        vld1.32         {q6, q7}, [r5, :128], r7
        vld1.16         {q0}, [r6, :128], r8
        beq             2f
        // LR_HAVE_TOP
        vld1.32         {q10, q11}, [r5, :128], r7
        vld1.16         {q2}, [r6, :128], r8
        vmov            q8, q6
        vmov            q9, q7
        vmov            q1, q0
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3}, [r6, :128], r8
        b               3f
2:      // !LR_HAVE_TOP
        vmov            q8, q6
        vmov            q9, q7
        vmov            q1, q0
        vmov            q10, q6
        vmov            q11, q7
        vmov            q2, q0
        vmov            q12, q6
        vmov            q13, q7
        vmov            q3, q0

3:
        cmp             r3, #0
        beq             4f
        vld1.32         {q14, q15}, [r5, :128], r7
        vld1.16         {q5}, [r6, :128], r8

3:
        // Start of vertical loop
        subs            r3, r3, #2
.macro add5
        vadd.i32        q6, q6, q8
        vadd.i32        q7, q7, q9
        vadd.i16        q0, q0, q1
        vadd.i32        q6, q6, q10
        vadd.i32        q7, q7, q11
        vadd.i16        q0, q0, q2
        vadd.i32        q6, q6, q12
        vadd.i32        q7, q7, q13
        vadd.i16        q0, q0, q3
        vadd.i32        q6, q6, q14
        vadd.i32        q7, q7, q15
        vadd.i16        q0, q0, q5
        vst1.32         {q6, q7}, [r0, :128], r7
        vst1.16         {q0}, [r1, :128], r8
.endm
        add5
.macro shift2
        vmov            q6, q10
        vmov            q7, q11
        vmov            q0, q2
        vmov            q8, q12
        vmov            q9, q13
        vmov            q1, q3
        vmov            q10, q14
        vmov            q11, q15
        vmov            q2, q5
.endm
        shift2
        add             r0, r0, r7
        add             r1, r1, r8
        ble             5f
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3}, [r6, :128], r8
        vld1.32         {q14, q15}, [r5, :128], r7
        vld1.16         {q5}, [r6, :128], r8
        b               3b

4:
        // h == 1, !LR_HAVE_BOTTOM.
        // Pad the last row with the only content row, and add.
        vmov            q14, q12
        vmov            q15, q13
        vmov            q5, q3
        add5
        shift2
        add             r0, r0, r7
        add             r1, r1, r8
        add5
        b               6f

5:
        tst             r4, #8                  // LR_HAVE_BOTTOM
        bne             6f
        // !LR_HAVE_BOTTOM
        cmp             r3, #0
        bne             5f
        // The intended three edge rows are left; output the one at h-2 and
        // the past-edge one at h.
        vld1.32         {q12, q13}, [r5, :128], r7
        vld1.16         {q3}, [r6, :128], r8
        // Pad the past-edge row from the last content row.
        vmov            q14, q12
        vmov            q15, q13
        vmov            q5, q3
        add5
        shift2
        add             r0, r0, r7
        add             r1, r1, r8
        // The last two rows are already padded properly here.
        add5
        b               6f

5:
        // r3 == -1, two rows left, output one.
        // Pad the last two rows from the mid one.
        vmov            q12, q10
        vmov            q13, q11
        vmov            q3, q2
        vmov            q14, q10
        vmov            q15, q11
        vmov            q5, q2
        add5
        add             r0, r0, r7
        add             r1, r1, r8
        b               6f

6:      // End of one vertical slice.
        subs            r2, r2, #8
        ble             0f
        // Move pointers back up to the top and loop horizontally.
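        // A note on the rewind arithmetic below: mls Rd, Rn, Rm, Ra computes
        // Rd = Ra - Rn*Rm, so each mls moves a pointer back up by
        // (rows processed) * (row stride in bytes). The immediate adds that
        // follow (#32 for the 32-bit sumsq rows, #16 for the 16-bit sum rows)
        // then step 8 columns to the right for the next slice.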
        // Input pointers
        mls             r5, r7, lr, r5
        mls             r6, r8, lr, r6
        // Output pointers
        mls             r0, r7, r12, r0
        mls             r1, r8, r12, r1
        add             r0, r0, #32
        add             r1, r1, #16
        add             r5, r5, #32
        add             r6, r6, #16
        mov             r3, r9
        b               1b

0:
        vpop            {q5-q7}
        pop             {r4-r9,pc}
.purgem add5
endfunc

// void dav1d_sgr_calc_ab1_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
// void dav1d_sgr_calc_ab2_neon(int32_t *a, int16_t *b,
//                              const int w, const int h, const int strength,
//                              const int bitdepth_max);
function sgr_calc_ab1_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldrd            r4, r5, [sp, #84]
        add             r3, r3, #2              // h += 2
        clz             r6, r5
        vmov.i32        q15, #9                 // n
        movw            r5, #455
        mov             lr, #SUM_STRIDE
        b               sgr_calc_ab_neon
endfunc

function sgr_calc_ab2_neon, export=1
        push            {r4-r7,lr}
        vpush           {q4-q7}
        ldrd            r4, r5, [sp, #84]
        add             r3, r3, #3              // h += 3
        clz             r6, r5
        asr             r3, r3, #1              // h /= 2
        vmov.i32        q15, #25                // n
        mov             r5, #164
        mov             lr, #(2*SUM_STRIDE)
endfunc

function sgr_calc_ab_neon
        movrel          r12, X(sgr_x_by_x)
        sub             r6, r6, #24             // -bitdepth_min_8
        vld1.8          {q8, q9}, [r12, :128]!
        add             r7, r6, r6              // -2*bitdepth_min_8
        vmov.i8         q11, #5
        vmov.i8         d10, #55                // idx of last 5
        vld1.8          {q10}, [r12, :128]
        vmov.i8         d11, #72                // idx of last 4
        vmov.i8         d12, #101               // idx of last 3
        vmov.i8         d13, #169               // idx of last 2
        vmov.i8         d14, #254               // idx of last 1
        vmov.i8         d15, #32                // elements consumed in first vtbl
        add             r2, r2, #2              // w += 2
        add             r12, r2, #7
        bic             r12, r12, #7            // aligned w
        sub             r12, lr, r12            // increment between rows
        vdup.32         q12, r4
        sub             r0, r0, #(4*(SUM_STRIDE))
        sub             r1, r1, #(2*(SUM_STRIDE))
        mov             r4, r2                  // backup of w
        vsub.i8         q8, q8, q11
        vsub.i8         q9, q9, q11
        vsub.i8         q10, q10, q11
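        // For orientation, a rough C sketch of what the loop below computes
        // per element (illustrative only; exact rounding/saturation follows
        // the instructions rather than this sketch; sumsq/sum stand for the
        // rows pointed to by r0/r1, s is the strength, and n/one_by_x are
        // 9/455 or 25/164 as set up by the entry points above):
        //     a = sumsq[i] >> (2*bitdepth_min_8);  // rounding shift
        //     b = sum[i]   >> bitdepth_min_8;      // rounding shift
        //     p = max(a*n - b*b, 0);
        //     z = min((p*s) >> 20, 255);           // rounded
        //     x = sgr_x_by_x[z];                   // via the vcgt/vtbl sequence
        //     sumsq[i] = (x*sum[i]*one_by_x + (1 << 11)) >> 12;  // AA[i]
        //     sum[i]   = 256 - x;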
1:
        vld1.32         {q0, q1}, [r0, :128]    // a
        vld1.16         {q2}, [r1, :128]        // b
        vdup.32         q13, r7                 // -2*bitdepth_min_8
        vdup.16         q14, r6                 // -bitdepth_min_8
        subs            r2, r2, #8
        vrshl.s32       q0, q0, q13
        vrshl.s32       q1, q1, q13
        vrshl.s16       q4, q2, q14
        vmul.i32        q0, q0, q15             // a * n
        vmul.i32        q1, q1, q15             // a * n
        vmull.u16       q3, d8, d8              // b * b
        vmull.u16       q4, d9, d9              // b * b
        vqsub.u32       q0, q0, q3              // imax(a * n - b * b, 0)
        vqsub.u32       q1, q1, q4              // imax(a * n - b * b, 0)
        vmul.i32        q0, q0, q12             // p * s
        vmul.i32        q1, q1, q12             // p * s
        vqshrn.u32      d0, q0, #16
        vqshrn.u32      d1, q1, #16
        vqrshrn.u16     d0, q0, #4              // imin(z, 255)

        vcgt.u8         d2, d0, d10             // = -1 if sgr_x_by_x[d0] < 5
        vcgt.u8         d3, d0, d11             // = -1 if sgr_x_by_x[d0] < 4
        vtbl.8          d1, {q8, q9}, d0
        vcgt.u8         d6, d0, d12             // = -1 if sgr_x_by_x[d0] < 3
        vsub.i8         d9, d0, d15             // indices for vtbx
        vcgt.u8         d7, d0, d13             // = -1 if sgr_x_by_x[d0] < 2
        vadd.i8         d2, d2, d3
        vtbx.8          d1, {q10}, d9
        vcgt.u8         d8, d0, d14             // = -1 if sgr_x_by_x[d0] < 1
        vadd.i8         d6, d6, d7
        vadd.i8         d8, d8, d22
        vadd.i8         d2, d2, d6
        vadd.i8         d1, d1, d8
        vadd.i8         d1, d1, d2
        vmovl.u8        q0, d1                  // x

        vmov.i16        q13, #256
        vdup.32         q14, r5                 // one_by_x

        vmull.u16       q1, d0, d4              // x * BB[i]
        vmull.u16       q2, d1, d5              // x * BB[i]
        vmul.i32        q1, q1, q14             // x * BB[i] * sgr_one_by_x
        vmul.i32        q2, q2, q14             // x * BB[i] * sgr_one_by_x
        vrshr.s32       q1, q1, #12             // AA[i]
        vrshr.s32       q2, q2, #12             // AA[i]
        vsub.i16        q0, q13, q0             // 256 - x

        vst1.32         {q1, q2}, [r0, :128]!
        vst1.16         {q0}, [r1, :128]!
        bgt             1b

        subs            r3, r3, #1
        ble             0f
        add             r0, r0, r12, lsl #2
        add             r1, r1, r12, lsl #1
        mov             r2, r4
        b               1b
0:
        vpop            {q4-q7}
        pop             {r4-r7,pc}
endfunc