/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
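//
// Vertical pass of the SGR box3 filter: the three rows of box sums in
// sumsq/sum are added and turned into the A/B coefficients. A rough
// per-pixel sketch of the math, matching the instruction comments in
// the loop below (all shifts are rounding ones; the z step is only
// approximated here, as the code uses two saturating narrowing shifts):
//
//   a  = (sumsq[0][i] + sumsq[1][i] + sumsq[2][i]) >> 2*bitdepth_min_8
//   bs =  sum[0][i]   + sum[1][i]   + sum[2][i]   // "BB[i]" in the
//                                                 // comments below
//   b  = bs >> bitdepth_min_8
//   p  = imax(a*9 - b*b, 0)
//   z  = imin(p*s >> 20, 255)
//   x  = sgr_x_by_x[z]
//   AA[i] = (x*bs*455 + (1 << 11)) >> 12          // 455 = one_by_x
//   BB[i] = 256 - x
//
// The 256-byte sgr_x_by_x table is looked up with a single tbl over its
// first 48 entries, biased down by 5; the bias is added back
// unconditionally, so out-of-range indices (where tbl returns 0) yield
// 5, which is then corrected down by 1 for each last-occurrence index
// (55, 72, 101, 169, 254) that z exceeds.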
function sgr_box3_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]

        add             w4,  w4,  #2
        clz             w9,  w6                  // bitdepth_max
        dup             v28.4s,  w5              // strength

        ldp             x5,  x6,  [x0]
        ldr             x0,  [x0, #16]
        ldp             x7,  x8,  [x1]
        ldr             x1,  [x1, #16]

        movi            v31.4s,  #9              // n

        sub             w9,  w9,  #24            // -bitdepth_min_8
        movrel          x12, X(sgr_x_by_x)
        mov             w13, #455                // one_by_x
        ld1             {v16.16b, v17.16b, v18.16b}, [x12]
        dup             v6.8h,   w9              // -bitdepth_min_8
        movi            v19.16b, #5
        movi            v20.8b,  #55             // idx of last 5
        movi            v21.8b,  #72             // idx of last 4
        movi            v22.8b,  #101            // idx of last 3
        movi            v23.8b,  #169            // idx of last 2
        movi            v24.8b,  #254            // idx of last 1
        saddl           v7.4s,   v6.4h,  v6.4h   // -2*bitdepth_min_8
        movi            v29.8h,  #1, lsl #8
        dup             v30.4s,  w13             // one_by_x

        sub             v16.16b, v16.16b, v19.16b
        sub             v17.16b, v17.16b, v19.16b
        sub             v18.16b, v18.16b, v19.16b

        ld1             {v8.4s, v9.4s},   [x5], #32
        ld1             {v10.4s, v11.4s}, [x6], #32
        ld1             {v12.8h}, [x7], #16
        ld1             {v13.8h}, [x8], #16
        ld1             {v0.4s, v1.4s},   [x0], #32
        ld1             {v2.8h},  [x1], #16
1:

        add             v8.4s,   v8.4s,  v10.4s
        add             v9.4s,   v9.4s,  v11.4s

        add             v12.8h,  v12.8h, v13.8h

        subs            w4,  w4,  #8
        add             v0.4s,   v0.4s,  v8.4s
        add             v1.4s,   v1.4s,  v9.4s
        add             v2.8h,   v2.8h,  v12.8h

        srshl           v0.4s,   v0.4s,  v7.4s
        srshl           v1.4s,   v1.4s,  v7.4s
        srshl           v4.8h,   v2.8h,  v6.8h
        mul             v0.4s,   v0.4s,  v31.4s  // a * n
        mul             v1.4s,   v1.4s,  v31.4s  // a * n
        umull           v3.4s,   v4.4h,  v4.4h   // b * b
        umull2          v4.4s,   v4.8h,  v4.8h   // b * b
        uqsub           v0.4s,   v0.4s,  v3.4s   // imax(a * n - b * b, 0)
        uqsub           v1.4s,   v1.4s,  v4.4s   // imax(a * n - b * b, 0)
        mul             v0.4s,   v0.4s,  v28.4s  // p * s
        mul             v1.4s,   v1.4s,  v28.4s  // p * s
        ld1             {v8.4s, v9.4s},   [x5], #32
        uqshrn          v0.4h,   v0.4s,  #16
        uqshrn2         v0.8h,   v1.4s,  #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,   v0.8h,  #4      // imin(z, 255)

        ld1             {v12.8h}, [x7], #16

        cmhi            v25.8b,  v0.8b,  v20.8b  // = -1 if sgr_x_by_x[v0] < 5
        cmhi            v26.8b,  v0.8b,  v21.8b  // = -1 if sgr_x_by_x[v0] < 4
        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
        cmhi            v27.8b,  v0.8b,  v22.8b  // = -1 if sgr_x_by_x[v0] < 3
        cmhi            v4.8b,   v0.8b,  v23.8b  // = -1 if sgr_x_by_x[v0] < 2
        add             v25.8b,  v25.8b, v26.8b
        cmhi            v5.8b,   v0.8b,  v24.8b  // = -1 if sgr_x_by_x[v0] < 1
        add             v27.8b,  v27.8b, v4.8b
        add             v5.8b,   v5.8b,  v19.8b
        add             v25.8b,  v25.8b, v27.8b
        add             v5.8b,   v1.8b,  v5.8b
        ld1             {v13.8h}, [x8], #16
        add             v5.8b,   v5.8b,  v25.8b
        ld1             {v0.4s, v1.4s},   [x0], #32
        uxtl            v5.8h,   v5.8b           // x

        umull           v3.4s,   v5.4h,  v2.4h   // x * BB[i]
        umull2          v4.4s,   v5.8h,  v2.8h   // x * BB[i]
        mul             v3.4s,   v3.4s,  v30.4s  // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,  v30.4s  // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,   v3.4s,  #12     // AA[i]
        srshr           v4.4s,   v4.4s,  #12     // AA[i]
        sub             v5.8h,   v29.8h, v5.8h   // 256 - x
        ld1             {v2.8h},  [x1], #16

        st1             {v3.4s, v4.4s},   [x2], #32
        st1             {v5.8h},  [x3], #16
        b.gt            1b

        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
//                               int32_t *AA, int16_t *BB,
//                               const int w, const int s,
//                               const int bitdepth_max);
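//
// Same scheme as sgr_box3_vert_neon above, but summing five rows of box
// sums instead of three, so n = 25 and one_by_x = 164 (~2^12/25, as 455
// is ~2^12/9). The v20-v23 index constants are re-materialized inside
// the loop because those registers double as sum-row accumulators here;
// only v24 (idx of last 1) is set up once outside the loop.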
function sgr_box5_vert_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        add             w4,  w4,  #2
        clz             w15, w6                  // bitdepth_max
        dup             v28.4s,  w5              // strength

        ldp             x5,  x6,  [x0]
        ldp             x7,  x8,  [x0, #16]
        ldr             x0,  [x0, #32]
        ldp             x9,  x10, [x1]
        ldp             x11, x12, [x1, #16]
        ldr             x1,  [x1, #32]

        movi            v31.4s,  #25             // n

        sub             w15, w15, #24            // -bitdepth_min_8
        movrel          x13, X(sgr_x_by_x)
        mov             w14, #164                // one_by_x
        ld1             {v16.16b, v17.16b, v18.16b}, [x13]
        dup             v6.8h,   w15             // -bitdepth_min_8
        movi            v19.16b, #5
        movi            v24.8b,  #254            // idx of last 1
        saddl           v7.4s,   v6.4h,  v6.4h   // -2*bitdepth_min_8
        movi            v29.8h,  #1, lsl #8
        dup             v30.4s,  w14             // one_by_x

        sub             v16.16b, v16.16b, v19.16b
        sub             v17.16b, v17.16b, v19.16b
        sub             v18.16b, v18.16b, v19.16b

        ld1             {v8.4s, v9.4s},   [x5],  #32
        ld1             {v10.4s, v11.4s}, [x6],  #32
        ld1             {v12.4s, v13.4s}, [x7],  #32
        ld1             {v14.4s, v15.4s}, [x8],  #32
        ld1             {v20.8h}, [x9],  #16
        ld1             {v21.8h}, [x10], #16
        ld1             {v22.8h}, [x11], #16
        ld1             {v23.8h}, [x12], #16
        ld1             {v0.4s, v1.4s},   [x0],  #32
        ld1             {v2.8h},  [x1],  #16

1:
        add             v8.4s,   v8.4s,  v10.4s
        add             v9.4s,   v9.4s,  v11.4s
        add             v12.4s,  v12.4s, v14.4s
        add             v13.4s,  v13.4s, v15.4s

        add             v20.8h,  v20.8h, v21.8h
        add             v22.8h,  v22.8h, v23.8h

        add             v0.4s,   v0.4s,  v8.4s
        add             v1.4s,   v1.4s,  v9.4s
        add             v2.8h,   v2.8h,  v20.8h

        add             v0.4s,   v0.4s,  v12.4s
        add             v1.4s,   v1.4s,  v13.4s
        add             v2.8h,   v2.8h,  v22.8h

        subs            w4,  w4,  #8

        movi            v20.8b,  #55             // idx of last 5
        movi            v21.8b,  #72             // idx of last 4
        movi            v22.8b,  #101            // idx of last 3
        movi            v23.8b,  #169            // idx of last 2

        srshl           v0.4s,   v0.4s,  v7.4s
        srshl           v1.4s,   v1.4s,  v7.4s
        srshl           v4.8h,   v2.8h,  v6.8h
        mul             v0.4s,   v0.4s,  v31.4s  // a * n
        mul             v1.4s,   v1.4s,  v31.4s  // a * n
        umull           v3.4s,   v4.4h,  v4.4h   // b * b
        umull2          v4.4s,   v4.8h,  v4.8h   // b * b
        uqsub           v0.4s,   v0.4s,  v3.4s   // imax(a * n - b * b, 0)
        uqsub           v1.4s,   v1.4s,  v4.4s   // imax(a * n - b * b, 0)
        mul             v0.4s,   v0.4s,  v28.4s  // p * s
        mul             v1.4s,   v1.4s,  v28.4s  // p * s
        ld1             {v8.4s, v9.4s},   [x5], #32
        uqshrn          v0.4h,   v0.4s,  #16
        uqshrn2         v0.8h,   v1.4s,  #16
        ld1             {v10.4s, v11.4s}, [x6], #32
        uqrshrn         v0.8b,   v0.8h,  #4      // imin(z, 255)

        ld1             {v12.4s, v13.4s}, [x7], #32

        cmhi            v25.8b,  v0.8b,  v20.8b  // = -1 if sgr_x_by_x[v0] < 5
        cmhi            v26.8b,  v0.8b,  v21.8b  // = -1 if sgr_x_by_x[v0] < 4
        tbl             v1.8b, {v16.16b, v17.16b, v18.16b}, v0.8b
        cmhi            v27.8b,  v0.8b,  v22.8b  // = -1 if sgr_x_by_x[v0] < 3
        cmhi            v4.8b,   v0.8b,  v23.8b  // = -1 if sgr_x_by_x[v0] < 2
        ld1             {v14.4s, v15.4s}, [x8], #32
        add             v25.8b,  v25.8b, v26.8b
        cmhi            v5.8b,   v0.8b,  v24.8b  // = -1 if sgr_x_by_x[v0] < 1
        add             v27.8b,  v27.8b, v4.8b
        ld1             {v20.8h}, [x9],  #16
        add             v5.8b,   v5.8b,  v19.8b
        add             v25.8b,  v25.8b, v27.8b
        ld1             {v21.8h}, [x10], #16
        add             v5.8b,   v1.8b,  v5.8b
        ld1             {v22.8h}, [x11], #16
        add             v5.8b,   v5.8b,  v25.8b
        ld1             {v23.8h}, [x12], #16
        uxtl            v5.8h,   v5.8b           // x

        ld1             {v0.4s, v1.4s},   [x0], #32
        umull           v3.4s,   v5.4h,  v2.4h   // x * BB[i]
        umull2          v4.4s,   v5.8h,  v2.8h   // x * BB[i]
        mul             v3.4s,   v3.4s,  v30.4s  // x * BB[i] * sgr_one_by_x
        mul             v4.4s,   v4.4s,  v30.4s  // x * BB[i] * sgr_one_by_x
        srshr           v3.4s,   v3.4s,  #12     // AA[i]
        srshr           v4.4s,   v4.4s,  #12     // AA[i]
        sub             v5.8h,   v29.8h, v5.8h   // 256 - x
        ld1             {v2.8h},  [x1],  #16

        st1             {v3.4s, v4.4s},   [x2], #32
        st1             {v5.8h},  [x3],  #16
        b.gt            1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc