/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* AArch64 AdvSIMD (NEON) convolution/blur kernel.  GNU as syntax.
 * Function entry/exit bracketing macros; ENTRY exports the symbol,
 * PRIVATE keeps it file-local, END records the symbol size for tools.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point fraction bits carried by the coefficient table (v0-v3). */
.set FRACTION_BITS, 7
/* Largest supported convolution radius (taps each side of centre). */
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top or
 * bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch directly
 * into the relevant part of the code for an arbitrary convolution radius.  Two
 * variants of the loop are produced; one eliminates the clamping code for a
 * slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x12 -- switch index
 *      v0-v3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      v10,v11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      v12-v15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
  /* cc=1 selects the row-clamping variant of the generated loop (only when
   * the default dispatch register x12 is in use). */
  .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            /* Centre row: load 16 bytes and widen to two 8x16-bit halves. */
            ld1         {v15.16b}, [x1], #16
            mov         x10, x15

            uxtl        v14.8h, v15.8b
//          prfm        PLDL1KEEP,[x1, #16]  // TODO: confirm
            uxtl2       v15.8h, v15.16b
  /* Compute the dispatch target for radius r (x5): each tap block below is
   * 64 bytes minus an 8-byte adjustment (LSL #6 / LSL #3). */
  .if \max_r < 16 // approximate
    ifcc    adr         \reg, 1f
  .else
    ifcc    adrp        \reg, 1f
    ifcc    add         \reg, \reg, #:lo12:1f
  .endif

            /* Seed the four 32-bit accumulators with the centre tap (v0.h[0]). */
            umull       v12.4s, v14.4h, v0.h[0]
    ifcc    sub         \reg, \reg, x5, LSL #6
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19
            umull       v14.4s, v15.4h, v0.h[0]
    ifcc    add         \reg, \reg, x5, LSL #3
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* Two copies of the unrolled tap loop: rowclamp=1 clamps x10/x11 at the
   * image edges (rup/rdn in x6/x7); rowclamp=0 omits the clamps.  Taps are
   * emitted outermost-first so `br` above can enter mid-sequence. */
  .irp rowclamp, 1, 0
    .set cc, \rowclamp
    .align 4
    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
      .set i, \dreg * 8 + \lane
      .if 0 < i && i <= \max_r
            /* Tap i: rows centre-i and centre+i share coefficient v<dreg>.h[lane]. */
            ld1         {v10.16b}, [x10], x2
    ifcc    cmp         x6, #i
            ld1         {v11.16b}, [x11], x13
    ifcc    csel        x10, x15, x10, lo
            uaddl       v16.8h, v10.8b, v11.8b
    ifcc    cmp         x7, #i
            uaddl2      v11.8h, v10.16b, v11.16b
    ifcc    csel        x11, x19, x11, lo
            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x10, #32] // TODO: confirm
            nop
            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x11, #32] // TODO: confirm
            nop
            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
      .endif
    .endr ; .endr ; .endr
    .if \rowclamp == 1
      1: \labelc :
            b           2f
    .else
      2: \labelnc :
    .endif
  .endr

            /* Narrow the accumulators back to 16-bit columns in v10/v11,
             * keeping FRACTION_BITS of fraction, and advance the edge rows. */
            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Input:
 *      v16-v31,v4-v11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer
 *      v12-v13 -- temporaries for load and vext operations.
 *      v14-v15 -- intermediate sums
 */
/* Radii for which a dedicated specialised path exists (uchar1 variant). */
#define TUNED_LIST1 8, 16
/* Horizontal convolution, 1 byte/pixel, radius <= 8.  Dispatches on r (x5)
 * through the signed-halfword offset table at 100: into the unrolled taps,
 * which run from the outermost pair (108:) inwards. */
.macro hconv1_8/*{{{*/
            /* Centre tap: window centre is in v9. */
            umull       v14.4s, v9.4h, v0.h[0]
            umull2      v15.4s, v9.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .align 4
   108:     umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
            umlal       v14.4s, v10.4h, v1.h[0]
            umlal2      v15.4s, v10.8h, v1.h[0]
   107:     ext         v12.16b, v8.16b, v9.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
   106:     ext         v12.16b, v8.16b, v9.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
   105:     ext         v12.16b, v8.16b, v9.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
            /* Offset 4 is half a q register, so the shifted operands fall on
             * register halves and the ext is unnecessary. */
   104:     //ext         v12.16b, v8.16b, v9.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
            umlal2      v14.4s, v9.8h, v0.h[4]
            umlal       v15.4s, v10.4h, v0.h[4]
   103:     ext         v12.16b, v8.16b, v9.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
   102:     ext         v12.16b, v8.16b, v9.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
   101:     ext         v12.16b, v8.16b, v9.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow to 8-bit result in v15.8b (rounding, saturating). */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the register-file window one slot to the left. */
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal convolution, 1 byte/pixel, radius <= 16.  Window centre is in
 * v8; same offset-table dispatch scheme as hconv1_8. */
.macro hconv1_16/*{{{*/
            umull       v14.4s, v8.4h, v0.h[0]
            umull2      v15.4s, v8.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .align 4
   116:     //ext         v12.16b, v6.16b, v7.16b, #0*2
            //ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v6.4h, v2.h[0]
            umlal2      v15.4s, v6.8h, v2.h[0]
            umlal       v14.4s, v10.4h, v2.h[0]
            umlal2      v15.4s, v10.8h, v2.h[0]
   115:     ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
   114:     ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
   113:     ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
            /* Half-register-aligned tap: ext not needed. */
   112:     //ext         v12.16b, v6.16b, v7.16b, #4*2
            //ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal2      v14.4s, v6.8h, v1.h[4]
            umlal       v15.4s, v7.4h, v1.h[4]
            umlal2      v14.4s, v9.8h, v1.h[4]
            umlal       v15.4s, v10.4h, v1.h[4]
   111:     ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
   110:     ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
   109:     ext         v12.16b, v6.16b, v7.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
            /* Whole-register-aligned tap: ext not needed. */
   108:     //ext         v12.16b, v7.16b, v8.16b, #0*2
            //ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v7.4h, v1.h[0]
            umlal2      v15.4s, v7.8h, v1.h[0]
            umlal       v14.4s, v9.4h, v1.h[0]
            umlal2      v15.4s, v9.8h, v1.h[0]
   107:     ext         v12.16b, v7.16b, v8.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
   106:     ext         v12.16b, v7.16b, v8.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
   105:     ext         v12.16b, v7.16b, v8.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
   104:     //ext         v12.16b, v7.16b, v8.16b, #4*2
            //ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal2      v14.4s, v7.8h, v0.h[4]
            umlal       v15.4s, v8.4h, v0.h[4]
            umlal2      v14.4s, v8.8h, v0.h[4]
            umlal       v15.4s, v9.4h, v0.h[4]
   103:     ext         v12.16b, v7.16b, v8.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
   102:     ext         v12.16b, v7.16b, v8.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
   101:     ext         v12.16b, v7.16b, v8.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow to 8-bit result in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the register-file window one slot. */
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal convolution, 1 byte/pixel, radius <= 25 (MAX_R).  The window
 * centre sits mid-register, hence the initial ext to align it; taps run from
 * 125: (outermost) inwards, dispatched on r (x5). */
.macro hconv1_25/*{{{*/
            ext         v12.16b, v6.16b, v7.16b, #7*2
            umull       v14.4s, v12.4h, v0.h[0]
            umull2      v15.4s, v12.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4
   125:     ext         v12.16b, v31.16b, v4.16b, #6*2
            ext         v13.16b, v10.16b, v11.16b, #0*2
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v13.4h, v3.h[1]
            umlal2      v15.4s, v13.8h, v3.h[1]
   124:     ext         v12.16b, v31.16b, v4.16b, #7*2
            ext         v13.16b, v9.16b, v10.16b, #7*2
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal2      v15.4s, v12.8h, v3.h[0]
            umlal       v14.4s, v13.4h, v3.h[0]
            umlal2      v15.4s, v13.8h, v3.h[0]
   123:     ext         v12.16b, v4.16b, v5.16b, #0*2
            ext         v13.16b, v9.16b, v10.16b, #6*2
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v13.4h, v2.h[7]
            umlal2      v15.4s, v13.8h, v2.h[7]
   122:     ext         v12.16b, v4.16b, v5.16b, #1*2
            ext         v13.16b, v9.16b, v10.16b, #5*2
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal2      v15.4s, v12.8h, v2.h[6]
            umlal       v14.4s, v13.4h, v2.h[6]
            umlal2      v15.4s, v13.8h, v2.h[6]
   121:     ext         v12.16b, v4.16b, v5.16b, #2*2
            ext         v13.16b, v9.16b, v10.16b, #4*2
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v13.4h, v2.h[5]
            umlal2      v15.4s, v13.8h, v2.h[5]
   120:     ext         v12.16b, v4.16b, v5.16b, #3*2
            ext         v13.16b, v9.16b, v10.16b, #3*2
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal2      v15.4s, v12.8h, v2.h[4]
            umlal       v14.4s, v13.4h, v2.h[4]
            umlal2      v15.4s, v13.8h, v2.h[4]
   119:     ext         v12.16b, v4.16b, v5.16b, #4*2
            ext         v13.16b, v9.16b, v10.16b, #2*2
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v13.4h, v2.h[3]
            umlal2      v15.4s, v13.8h, v2.h[3]
   118:     ext         v12.16b, v4.16b, v5.16b, #5*2
            ext         v13.16b, v9.16b, v10.16b, #1*2
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal2      v15.4s, v12.8h, v2.h[2]
            umlal       v14.4s, v13.4h, v2.h[2]
            umlal2      v15.4s, v13.8h, v2.h[2]
   117:     ext         v12.16b, v4.16b, v5.16b, #6*2
            ext         v13.16b, v9.16b, v10.16b, #0*2
            umlal       v14.4s, v12.4h, v2.h[1]
            umlal2      v15.4s, v12.8h, v2.h[1]
            umlal       v14.4s, v13.4h, v2.h[1]
            umlal2      v15.4s, v13.8h, v2.h[1]
   116:     ext         v12.16b, v4.16b, v5.16b, #7*2
            ext         v13.16b, v8.16b, v9.16b, #7*2
            umlal       v14.4s, v12.4h, v2.h[0]
            umlal2      v15.4s, v12.8h, v2.h[0]
            umlal       v14.4s, v13.4h, v2.h[0]
            umlal2      v15.4s, v13.8h, v2.h[0]
   115:     ext         v12.16b, v5.16b, v6.16b, #0*2
            ext         v13.16b, v8.16b, v9.16b, #6*2
            umlal       v14.4s, v12.4h, v1.h[7]
            umlal2      v15.4s, v12.8h, v1.h[7]
            umlal       v14.4s, v13.4h, v1.h[7]
            umlal2      v15.4s, v13.8h, v1.h[7]
   114:     ext         v12.16b, v5.16b, v6.16b, #1*2
            ext         v13.16b, v8.16b, v9.16b, #5*2
            umlal       v14.4s, v12.4h, v1.h[6]
            umlal2      v15.4s, v12.8h, v1.h[6]
            umlal       v14.4s, v13.4h, v1.h[6]
            umlal2      v15.4s, v13.8h, v1.h[6]
   113:     ext         v12.16b, v5.16b, v6.16b, #2*2
            ext         v13.16b, v8.16b, v9.16b, #4*2
            umlal       v14.4s, v12.4h, v1.h[5]
            umlal2      v15.4s, v12.8h, v1.h[5]
            umlal       v14.4s, v13.4h, v1.h[5]
            umlal2      v15.4s, v13.8h, v1.h[5]
   112:     ext         v12.16b, v5.16b, v6.16b, #3*2
            ext         v13.16b, v8.16b, v9.16b, #3*2
            umlal       v14.4s, v12.4h, v1.h[4]
            umlal2      v15.4s, v12.8h, v1.h[4]
            umlal       v14.4s, v13.4h, v1.h[4]
            umlal2      v15.4s, v13.8h, v1.h[4]
   111:     ext         v12.16b, v5.16b, v6.16b, #4*2
            ext         v13.16b, v8.16b, v9.16b, #2*2
            umlal       v14.4s, v12.4h, v1.h[3]
            umlal2      v15.4s, v12.8h, v1.h[3]
            umlal       v14.4s, v13.4h, v1.h[3]
            umlal2      v15.4s, v13.8h, v1.h[3]
   110:     ext         v12.16b, v5.16b, v6.16b, #5*2
            ext         v13.16b, v8.16b, v9.16b, #1*2
            umlal       v14.4s, v12.4h, v1.h[2]
            umlal2      v15.4s, v12.8h, v1.h[2]
            umlal       v14.4s, v13.4h, v1.h[2]
            umlal2      v15.4s, v13.8h, v1.h[2]
   109:     ext         v12.16b, v5.16b, v6.16b, #6*2
            ext         v13.16b, v8.16b, v9.16b, #0*2
            umlal       v14.4s, v12.4h, v1.h[1]
            umlal2      v15.4s, v12.8h, v1.h[1]
            umlal       v14.4s, v13.4h, v1.h[1]
            umlal2      v15.4s, v13.8h, v1.h[1]
   108:     ext         v12.16b, v5.16b, v6.16b, #7*2
            ext         v13.16b, v7.16b, v8.16b, #7*2
            umlal       v14.4s, v12.4h, v1.h[0]
            umlal2      v15.4s, v12.8h, v1.h[0]
            umlal       v14.4s, v13.4h, v1.h[0]
            umlal2      v15.4s, v13.8h, v1.h[0]
   107:     ext         v12.16b, v6.16b, v7.16b, #0*2
            ext         v13.16b, v7.16b, v8.16b, #6*2
            umlal       v14.4s, v12.4h, v0.h[7]
            umlal2      v15.4s, v12.8h, v0.h[7]
            umlal       v14.4s, v13.4h, v0.h[7]
            umlal2      v15.4s, v13.8h, v0.h[7]
   106:     ext         v12.16b, v6.16b, v7.16b, #1*2
            ext         v13.16b, v7.16b, v8.16b, #5*2
            umlal       v14.4s, v12.4h, v0.h[6]
            umlal2      v15.4s, v12.8h, v0.h[6]
            umlal       v14.4s, v13.4h, v0.h[6]
            umlal2      v15.4s, v13.8h, v0.h[6]
   105:     ext         v12.16b, v6.16b, v7.16b, #2*2
            ext         v13.16b, v7.16b, v8.16b, #4*2
            umlal       v14.4s, v12.4h, v0.h[5]
            umlal2      v15.4s, v12.8h, v0.h[5]
            umlal       v14.4s, v13.4h, v0.h[5]
            umlal2      v15.4s, v13.8h, v0.h[5]
   104:     ext         v12.16b, v6.16b, v7.16b, #3*2
            ext         v13.16b, v7.16b, v8.16b, #3*2
            umlal       v14.4s, v12.4h, v0.h[4]
            umlal2      v15.4s, v12.8h, v0.h[4]
            umlal       v14.4s, v13.4h, v0.h[4]
            umlal2      v15.4s, v13.8h, v0.h[4]
   103:     ext         v12.16b, v6.16b, v7.16b, #4*2
            ext         v13.16b, v7.16b, v8.16b, #2*2
            umlal       v14.4s, v12.4h, v0.h[3]
            umlal2      v15.4s, v12.8h, v0.h[3]
            umlal       v14.4s, v13.4h, v0.h[3]
            umlal2      v15.4s, v13.8h, v0.h[3]
   102:     ext         v12.16b, v6.16b, v7.16b, #5*2
            ext         v13.16b, v7.16b, v8.16b, #1*2
            umlal       v14.4s, v12.4h, v0.h[2]
            umlal2      v15.4s, v12.8h, v0.h[2]
            umlal       v14.4s, v13.4h, v0.h[2]
            umlal2      v15.4s, v13.8h, v0.h[2]
   101:     ext         v12.16b, v6.16b, v7.16b, #6*2
            ext         v13.16b, v7.16b, v8.16b, #0*2
            umlal       v14.4s, v12.4h, v0.h[1]
            umlal2      v15.4s, v12.8h, v0.h[1]
            umlal       v14.4s, v13.4h, v0.h[1]
            umlal2      v15.4s, v13.8h, v0.h[1]

            /* Narrow to 8-bit result in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the register-file window one slot. */
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Radii with dedicated specialised paths (uchar4 variant). */
#define TUNED_LIST4 6, 12, 20
/* Horizontal convolution, 4 bytes/pixel, radius <= 6.  Pixel granularity is
 * a whole .4h half-register, so no ext is ever needed; taps alternate
 * between register halves instead. */
.macro hconv4_6/*{{{*/
            umull       v14.4s, v7.4h, v0.h[0]
            umull2      v15.4s, v7.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .align 4
   106:     umlal       v14.4s, v4.4h, v0.h[6]
            umlal2      v15.4s, v4.8h, v0.h[6]
            umlal       v14.4s, v10.4h, v0.h[6]
            umlal2      v15.4s, v10.8h, v0.h[6]
   105:     umlal2      v14.4s, v4.8h, v0.h[5]
            umlal       v15.4s, v5.4h, v0.h[5]
            umlal2      v14.4s, v9.8h, v0.h[5]
            umlal       v15.4s, v10.4h, v0.h[5]
   104:     umlal       v14.4s, v5.4h, v0.h[4]
            umlal2      v15.4s, v5.8h, v0.h[4]
            umlal       v14.4s, v9.4h, v0.h[4]
            umlal2      v15.4s, v9.8h, v0.h[4]
   103:     umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
            umlal2      v14.4s, v8.8h, v0.h[3]
            umlal       v15.4s, v9.4h, v0.h[3]
   102:     umlal       v14.4s, v6.4h, v0.h[2]
            umlal2      v15.4s, v6.8h, v0.h[2]
            umlal       v14.4s, v8.4h, v0.h[2]
            umlal2      v15.4s, v8.8h, v0.h[2]
   101:     umlal2      v14.4s, v6.8h, v0.h[1]
            umlal       v15.4s, v7.4h, v0.h[1]
            umlal2      v14.4s, v7.8h, v0.h[1]
            umlal       v15.4s, v8.4h, v0.h[1]

            /* Narrow to one 8-byte (2-pixel) result in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the register-file window one slot. */
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal convolution, 4 bytes/pixel, radius <= 12.  Window extends into
 * v26-v31 on the left; centre in v4. */
.macro hconv4_12/*{{{*/
            umull       v14.4s, v4.4h, v0.h[0]
            umull2      v15.4s, v4.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .align 4
   112:     umlal       v14.4s, v26.4h, v1.h[4]
            umlal2      v15.4s, v26.8h, v1.h[4]
            umlal       v14.4s, v10.4h, v1.h[4]
            umlal2      v15.4s, v10.8h, v1.h[4]
   111:     umlal2      v14.4s, v26.8h, v1.h[3]
            umlal       v15.4s, v27.4h, v1.h[3]
            umlal2      v14.4s, v9.8h, v1.h[3]
            umlal       v15.4s, v10.4h, v1.h[3]
   110:     umlal       v14.4s, v27.4h, v1.h[2]
            umlal2      v15.4s, v27.8h, v1.h[2]
            umlal       v14.4s, v9.4h, v1.h[2]
            umlal2      v15.4s, v9.8h, v1.h[2]
   109:     umlal2      v14.4s, v27.8h, v1.h[1]
            umlal       v15.4s, v28.4h, v1.h[1]
            umlal2      v14.4s, v8.8h, v1.h[1]
            umlal       v15.4s, v9.4h, v1.h[1]
   108:     umlal       v14.4s, v28.4h, v1.h[0]
            umlal2      v15.4s, v28.8h, v1.h[0]
            umlal       v14.4s, v8.4h, v1.h[0]
            umlal2      v15.4s, v8.8h, v1.h[0]
   107:     umlal2      v14.4s, v28.8h, v0.h[7]
            umlal       v15.4s, v29.4h, v0.h[7]
            umlal2      v14.4s, v7.8h, v0.h[7]
            umlal       v15.4s, v8.4h, v0.h[7]
   106:     umlal       v14.4s, v29.4h, v0.h[6]
            umlal2      v15.4s, v29.8h, v0.h[6]
            umlal       v14.4s, v7.4h, v0.h[6]
            umlal2      v15.4s, v7.8h, v0.h[6]
   105:     umlal2      v14.4s, v29.8h, v0.h[5]
            umlal       v15.4s, v30.4h, v0.h[5]
            umlal2      v14.4s, v6.8h, v0.h[5]
            umlal       v15.4s, v7.4h, v0.h[5]
   104:     umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
            umlal       v14.4s, v6.4h, v0.h[4]
            umlal2      v15.4s, v6.8h, v0.h[4]
   103:     umlal2      v14.4s, v30.8h, v0.h[3]
            umlal       v15.4s, v31.4h, v0.h[3]
            umlal2      v14.4s, v5.8h, v0.h[3]
            umlal       v15.4s, v6.4h, v0.h[3]
   102:     umlal       v14.4s, v31.4h, v0.h[2]
            umlal2      v15.4s, v31.8h, v0.h[2]
            umlal       v14.4s, v5.4h, v0.h[2]
            umlal2      v15.4s, v5.8h, v0.h[2]
   101:     umlal2      v14.4s, v31.8h, v0.h[1]
            umlal       v15.4s, v4.4h, v0.h[1]
            umlal2      v14.4s, v4.8h, v0.h[1]
            umlal       v15.4s, v5.4h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the (larger) register-file window one slot. */
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal convolution, 4 bytes/pixel, radius <= 20.  Window extends into
 * v18-v31 on the left; centre in v28. */
.macro hconv4_20/*{{{*/
            umull       v14.4s, v28.4h, v0.h[0]
            umull2      v15.4s, v28.8h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .align 4

   120:     umlal       v14.4s, v18.4h, v2.h[4]
            umlal2      v15.4s, v18.8h, v2.h[4]
            umlal       v14.4s, v10.4h, v2.h[4]
            umlal2      v15.4s, v10.8h, v2.h[4]
   119:     umlal2      v14.4s, v18.8h, v2.h[3]
            umlal       v15.4s, v19.4h, v2.h[3]
            umlal2      v14.4s, v9.8h, v2.h[3]
            umlal       v15.4s, v10.4h, v2.h[3]
   118:     umlal       v14.4s, v19.4h, v2.h[2]
            umlal2      v15.4s, v19.8h, v2.h[2]
            umlal       v14.4s, v9.4h, v2.h[2]
            umlal2      v15.4s, v9.8h, v2.h[2]
   117:     umlal2      v14.4s, v19.8h, v2.h[1]
            umlal       v15.4s, v20.4h, v2.h[1]
            umlal2      v14.4s, v8.8h, v2.h[1]
            umlal       v15.4s, v9.4h, v2.h[1]
   116:     umlal       v14.4s, v20.4h, v2.h[0]
            umlal2      v15.4s, v20.8h, v2.h[0]
            umlal       v14.4s, v8.4h, v2.h[0]
            umlal2      v15.4s, v8.8h, v2.h[0]
   115:     umlal2      v14.4s, v20.8h, v1.h[7]
            umlal       v15.4s, v21.4h, v1.h[7]
            umlal2      v14.4s, v7.8h, v1.h[7]
            umlal       v15.4s, v8.4h, v1.h[7]
   114:     umlal       v14.4s, v21.4h, v1.h[6]
            umlal2      v15.4s, v21.8h, v1.h[6]
            umlal       v14.4s, v7.4h, v1.h[6]
            umlal2      v15.4s, v7.8h, v1.h[6]
   113:     umlal2      v14.4s, v21.8h, v1.h[5]
            umlal       v15.4s, v22.4h, v1.h[5]
            umlal2      v14.4s, v6.8h, v1.h[5]
            umlal       v15.4s, v7.4h, v1.h[5]
   112:     umlal       v14.4s, v22.4h, v1.h[4]
            umlal2      v15.4s, v22.8h, v1.h[4]
            umlal       v14.4s, v6.4h, v1.h[4]
            umlal2      v15.4s, v6.8h, v1.h[4]
   111:     umlal2      v14.4s, v22.8h, v1.h[3]
            umlal       v15.4s, v23.4h, v1.h[3]
            umlal2      v14.4s, v5.8h, v1.h[3]
            umlal       v15.4s, v6.4h, v1.h[3]
   110:     umlal       v14.4s, v23.4h, v1.h[2]
            umlal2      v15.4s, v23.8h, v1.h[2]
            umlal       v14.4s, v5.4h, v1.h[2]
            umlal2      v15.4s, v5.8h, v1.h[2]
   109:     umlal2      v14.4s, v23.8h, v1.h[1]
            umlal       v15.4s, v24.4h, v1.h[1]
            umlal2      v14.4s, v4.8h, v1.h[1]
            umlal       v15.4s, v5.4h, v1.h[1]
   108:     umlal       v14.4s, v24.4h, v1.h[0]
            umlal2      v15.4s, v24.8h, v1.h[0]
            umlal       v14.4s, v4.4h, v1.h[0]
            umlal2      v15.4s, v4.8h, v1.h[0]
   107:     umlal2      v14.4s, v24.8h, v0.h[7]
            umlal       v15.4s, v25.4h, v0.h[7]
            umlal2      v14.4s, v31.8h, v0.h[7]
            umlal       v15.4s, v4.4h, v0.h[7]
   106:     umlal       v14.4s, v25.4h, v0.h[6]
            umlal2      v15.4s, v25.8h, v0.h[6]
            umlal       v14.4s, v31.4h, v0.h[6]
            umlal2      v15.4s, v31.8h, v0.h[6]
   105:     umlal2      v14.4s, v25.8h, v0.h[5]
            umlal       v15.4s, v26.4h, v0.h[5]
            umlal2      v14.4s, v30.8h, v0.h[5]
            umlal       v15.4s, v31.4h, v0.h[5]
   104:     umlal       v14.4s, v26.4h, v0.h[4]
            umlal2      v15.4s, v26.8h, v0.h[4]
            umlal       v14.4s, v30.4h, v0.h[4]
            umlal2      v15.4s, v30.8h, v0.h[4]
   103:     umlal2      v14.4s, v26.8h, v0.h[3]
            umlal       v15.4s, v27.4h, v0.h[3]
            umlal2      v14.4s, v29.8h, v0.h[3]
            umlal       v15.4s, v30.4h, v0.h[3]
   102:     umlal       v14.4s, v27.4h, v0.h[2]
            umlal2      v15.4s, v27.8h, v0.h[2]
            umlal       v14.4s, v29.4h, v0.h[2]
            umlal2      v15.4s, v29.8h, v0.h[2]
   101:     umlal2      v14.4s, v27.8h, v0.h[1]
            umlal       v15.4s, v28.4h, v0.h[1]
            umlal2      v14.4s, v28.8h, v0.h[1]
            umlal       v15.4s, v29.4h, v0.h[1]

            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Rotate the register-file window one slot. */
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Horizontal convolution, 4 bytes/pixel, radius <= 25 (MAX_R).  The register
 * file can't hold the whole window, so the outermost taps spill to the
 * 64-byte circular buffer at [x9] (see the ld1/bic sequences at 118:-125:
 * and the st1 at the end). */
.macro hconv4_25/*{{{*/
            umull2      v14.4s, v25.8h, v0.h[0]
            umull       v15.4s, v26.4h, v0.h[0]

            adr         x16, 100f
            ldrsh       x12, [x16, x5, LSL #1]
            add         x12, x12, x16
            br          x12
   100:     .hword -4
            .hword 101f-100b
            .hword 102f-100b
            .hword 103f-100b
            .hword 104f-100b
            .hword 105f-100b
            .hword 106f-100b
            .hword 107f-100b
            .hword 108f-100b
            .hword 109f-100b
            .hword 110f-100b
            .hword 111f-100b
            .hword 112f-100b
            .hword 113f-100b
            .hword 114f-100b
            .hword 115f-100b
            .hword 116f-100b
            .hword 117f-100b
            .hword 118f-100b
            .hword 119f-100b
            .hword 120f-100b
            .hword 121f-100b
            .hword 122f-100b
            .hword 123f-100b
            .hword 124f-100b
            .hword 125f-100b
            .align 4

            /* Outermost taps (118:-125:) read from the 64-byte circular spill
             * buffer at [x9]; `bic x12, x12, #0x40` wraps the offset pointer. */
   125:     ld1         {v12.8h}, [x9]
            umlal       v14.4s, v12.4h, v3.h[1]
            umlal2      v15.4s, v12.8h, v3.h[1]
            umlal       v14.4s, v10.4h, v3.h[1]
            umlal2      v15.4s, v10.8h, v3.h[1]
   124:     add         x12, x9, #0x08
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v3.h[0]
            umlal       v15.4s, v13.4h, v3.h[0]
            umlal2      v14.4s, v9.8h, v3.h[0]
            umlal       v15.4s, v10.4h, v3.h[0]
   123:     add         x12, x9, #0x10
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[7]
            umlal2      v15.4s, v12.8h, v2.h[7]
            umlal       v14.4s, v9.4h, v2.h[7]
            umlal2      v15.4s, v9.8h, v2.h[7]
   122:     add         x12, x9, #0x18
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[6]
            umlal       v15.4s, v13.4h, v2.h[6]
            umlal2      v14.4s, v8.8h, v2.h[6]
            umlal       v15.4s, v9.4h, v2.h[6]
   121:     add         x12, x9, #0x20
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[5]
            umlal2      v15.4s, v12.8h, v2.h[5]
            umlal       v14.4s, v8.4h, v2.h[5]
            umlal2      v15.4s, v8.8h, v2.h[5]
   120:     add         x12, x9, #0x28
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12], #8
            bic         x12, x12, #0x40
            ld1         {v13.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[4]
            umlal       v15.4s, v13.4h, v2.h[4]
            umlal2      v14.4s, v7.8h, v2.h[4]
            umlal       v15.4s, v8.4h, v2.h[4]
   119:     add         x12, x9, #0x30
            bic         x12, x12, #0x40
            ld1         {v12.8h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[3]
            umlal2      v15.4s, v12.8h, v2.h[3]
            umlal       v14.4s, v7.4h, v2.h[3]
            umlal2      v15.4s, v7.8h, v2.h[3]
   118:     add         x12, x9, #0x38
            bic         x12, x12, #0x40
            ld1         {v12.4h}, [x12]
            umlal       v14.4s, v12.4h, v2.h[2]
            umlal       v15.4s, v17.4h, v2.h[2]
            umlal2      v14.4s, v6.8h, v2.h[2]
            umlal       v15.4s, v7.4h, v2.h[2]
            /* Remaining taps come from the register-file window (v17-v31,v4-v10). */
   117:     umlal       v14.4s, v17.4h, v2.h[1]
            umlal2      v15.4s, v17.8h, v2.h[1]
            umlal       v14.4s, v6.4h, v2.h[1]
            umlal2      v15.4s, v6.8h, v2.h[1]
   116:     umlal2      v14.4s, v17.8h, v2.h[0]
            umlal       v15.4s, v18.4h, v2.h[0]
            umlal2      v14.4s, v5.8h, v2.h[0]
            umlal       v15.4s, v6.4h, v2.h[0]
   115:     umlal       v14.4s, v18.4h, v1.h[7]
            umlal2      v15.4s, v18.8h, v1.h[7]
            umlal       v14.4s, v5.4h, v1.h[7]
            umlal2      v15.4s, v5.8h, v1.h[7]
   114:     umlal2      v14.4s, v18.8h, v1.h[6]
            umlal       v15.4s, v19.4h, v1.h[6]
            umlal2      v14.4s, v4.8h, v1.h[6]
            umlal       v15.4s, v5.4h, v1.h[6]
   113:     umlal       v14.4s, v19.4h, v1.h[5]
            umlal2      v15.4s, v19.8h, v1.h[5]
            umlal       v14.4s, v4.4h, v1.h[5]
            umlal2      v15.4s, v4.8h, v1.h[5]
   112:     umlal2      v14.4s, v19.8h, v1.h[4]
            umlal       v15.4s, v20.4h, v1.h[4]
            umlal2      v14.4s, v31.8h, v1.h[4]
            umlal       v15.4s, v4.4h, v1.h[4]
   111:     umlal       v14.4s, v20.4h, v1.h[3]
            umlal2      v15.4s, v20.8h, v1.h[3]
            umlal       v14.4s, v31.4h, v1.h[3]
            umlal2      v15.4s, v31.8h, v1.h[3]
   110:     umlal2      v14.4s, v20.8h, v1.h[2]
            umlal       v15.4s, v21.4h, v1.h[2]
            umlal2      v14.4s, v30.8h, v1.h[2]
            umlal       v15.4s, v31.4h, v1.h[2]
   109:     umlal       v14.4s, v21.4h, v1.h[1]
            umlal2      v15.4s, v21.8h, v1.h[1]
            umlal       v14.4s, v30.4h, v1.h[1]
            umlal2      v15.4s, v30.8h, v1.h[1]
   108:     umlal2      v14.4s, v21.8h, v1.h[0]
            umlal       v15.4s, v22.4h, v1.h[0]
            umlal2      v14.4s, v29.8h, v1.h[0]
            umlal       v15.4s, v30.4h, v1.h[0]
   107:     umlal       v14.4s, v22.4h, v0.h[7]
            umlal2      v15.4s, v22.8h, v0.h[7]
            umlal       v14.4s, v29.4h, v0.h[7]
            umlal2      v15.4s, v29.8h, v0.h[7]
   106:     umlal2      v14.4s, v22.8h, v0.h[6]
            umlal       v15.4s, v23.4h, v0.h[6]
            umlal2      v14.4s, v28.8h, v0.h[6]
            umlal       v15.4s, v29.4h, v0.h[6]
   105:     umlal       v14.4s, v23.4h, v0.h[5]
            umlal2      v15.4s, v23.8h, v0.h[5]
            umlal       v14.4s, v28.4h, v0.h[5]
            umlal2      v15.4s, v28.8h, v0.h[5]
   104:     umlal2      v14.4s, v23.8h, v0.h[4]
            umlal       v15.4s, v24.4h, v0.h[4]
            umlal2      v14.4s, v27.8h, v0.h[4]
            umlal       v15.4s, v28.4h, v0.h[4]
   103:     umlal       v14.4s, v24.4h, v0.h[3]
            umlal2      v15.4s, v24.8h, v0.h[3]
            umlal       v14.4s, v27.4h, v0.h[3]
            umlal2      v15.4s, v27.8h, v0.h[3]
   102:     umlal2      v14.4s, v24.8h, v0.h[2]
            umlal       v15.4s, v25.4h, v0.h[2]
            umlal2      v14.4s, v26.8h, v0.h[2]
            umlal       v15.4s, v27.4h, v0.h[2]
   101:     umlal       v14.4s, v25.4h, v0.h[1]
            umlal2      v15.4s, v25.8h, v0.h[1]
            umlal       v14.4s, v26.4h, v0.h[1]
            umlal2      v15.4s, v26.8h, v0.h[1]

            /* Narrow to 8-bit result in v15.8b. */
            uqrshrn     v14.4h, v14.4s, #16
            uqrshrn2    v14.8h, v15.4s, #16
            uqrshrn     v15.8b, v14.8h, #FRACTION_BITS

            /* Spill the oldest window slot to the 64-byte circular buffer at
             * [x9] (bic wraps the pointer), then rotate the register window. */
            st1         {v17.16b}, [x9], #16
            bic         x9, x9, #0x40
            mov         v17.16b, v18.16b
            mov         v18.16b, v19.16b
            mov         v19.16b, v20.16b
            mov         v20.16b, v21.16b
            mov         v21.16b, v22.16b
            mov         v22.16b, v23.16b
            mov         v23.16b, v24.16b
            mov         v24.16b, v25.16b
            mov         v25.16b, v26.16b
            mov         v26.16b, v27.16b
            mov         v27.16b, v28.16b
            mov         v28.16b, v29.16b
            mov         v29.16b, v30.16b
            mov         v30.16b, v31.16b
            mov         v31.16b, v4.16b
            mov         v4.16b, v5.16b
            mov         v5.16b, v6.16b
            mov         v6.16b, v7.16b
            mov         v7.16b, v8.16b
            mov         v8.16b, v9.16b
            mov         v9.16b, v10.16b
            mov         v10.16b, v11.16b
.endm/*}}}*/

/* Dedicated function wrapper for the fetch macro, for the cases where
 * performance isn't that important, to keep code size down.
 * Preserves x10/x11 (which the fetch macro clobbers) across the call.
 */
PRIVATE(fetch_generic_asm)
            stp         x10, x11, [sp, #-16]!
            fetch
            ldp         x10, x11, [sp], #16
            ret
END(fetch_generic_asm)


/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
 * across to fill the rest of the register pair.  Used for filling the right
 * hand edge of the window when starting too close to the right hand edge of
 * the image.
1062 * Also returns a dup-ed copy of the last element in v12 for the tail-fill 1063 * case (this happens incidentally in common path, but must be done 1064 * deliberately in the fast-out path). 1065 */ 1066PRIVATE(prefetch_clampright1) 1067 ands x12, x11, #15 1068 beq 1f 1069 sub x12, x12, #1 1070 sub sp, sp, #64 1071 st1 {v10.8h,v11.8h}, [sp] 1072 add x12, sp, x12, LSL #1 1073 ld1r {v12.8h}, [x12] 1074 st1 {v12.8h}, [x12], #16 1075 st1 {v12.8h}, [x12] 1076 ld1 {v10.8h,v11.8h}, [sp] 1077 add sp, sp, #64 1078 ret 10791: dup v12.8h, v11.h[7] 1080 ret 1081END(prefetch_clampright1) 1082 1083PRIVATE(prefetch_clampright4) 1084 ands x12, x11, #15 1085 beq 1f 1086 sub x12, x12, #4 1087 sub sp, sp, #64 1088 st1 {v10.8h,v11.8h}, [sp] 1089 add x12, sp, x12, LSL #1 1090 ld1r {v12.2d}, [x12] 1091 st1 {v12.8h}, [x12], #16 1092 st1 {v12.8h}, [x12] 1093 ld1 {v10.8h,v11.8h}, [sp] 1094 add sp, sp, #64 1095 ret 10961: dup v12.2d, v11.d[1] 1097 ret 1098END(prefetch_clampright4) 1099 1100 1101/* Helpers for prefetch, below. 
1102 */ 1103.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi 1104 .if \store == 2 1105 .ifc \qsa,\qsb 1106 st1 {\qsa}, [x9], #16 1107 st1 {\qsb}, [x9], #16 1108 .else 1109 st1 {\qsa,\qsb}, [x9], #32 1110 .endif 1111 .elseif \store == 1 1112 bic x9, x9, #0x40 1113 st1 {\qsa}, [x9], #16 1114 mov \qb, \qsb 1115 .elseif \store == 0 1116 mov \qa, \qsa 1117 mov \qb, \qsb 1118 .endif 1119.endm 1120 1121.macro prefetch_one qa, qb, rem, c, store=0, step=1 1122.set i, (need - 16) - \rem 1123.if i >= 0 11241: cmp x10, #i+16 1125 blo 2f 1126 prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1] 1127 b 1f 11282: cmp x11, #i+16 1129 bls 3f 1130 prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1] 1131 bl fetch_generic_asm 1132 b 2f 11333: bl prefetch_clampright\step 1134 prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1] 11354: b 4f+4 1136 //v12 contains pad word from prefetch_clampright call 1137 prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1] 1138 .if \rem > 0 1139 b 4f+4 1140 .else 11411: 11422: 11433: 11444: nop 1145 .endif 1146.endif 1147.endm 1148 1149/* Fill the convolution window with context data. The aim here is to load 1150 * exactly rlf + rrt columns, and in the main loop to read as many columns as 1151 * will be written. This is complicated by the need to handle cases when the 1152 * input starts very close to the left or right (or both) edges of the image, 1153 * and where these do not fall on 16-byte boundaries. 
1154 * 1155 * Input: 1156 * x1 -- src 1157 * x2 -- pitch 1158 * x3 -- count 1159 * x4 -- inlen 1160 * x5 -- r 1161 * x6 -- rup 1162 * x7 -- rdn 1163 * x8 -- rlf 1164 * x9 -- buffer (if needed) 1165 * x13 = -pitch 1166 * x15 = top-row in 1167 * x19 = bottom-row in 1168 * Output: 1169 * x1 += rlf + min(count, rrt) 1170 * Modifies: 1171 * x10 -- fill start index in the window 1172 * x11 -- fill stop index in the window 1173 * x12 -- scratch 1174 */ 1175.macro prefetch step=1, max_r=25 1176.set need, ((\max_r + \max_r) * \step + 15) & ~15 1177 .if \step == 1 1178 mov x10, #need - (\max_r * \step) 1179 sub x10, x10, x8 1180 .else 1181 mov x10, #need - (\max_r * \step) 1182 sub x10, x10, x8, LSL #2 1183 .endif 1184 add x11, x10, x4 1185 subs x11, x11, #need 1186 csel x11, xzr, x11, hi 1187 add x11, x11, #need 1188 1189 bl fetch_generic_asm 1190 .if \step == 1 1191 dup v9.8h, v10.h[0] 1192 .else 1193 dup v9.2d, v10.d[0] 1194 .endif 1195 ands x12, x10, #15 1196 beq 2f 1197 sub sp, sp, #32 1198 st1 {v10.8h,v11.8h}, [sp] 1199 sub x12, sp, x12, LSL #1 1200 sub sp, sp, #16 1201 st1 {v9.8h}, [sp] 1202 sub sp, sp, #16 1203 st1 {v9.8h}, [sp] 1204 ld1 {v10.8h,v11.8h}, [x12] 1205 add sp, sp, #64 1206 sub x1, x1, x10 1207 sub x15, x15, x10 1208 sub x19, x19, x10 1209 bic x10, x10, #15 1210 add x1, x1, x10 1211 add x15, x15, x10 1212 add x19, x19, x10 12132: 1214 .if \step > 1 1215 /* it's only in the uchar2 and uchar4 cases where the register file 1216 * is insufficient (given MAX_R <= 25). 
1217 */ 1218 prefetch_one xx, xx, 192, c=\max_r, step=\step, store=2 1219 prefetch_one xx, xx, 176, c=\max_r, step=\step, store=2 1220 prefetch_one xx, v17.16b, 160, c=\max_r, step=\step, store=1 1221 prefetch_one v18.16b, v19.16b, 144, c=\max_r, step=\step, store=0 1222 prefetch_one v20.16b, v21.16b, 128, c=\max_r, step=\step, store=0 1223 prefetch_one v22.16b, v23.16b, 112, c=\max_r, step=\step, store=0 1224 prefetch_one v24.16b, v25.16b, 96, c=\max_r, step=\step, store=0 1225 prefetch_one v26.16b, v27.16b, 80, c=\max_r, step=\step, store=0 1226 prefetch_one v28.16b, v29.16b, 64, c=\max_r, step=\step, store=0 1227 .endif 1228 prefetch_one v30.16b, v31.16b, 48, c=\max_r, step=\step, store=0 1229 prefetch_one v4.16b, v5.16b, 32, c=\max_r, step=\step, store=0 1230 prefetch_one v6.16b, v7.16b, 16, c=\max_r, step=\step, store=0 1231 prefetch_one v8.16b, v9.16b, 0, c=\max_r, step=\step, store=0 1232 1233 .if \step == 1 1234 add x10, x8, #\max_r * \step 1235 .else 1236 lsl x10, x8, #2 1237 add x10, x10, #\max_r * \step 1238 .endif 1239 subs x4, x4, x10 1240 csel x4, xzr, x4, lo 1241.endm 1242 1243/* The main loop. 1244 * 1245 * Input: 1246 * x0 = dst 1247 * x1 = src 1248 * x2 = pitch 1249 * x3 = count 1250 * x4 = inlen 1251 * x5 = r 1252 * x6 = rup 1253 * x7 = rdn 1254 * x9 = buffer 1255 * x13 = -pitch 1256 * x15 = top-row in 1257 * x19 = bottom-row in 1258 * Modifies 1259 * x8 = fetch code pointer 1260 */ 1261.macro mainloop core, step=1, max_r=25, labelc="", labelnc="" 1262 adrp x8, \labelnc 1263 add x8, x8, #:lo12:\labelnc 1264 sub x8, x8, x5, LSL #5 1265 sub x8, x8, x5, LSL #3 1266 cmp x5, x6 1267 ccmp x5, x7, #0, eq 1268 beq 5f 1269 1270 /* if (r != rup || r != rdn) then the address-clamping table should 1271 * be used rather than the short-cut version. 
1272 */ 1273 adrp x8, \labelc 1274 add x8, x8, #:lo12:\labelc 1275 sub x8, x8, x5, LSL #6 1276 add x8, x8, x5, LSL #3 1277 b 5f 1278 .align 4 12793: fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8 1280 1281 /* For each call to fetch two are made to \core. It would be 1282 * preferable to have twice the work done in \core. 1283 */ 1284 \core 1285 st1 {v15.8b}, [x0], #8 1286 \core 1287 st1 {v15.8b}, [x0], #8 1288 1289 sub x3, x3, #16 12905: subs x4, x4, #16 1291 bhs 3b 1292 adds x4, x4, #16 1293 bne 1f 1294 .if \step==1 1295 dup v10.8h, v9.h[7] 1296 dup v11.8h, v9.h[7] 1297 .else 1298 dup v10.2d, v9.d[1] 1299 dup v11.2d, v9.d[1] 1300 .endif 1301 b 4f 1302 13031: sub x1, x1, #16 1304 sub x15, x15, #16 1305 sub x19, x19, #16 1306 add x1, x1, x4 1307 add x15, x15, x4 1308 add x19, x19, x4 1309 bl fetch_generic_asm 1310 1311 .if \step==1 1312 dup v12.8h, v11.h[7] 1313 .else 1314 dup v12.2d, v11.d[1] 1315 .endif 1316 sub x4, xzr, x4 1317 tbz x4, #3, 1f 1318 mov v10.16b, v11.16b 1319 mov v11.16b, v12.16b 13201: tbz x4, #2, 1f 1321 ext v10.16b, v10.16b, v11.16b, #4*2 1322 ext v11.16b, v11.16b, v12.16b, #4*2 13231: tbz x4, #1, 1f 1324 ext v10.16b, v10.16b, v11.16b, #2*2 1325 ext v11.16b, v11.16b, v12.16b, #2*2 13261: tbz x4, #0, 4f 1327 ext v10.16b, v10.16b, v11.16b, #1*2 1328 ext v11.16b, v11.16b, v12.16b, #1*2 13294: cbz x3, 5f 13303: \core 1331 .if \step==1 1332 dup v11.8h, v11.h[7] 1333 .else 1334 dup v11.2d, v11.d[1] 1335 .endif 1336 subs x3, x3, #8 1337 blo 4f 1338 st1 {v15.8b}, [x0], #8 1339 beq 5f 1340 b 3b 13414: tbz x3, #2, 1f 1342 st1 {v15.s}[0], [x0], #4 1343 ext v15.8b, v15.8b, v15.8b, #4 13441: tbz x3, #1, 1f 1345 st1 {v15.h}[0], [x0], #2 1346 ext v15.8b, v15.8b, v15.8b, #2 13471: tbz x3, #0, 5f 1348 st1 {v15.b}[0], [x0], #1 1349 ext v15.8b, v15.8b, v15.8b, #1 13505: nop 1351.endm 1352 1353.irep r, TUNED_LIST1, 25 1354PRIVATE(convolve1_\r) 1355 stp x29,x30, [sp, #-16]! 
1356 1357 prefetch step=1, max_r=\r 1358 1359 mainloop core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r 1360 1361 ldp x29,x30, [sp], #16 1362 ret 1363END(convolve1_\r) 1364.endr 1365 1366.irep r, TUNED_LIST4, 25 1367PRIVATE(convolve4_\r) 1368 sub x12, sp, #0x040 1369 bic x9, x12, #0x07f 1370 mov sp, x9 1371 stp x12,x30, [sp, #-16]! 1372 1373 /* x9 now points to a buffer on the stack whose address has the low 1374 * 7 bits clear. This allows easy address calculation in the 1375 * wrap-around cases. 1376 */ 1377 1378 1379 prefetch step=4, max_r=\r 1380 1381 mainloop core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r 1382 1383 ldp x12,x30, [sp] 1384 add sp, x12, #0x40 1385 ret 1386END(convolve4_\r) 1387.endr 1388 1389/* void rsdIntrinsicBlurU1_K( 1390 * void *out, // x0 1391 * void *in, // x1 1392 * size_t w, // x2 1393 * size_t h, // x3 1394 * size_t p, // x4 1395 * size_t x, // x5 1396 * size_t y, // x6 1397 * size_t count, // x7 1398 * size_t r, // [sp] 1399 * uint16_t *tab); // [sp,#8] 1400 */ 1401ENTRY(rsdIntrinsicBlurU1_K) 1402 stp x19,x30, [sp, #-16]! 
1403 sub x8, sp, #32 1404 sub sp, sp, #64 1405 st1 {v8.1d - v11.1d}, [sp] 1406 st1 {v12.1d - v15.1d}, [x8] 1407 mov x8, x5 // x 1408 ldr w5, [sp,#80] // r 1409 sub x9, x2, x8 1410 sub x10, x3, x6 1411 mov x2, x4 // pitch 1412 mov x3, x7 // count 1413 sub x7, x10, #1 1414 sub x9, x9, x3 1415 1416 ldr x12, [sp, #88] // tab 1417 1418 add x1, x1, x8 1419 1420 cmp x6, x5 1421 csel x6, x5, x6, hs 1422 cmp x7, x5 1423 csel x7, x5, x7, hs 1424 cmp x8, x5 1425 csel x8, x5, x8, hs 1426 cmp x9, x5 1427 csel x9, x5, x9, hs 1428 1429 add x4, x8, x9 1430 add x4, x4, x3 1431 1432 sub x1, x1, x8 1433 1434 sub x13, xzr, x2 1435 msub x15, x2, x6, x1 1436 madd x19, x2, x7, x1 1437 1438 ld1 {v0.8h,v1.8h}, [x12], #32 1439 ld1 {v2.8h,v3.8h}, [x12], #32 1440 1441 adr x30, 1f 1442 .irep r, TUNED_LIST1 1443 cmp x5, #\r 1444 bls convolve1_\r 1445 .endr 1446 b convolve1_25 1447 14481: ld1 {v8.1d - v11.1d}, [sp], #32 1449 ld1 {v12.1d - v15.1d}, [sp], #32 1450 ldp x19,x30, [sp], #16 1451 ret 1452END(rsdIntrinsicBlurU1_K) 1453 1454/* void rsdIntrinsicBlurU4_K( 1455 * void *out, // x0 1456 * void *in, // x1 1457 * size_t w, // x2 1458 * size_t h, // x3 1459 * size_t p, // x4 1460 * size_t x, // x5 1461 * size_t y, // x6 1462 * size_t count, // x7 1463 * size_t r, // [sp] 1464 * uint16_t *tab); // [sp,#8] 1465 */ 1466ENTRY(rsdIntrinsicBlurU4_K) 1467 stp x19,x30, [sp, #-16]! 
1468 sub x8, sp, #32 1469 sub sp, sp, #64 1470 st1 {v8.1d - v11.1d}, [sp] 1471 st1 {v12.1d - v15.1d}, [x8] 1472 mov x8, x5 // x 1473 ldr w5, [sp,#80] // r 1474 sub x9, x2, x8 1475 sub x10, x3, x6 1476 mov x2, x4 // pitch 1477 mov x3, x7 // count 1478 sub x7, x10, #1 1479 sub x9, x9, x3 1480 1481 ldr x12, [sp, #88] 1482 1483 add x1, x1, x8, LSL #2 1484 1485 cmp x6, x5 1486 csel x6, x5, x6, hs 1487 cmp x7, x5 1488 csel x7, x5, x7, hs 1489 cmp x8, x5 1490 csel x8, x5, x8, hs 1491 cmp x9, x5 1492 csel x9, x5, x9, hs 1493 1494 lsl x3, x3, #2 1495 add x4, x8, x9 1496 add x4, x3, x4, LSL #2 1497 1498 sub x1, x1, x8, LSL #2 1499 1500 sub x13, xzr, x2 1501 msub x15, x2, x6, x1 1502 madd x19, x2, x7, x1 1503 1504 ld1 {v0.8h,v1.8h}, [x12], #32 1505 ld1 {v2.8h,v3.8h}, [x12], #32 1506 1507 adr x30, 1f 1508 .irep r, TUNED_LIST4 1509 cmp x5, #\r 1510 bls convolve4_\r 1511 .endr 1512 b convolve4_25 1513 15141: ld1 {v8.1d - v11.1d}, [sp], #32 1515 ld1 {v12.1d - v15.1d}, [sp], #32 1516 ldp x19,x30, [sp], #16 1517 ret 1518END(rsdIntrinsicBlurU4_K) 1519