/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* AArch64 GNU as source.  Note that in AArch64 gas `;` is a statement
 * separator, not a comment; comments below use C-style or `//` syntax. */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define PRIVATE(f) .text; .align 4; .type f,#function; f:
#define END(f) .size f, .-f;

/* Number of fixed-point fraction bits in the coefficient table (q0-q3), and
 * the largest supported convolution radius. */
.set FRACTION_BITS, 7
.set MAX_R, 25


/* A quick way of making a line of code conditional on some other condition.
 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with
 * `ifcc`:
 */
.macro ifcc zzz:vararg
.if cc
            \zzz
.endif
.endm

/* Fetch 16 columns of bytes (regardless of image format), convolve these
 * vertically, and leave them in the register file.  If working near the top
 * or bottom of an image then clamp the addressing while loading the data in.
 *
 * The convolution is fully unrolled for windows up to max_r, with the
 * outermost edges calculated first.  This way it's possible to branch
 * directly into the relevant part of the code for an arbitrary convolution
 * radius.  Two variants of the loop are produced; one eliminates the clamping
 * code for a slight speed advantage.
 *
 * Where the macro is called with reg=x, the specified register is taken to
 * contain a pre-calculated pointer into one of the two loops.
 *
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x12 -- switch index
 *      q0-q3 -- coefficient table
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += 16
 *      q10,q11 -- 16 convolved columns
 * Modifies:
 *      x10 = upper row pointer
 *      x11 = lower row pointer
 *      q12-q15 = temporary sums
 */
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
            /* When the default switch register (x12) is used, `cc` is set and
             * the entry pointer is computed here from x5 (the radius); the
             * caller otherwise supplies a pre-computed pointer in \reg. */
            .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif

            ld1         {v15.16b}, [x1], #16
            mov         x10, x15                // x10 walks down from the top row

            uxtl        v14.8h, v15.8b          // widen the 16 source bytes to u16
//          prfm        PLDL1KEEP,[x1, #16]     // TODO: confirm
            uxtl2       v15.8h, v15.16b
    .if \max_r < 16 // approximate
  ifcc      adr         \reg, 1f                // short-range: adr reaches the loop directly
    .else
  ifcc      adrp        \reg, 1f                // long-range: adrp+add (unrolled body > 1MB away is possible)
  ifcc      add         \reg, \reg, #:lo12:1f
    .endif

            /* Centre tap: coefficient 0 applied to the current row. */
            umull       v12.4s, v14.4h, v0.h[0]
  ifcc      sub         \reg, \reg, x5, LSL #6  // step back r * 64 bytes (each tap emits 16 instructions)...
            umull2      v13.4s, v14.8h, v0.h[0]
            mov         x11, x19                // x11 walks up from the bottom row
            umull       v14.4s, v15.4h, v0.h[0]
  ifcc      add         \reg, \reg, x5, LSL #3  // ...plus r * 8: net -56 bytes per tap
            umull2      v15.4s, v15.8h, v0.h[0]
            br          \reg

  /* Two copies of the unrolled tap loop: rowclamp==1 emits the edge-clamping
   * variant (labelled \labelc), rowclamp==0 the fast unclamped variant
   * (labelled \labelnc).  Taps are emitted from the outermost (i == max_r)
   * inwards so a branch into the middle skips the unwanted outer taps. */
  .irp rowclamp, 1, 0
    .set cc, \rowclamp
    .align 4
    .irp dreg, 4, 3, 2, 1, 0 ; .irp lane, 7, 6, 5, 4, 3, 2, 1, 0 ; .irp doth, .h
      .set i, \dreg * 8 + \lane
      .if 0 < i && i <= \max_r
            ld1         {v10.16b}, [x10], x2    // row i above, then step down
  ifcc      cmp         x6, #i
            ld1         {v11.16b}, [x11], x13   // row i below, then step up
  ifcc      csel        x10, x15, x10, lo       // clamp to top row when rup < i
            uaddl       v16.8h, v10.8b, v11.8b  // symmetric taps share a coefficient: add first
  ifcc      cmp         x7, #i
            uaddl2      v11.8h, v10.16b, v11.16b
  ifcc      csel        x11, x19, x11, lo       // clamp to bottom row when rdn < i
            umlal       v12.4s, v16.4h, v\dreg\doth[\lane]
            umlal2      v13.4s, v16.8h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x10, #32]    // TODO: confirm
            nop
            umlal       v14.4s, v11.4h, v\dreg\doth[\lane]
//          prfm        PLDL1KEEP,[x11, #32]    // TODO: confirm
            nop
            umlal2      v15.4s, v11.8h, v\dreg\doth[\lane]
      .endif
    .endr ; .endr ; .endr
    .if \rowclamp == 1
      1:
      \labelc :
            b           2f
    .else
      2:
      \labelnc :
    .endif
  .endr

            /* Narrow the 32-bit sums back to u16 columns with rounding. */
            uqrshrn     v10.4h, v12.4s, #16 - FRACTION_BITS
            add         x15, x15, #16
            uqrshrn2    v10.8h, v13.4s, #16 - FRACTION_BITS
            add         x19, x19, #16
            uqrshrn     v11.4h, v14.4s, #16 - FRACTION_BITS
            uqrshrn2    v11.8h, v15.4s, #16 - FRACTION_BITS
.endm /*}}}*/

/* Some portion of the convolution window (as much as will fit, and all of it
 * for the uchar1 cases) is kept in the register file to avoid unnecessary
 * memory accesses.  This forces the horizontal loops to be unrolled because
 * there's no indexed addressing into the register file.
 *
 * As in the fetch macro, the operations are ordered from outside to inside, so
 * that jumping into the middle of the block bypasses the unwanted window taps.
 *
 * There are several variants of the macro because of the fixed offsets of the
 * taps -- the wider the maximum radius the further the centre tap is from the
 * most recently fetched data.  This means that pre-filling the window requires
 * more data that won't be used and it means that rotating the window involves
 * more mov operations.
 *
 * When the buffer gets too big the buffer at [x9] is used.
 *
 * Input:
 *      q4-q11 -- convolution window
 *      x9 -- pointer to additional convolution window data
 * Output:
 *      x9 -- updated buffer pointer (if used)
 *      d31 -- result to be stored
 * Modifies:
 *      x12 -- temp buffer pointer
 *      q12-q13 -- temporaries for load and vext operations.
157 * q14-q15 -- intermediate sums 158 */ 159#define TUNED_LIST1 8, 16 160.macro hconv1_8/*{{{*/ 161 umull v14.4s, v9.4h, v0.h[0] 162 umull2 v15.4s, v9.8h, v0.h[0] 163 164 adr x16, 100f 165 ldrsh x12, [x16, x5, LSL #1] 166 add x12, x12, x16 167 br x12 168 100: .hword -4 169 .hword 101f-100b 170 .hword 102f-100b 171 .hword 103f-100b 172 .hword 104f-100b 173 .hword 105f-100b 174 .hword 106f-100b 175 .hword 107f-100b 176 .hword 108f-100b 177 .align 4 178 108: umlal v14.4s, v8.4h, v1.h[0] 179 umlal2 v15.4s, v8.8h, v1.h[0] 180 umlal v14.4s, v10.4h, v1.h[0] 181 umlal2 v15.4s, v10.8h, v1.h[0] 182 107: ext v12.16b, v8.16b, v9.16b, #1*2 183 ext v13.16b, v9.16b, v10.16b, #7*2 184 umlal v14.4s, v12.4h, v0.h[7] 185 umlal2 v15.4s, v12.8h, v0.h[7] 186 umlal v14.4s, v13.4h, v0.h[7] 187 umlal2 v15.4s, v13.8h, v0.h[7] 188 106: ext v12.16b, v8.16b, v9.16b, #2*2 189 ext v13.16b, v9.16b, v10.16b, #6*2 190 umlal v14.4s, v12.4h, v0.h[6] 191 umlal2 v15.4s, v12.8h, v0.h[6] 192 umlal v14.4s, v13.4h, v0.h[6] 193 umlal2 v15.4s, v13.8h, v0.h[6] 194 105: ext v12.16b, v8.16b, v9.16b, #3*2 195 ext v13.16b, v9.16b, v10.16b, #5*2 196 umlal v14.4s, v12.4h, v0.h[5] 197 umlal2 v15.4s, v12.8h, v0.h[5] 198 umlal v14.4s, v13.4h, v0.h[5] 199 umlal2 v15.4s, v13.8h, v0.h[5] 200 104: //ext v12.16b, v8.16b, v9.16b, #4*2 201 //ext v13.16b, v9.16b, v10.16b, #4*2 202 umlal2 v14.4s, v8.8h, v0.h[4] 203 umlal v15.4s, v9.4h, v0.h[4] 204 umlal2 v14.4s, v9.8h, v0.h[4] 205 umlal v15.4s, v10.4h, v0.h[4] 206 103: ext v12.16b, v8.16b, v9.16b, #5*2 207 ext v13.16b, v9.16b, v10.16b, #3*2 208 umlal v14.4s, v12.4h, v0.h[3] 209 umlal2 v15.4s, v12.8h, v0.h[3] 210 umlal v14.4s, v13.4h, v0.h[3] 211 umlal2 v15.4s, v13.8h, v0.h[3] 212 102: ext v12.16b, v8.16b, v9.16b, #6*2 213 ext v13.16b, v9.16b, v10.16b, #2*2 214 umlal v14.4s, v12.4h, v0.h[2] 215 umlal2 v15.4s, v12.8h, v0.h[2] 216 umlal v14.4s, v13.4h, v0.h[2] 217 umlal2 v15.4s, v13.8h, v0.h[2] 218 101: ext v12.16b, v8.16b, v9.16b, #7*2 219 ext v13.16b, v9.16b, v10.16b, #1*2 220 
umlal v14.4s, v12.4h, v0.h[1] 221 umlal2 v15.4s, v12.8h, v0.h[1] 222 umlal v14.4s, v13.4h, v0.h[1] 223 umlal2 v15.4s, v13.8h, v0.h[1] 224 225 uqrshrn v14.4h, v14.4s, #16 226 uqrshrn2 v14.8h, v15.4s, #16 227 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 228 229 mov v8.16b, v9.16b 230 mov v9.16b, v10.16b 231 mov v10.16b, v11.16b 232.endm/*}}}*/ 233 234.macro hconv1_16/*{{{*/ 235 umull v14.4s, v8.4h, v0.h[0] 236 umull2 v15.4s, v8.8h, v0.h[0] 237 238 adr x16, 100f 239 ldrsh x12, [x16, x5, LSL #1] 240 add x12, x12, x16 241 br x12 242 100: .hword -4 243 .hword 101f-100b 244 .hword 102f-100b 245 .hword 103f-100b 246 .hword 104f-100b 247 .hword 105f-100b 248 .hword 106f-100b 249 .hword 107f-100b 250 .hword 108f-100b 251 .hword 109f-100b 252 .hword 110f-100b 253 .hword 111f-100b 254 .hword 112f-100b 255 .hword 113f-100b 256 .hword 114f-100b 257 .hword 115f-100b 258 .hword 116f-100b 259 .align 4 260 116: //ext v12.16b, v6.16b, v7.16b, #0*2 261 //ext v13.16b, v10.16b, v11.16b, #0*2 262 umlal v14.4s, v6.4h, v2.h[0] 263 umlal2 v15.4s, v6.8h, v2.h[0] 264 umlal v14.4s, v10.4h, v2.h[0] 265 umlal2 v15.4s, v10.8h, v2.h[0] 266 115: ext v12.16b, v6.16b, v7.16b, #1*2 267 ext v13.16b, v9.16b, v10.16b, #7*2 268 umlal v14.4s, v12.4h, v1.h[7] 269 umlal2 v15.4s, v12.8h, v1.h[7] 270 umlal v14.4s, v13.4h, v1.h[7] 271 umlal2 v15.4s, v13.8h, v1.h[7] 272 114: ext v12.16b, v6.16b, v7.16b, #2*2 273 ext v13.16b, v9.16b, v10.16b, #6*2 274 umlal v14.4s, v12.4h, v1.h[6] 275 umlal2 v15.4s, v12.8h, v1.h[6] 276 umlal v14.4s, v13.4h, v1.h[6] 277 umlal2 v15.4s, v13.8h, v1.h[6] 278 113: ext v12.16b, v6.16b, v7.16b, #3*2 279 ext v13.16b, v9.16b, v10.16b, #5*2 280 umlal v14.4s, v12.4h, v1.h[5] 281 umlal2 v15.4s, v12.8h, v1.h[5] 282 umlal v14.4s, v13.4h, v1.h[5] 283 umlal2 v15.4s, v13.8h, v1.h[5] 284 112: //ext v12.16b, v6.16b, v7.16b, #4*2 285 //ext v13.16b, v9.16b, v10.16b, #4*2 286 umlal2 v14.4s, v6.8h, v1.h[4] 287 umlal v15.4s, v7.4h, v1.h[4] 288 umlal2 v14.4s, v9.8h, v1.h[4] 289 umlal v15.4s, v10.4h, v1.h[4] 290 
111: ext v12.16b, v6.16b, v7.16b, #5*2 291 ext v13.16b, v9.16b, v10.16b, #3*2 292 umlal v14.4s, v12.4h, v1.h[3] 293 umlal2 v15.4s, v12.8h, v1.h[3] 294 umlal v14.4s, v13.4h, v1.h[3] 295 umlal2 v15.4s, v13.8h, v1.h[3] 296 110: ext v12.16b, v6.16b, v7.16b, #6*2 297 ext v13.16b, v9.16b, v10.16b, #2*2 298 umlal v14.4s, v12.4h, v1.h[2] 299 umlal2 v15.4s, v12.8h, v1.h[2] 300 umlal v14.4s, v13.4h, v1.h[2] 301 umlal2 v15.4s, v13.8h, v1.h[2] 302 109: ext v12.16b, v6.16b, v7.16b, #7*2 303 ext v13.16b, v9.16b, v10.16b, #1*2 304 umlal v14.4s, v12.4h, v1.h[1] 305 umlal2 v15.4s, v12.8h, v1.h[1] 306 umlal v14.4s, v13.4h, v1.h[1] 307 umlal2 v15.4s, v13.8h, v1.h[1] 308 108: //ext v12.16b, v7.16b, v8.16b, #0*2 309 //ext v13.16b, v9.16b, v10.16b, #0*2 310 umlal v14.4s, v7.4h, v1.h[0] 311 umlal2 v15.4s, v7.8h, v1.h[0] 312 umlal v14.4s, v9.4h, v1.h[0] 313 umlal2 v15.4s, v9.8h, v1.h[0] 314 107: ext v12.16b, v7.16b, v8.16b, #1*2 315 ext v13.16b, v8.16b, v9.16b, #7*2 316 umlal v14.4s, v12.4h, v0.h[7] 317 umlal2 v15.4s, v12.8h, v0.h[7] 318 umlal v14.4s, v13.4h, v0.h[7] 319 umlal2 v15.4s, v13.8h, v0.h[7] 320 106: ext v12.16b, v7.16b, v8.16b, #2*2 321 ext v13.16b, v8.16b, v9.16b, #6*2 322 umlal v14.4s, v12.4h, v0.h[6] 323 umlal2 v15.4s, v12.8h, v0.h[6] 324 umlal v14.4s, v13.4h, v0.h[6] 325 umlal2 v15.4s, v13.8h, v0.h[6] 326 105: ext v12.16b, v7.16b, v8.16b, #3*2 327 ext v13.16b, v8.16b, v9.16b, #5*2 328 umlal v14.4s, v12.4h, v0.h[5] 329 umlal2 v15.4s, v12.8h, v0.h[5] 330 umlal v14.4s, v13.4h, v0.h[5] 331 umlal2 v15.4s, v13.8h, v0.h[5] 332 104: //ext v12.16b, v7.16b, v8.16b, #4*2 333 //ext v13.16b, v8.16b, v9.16b, #4*2 334 umlal2 v14.4s, v7.8h, v0.h[4] 335 umlal v15.4s, v8.4h, v0.h[4] 336 umlal2 v14.4s, v8.8h, v0.h[4] 337 umlal v15.4s, v9.4h, v0.h[4] 338 103: ext v12.16b, v7.16b, v8.16b, #5*2 339 ext v13.16b, v8.16b, v9.16b, #3*2 340 umlal v14.4s, v12.4h, v0.h[3] 341 umlal2 v15.4s, v12.8h, v0.h[3] 342 umlal v14.4s, v13.4h, v0.h[3] 343 umlal2 v15.4s, v13.8h, v0.h[3] 344 102: ext v12.16b, 
v7.16b, v8.16b, #6*2 345 ext v13.16b, v8.16b, v9.16b, #2*2 346 umlal v14.4s, v12.4h, v0.h[2] 347 umlal2 v15.4s, v12.8h, v0.h[2] 348 umlal v14.4s, v13.4h, v0.h[2] 349 umlal2 v15.4s, v13.8h, v0.h[2] 350 101: ext v12.16b, v7.16b, v8.16b, #7*2 351 ext v13.16b, v8.16b, v9.16b, #1*2 352 umlal v14.4s, v12.4h, v0.h[1] 353 umlal2 v15.4s, v12.8h, v0.h[1] 354 umlal v14.4s, v13.4h, v0.h[1] 355 umlal2 v15.4s, v13.8h, v0.h[1] 356 357 uqrshrn v14.4h, v14.4s, #16 358 uqrshrn2 v14.8h, v15.4s, #16 359 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 360 361 mov v6.16b, v7.16b 362 mov v7.16b, v8.16b 363 mov v8.16b, v9.16b 364 mov v9.16b, v10.16b 365 mov v10.16b, v11.16b 366.endm/*}}}*/ 367 368.macro hconv1_25/*{{{*/ 369 ext v12.16b, v6.16b, v7.16b, #7*2 370 umull v14.4s, v12.4h, v0.h[0] 371 umull2 v15.4s, v12.8h, v0.h[0] 372 373 adr x16, 100f 374 ldrsh x12, [x16, x5, LSL #1] 375 add x12, x12, x16 376 br x12 377 100: .hword -4 378 .hword 101f-100b 379 .hword 102f-100b 380 .hword 103f-100b 381 .hword 104f-100b 382 .hword 105f-100b 383 .hword 106f-100b 384 .hword 107f-100b 385 .hword 108f-100b 386 .hword 109f-100b 387 .hword 110f-100b 388 .hword 111f-100b 389 .hword 112f-100b 390 .hword 113f-100b 391 .hword 114f-100b 392 .hword 115f-100b 393 .hword 116f-100b 394 .hword 117f-100b 395 .hword 118f-100b 396 .hword 119f-100b 397 .hword 120f-100b 398 .hword 121f-100b 399 .hword 122f-100b 400 .hword 123f-100b 401 .hword 124f-100b 402 .hword 125f-100b 403 .align 4 404 125: ext v12.16b, v31.16b, v4.16b, #6*2 405 ext v13.16b, v10.16b, v11.16b, #0*2 406 umlal v14.4s, v12.4h, v3.h[1] 407 umlal2 v15.4s, v12.8h, v3.h[1] 408 umlal v14.4s, v13.4h, v3.h[1] 409 umlal2 v15.4s, v13.8h, v3.h[1] 410 124: ext v12.16b, v3.16b, v4.16b, #7*2 411 ext v13.16b, v9.16b, v10.16b, #7*2 412 umlal v14.4s, v12.4h, v3.h[0] 413 umlal2 v15.4s, v12.8h, v3.h[0] 414 umlal v14.4s, v13.4h, v3.h[0] 415 umlal2 v15.4s, v13.8h, v3.h[0] 416 123: ext v12.16b, v4.16b, v5.16b, #0*2 417 ext v13.16b, v9.16b, v10.16b, #6*2 418 umlal v14.4s, v12.4h, 
v2.h[7] 419 umlal2 v15.4s, v12.8h, v2.h[7] 420 umlal v14.4s, v13.4h, v2.h[7] 421 umlal2 v15.4s, v13.8h, v2.h[7] 422 122: ext v12.16b, v4.16b, v5.16b, #1*2 423 ext v13.16b, v9.16b, v10.16b, #5*2 424 umlal v14.4s, v12.4h, v2.h[6] 425 umlal2 v15.4s, v12.8h, v2.h[6] 426 umlal v14.4s, v13.4h, v2.h[6] 427 umlal2 v15.4s, v13.8h, v2.h[6] 428 121: ext v12.16b, v4.16b, v5.16b, #2*2 429 ext v13.16b, v9.16b, v10.16b, #4*2 430 umlal v14.4s, v12.4h, v2.h[5] 431 umlal2 v15.4s, v12.8h, v2.h[5] 432 umlal v14.4s, v13.4h, v2.h[5] 433 umlal2 v15.4s, v13.8h, v2.h[5] 434 120: ext v12.16b, v4.16b, v5.16b, #3*2 435 ext v13.16b, v9.16b, v10.16b, #3*2 436 umlal v14.4s, v12.4h, v2.h[4] 437 umlal2 v15.4s, v12.8h, v2.h[4] 438 umlal v14.4s, v13.4h, v2.h[4] 439 umlal2 v15.4s, v13.8h, v2.h[4] 440 119: ext v12.16b, v4.16b, v5.16b, #4*2 441 ext v13.16b, v9.16b, v10.16b, #2*2 442 umlal v14.4s, v12.4h, v2.h[3] 443 umlal2 v15.4s, v12.8h, v2.h[3] 444 umlal v14.4s, v13.4h, v2.h[3] 445 umlal2 v15.4s, v13.8h, v2.h[3] 446 118: ext v12.16b, v4.16b, v5.16b, #5*2 447 ext v13.16b, v9.16b, v10.16b, #1*2 448 umlal v14.4s, v12.4h, v2.h[2] 449 umlal2 v15.4s, v12.8h, v2.h[2] 450 umlal v14.4s, v13.4h, v2.h[2] 451 umlal2 v15.4s, v13.8h, v2.h[2] 452 117: ext v12.16b, v4.16b, v5.16b, #6*2 453 ext v13.16b, v9.16b, v10.16b, #0*2 454 umlal v14.4s, v12.4h, v2.h[1] 455 umlal2 v15.4s, v12.8h, v2.h[1] 456 umlal v14.4s, v13.4h, v2.h[1] 457 umlal2 v15.4s, v13.8h, v2.h[1] 458 116: ext v12.16b, v4.16b, v5.16b, #7*2 459 ext v13.16b, v8.16b, v9.16b, #7*2 460 umlal v14.4s, v12.4h, v2.h[0] 461 umlal2 v15.4s, v12.8h, v2.h[0] 462 umlal v14.4s, v13.4h, v2.h[0] 463 umlal2 v15.4s, v13.8h, v2.h[0] 464 115: ext v12.16b, v5.16b, v6.16b, #0*2 465 ext v13.16b, v8.16b, v9.16b, #6*2 466 umlal v14.4s, v12.4h, v1.h[7] 467 umlal2 v15.4s, v12.8h, v1.h[7] 468 umlal v14.4s, v13.4h, v1.h[7] 469 umlal2 v15.4s, v13.8h, v1.h[7] 470 114: ext v12.16b, v5.16b, v6.16b, #1*2 471 ext v13.16b, v8.16b, v9.16b, #5*2 472 umlal v14.4s, v12.4h, v1.h[6] 473 umlal2 
v15.4s, v12.8h, v1.h[6] 474 umlal v14.4s, v13.4h, v1.h[6] 475 umlal2 v15.4s, v13.8h, v1.h[6] 476 113: ext v12.16b, v5.16b, v6.16b, #2*2 477 ext v13.16b, v8.16b, v9.16b, #4*2 478 umlal v14.4s, v12.4h, v1.h[5] 479 umlal2 v15.4s, v12.8h, v1.h[5] 480 umlal v14.4s, v13.4h, v1.h[5] 481 umlal2 v15.4s, v13.8h, v1.h[5] 482 112: ext v12.16b, v5.16b, v6.16b, #3*2 483 ext v13.16b, v8.16b, v9.16b, #3*2 484 umlal v14.4s, v12.4h, v1.h[4] 485 umlal2 v15.4s, v12.8h, v1.h[4] 486 umlal v14.4s, v13.4h, v1.h[4] 487 umlal2 v15.4s, v13.8h, v1.h[4] 488 111: ext v12.16b, v5.16b, v6.16b, #4*2 489 ext v13.16b, v8.16b, v9.16b, #2*2 490 umlal v14.4s, v12.4h, v1.h[3] 491 umlal2 v15.4s, v12.8h, v1.h[3] 492 umlal v14.4s, v13.4h, v1.h[3] 493 umlal2 v15.4s, v13.8h, v1.h[3] 494 110: ext v12.16b, v5.16b, v6.16b, #5*2 495 ext v13.16b, v8.16b, v9.16b, #1*2 496 umlal v14.4s, v12.4h, v1.h[2] 497 umlal2 v15.4s, v12.8h, v1.h[2] 498 umlal v14.4s, v13.4h, v1.h[2] 499 umlal2 v15.4s, v13.8h, v1.h[2] 500 109: ext v12.16b, v5.16b, v6.16b, #6*2 501 ext v13.16b, v8.16b, v9.16b, #0*2 502 umlal v14.4s, v12.4h, v1.h[1] 503 umlal2 v15.4s, v12.8h, v1.h[1] 504 umlal v14.4s, v13.4h, v1.h[1] 505 umlal2 v15.4s, v13.8h, v1.h[1] 506 108: ext v12.16b, v5.16b, v6.16b, #7*2 507 ext v13.16b, v7.16b, v8.16b, #7*2 508 umlal v14.4s, v12.4h, v1.h[0] 509 umlal2 v15.4s, v12.8h, v1.h[0] 510 umlal v14.4s, v13.4h, v1.h[0] 511 umlal2 v15.4s, v13.8h, v1.h[0] 512 107: ext v12.16b, v6.16b, v7.16b, #0*2 513 ext v13.16b, v7.16b, v8.16b, #6*2 514 umlal v14.4s, v12.4h, v0.h[7] 515 umlal2 v15.4s, v12.8h, v0.h[7] 516 umlal v14.4s, v13.4h, v0.h[7] 517 umlal2 v15.4s, v13.8h, v0.h[7] 518 106: ext v12.16b, v6.16b, v7.16b, #1*2 519 ext v13.16b, v7.16b, v8.16b, #5*2 520 umlal v14.4s, v12.4h, v0.h[6] 521 umlal2 v15.4s, v12.8h, v0.h[6] 522 umlal v14.4s, v13.4h, v0.h[6] 523 umlal2 v15.4s, v13.8h, v0.h[6] 524 105: ext v12.16b, v6.16b, v7.16b, #2*2 525 ext v13.16b, v7.16b, v8.16b, #4*2 526 umlal v14.4s, v12.4h, v0.h[5] 527 umlal2 v15.4s, v12.8h, v0.h[5] 528 
umlal v14.4s, v13.4h, v0.h[5] 529 umlal2 v15.4s, v13.8h, v0.h[5] 530 104: ext v12.16b, v6.16b, v7.16b, #3*2 531 ext v13.16b, v7.16b, v8.16b, #3*2 532 umlal v14.4s, v12.4h, v0.h[4] 533 umlal2 v15.4s, v12.8h, v0.h[4] 534 umlal v14.4s, v13.4h, v0.h[4] 535 umlal2 v15.4s, v13.8h, v0.h[4] 536 103: ext v12.16b, v6.16b, v7.16b, #4*2 537 ext v13.16b, v7.16b, v8.16b, #2*2 538 umlal v14.4s, v12.4h, v0.h[3] 539 umlal2 v15.4s, v12.8h, v0.h[3] 540 umlal v14.4s, v13.4h, v0.h[3] 541 umlal2 v15.4s, v13.8h, v0.h[3] 542 102: ext v12.16b, v6.16b, v7.16b, #5*2 543 ext v13.16b, v7.16b, v8.16b, #1*2 544 umlal v14.4s, v12.4h, v0.h[2] 545 umlal2 v15.4s, v12.8h, v0.h[2] 546 umlal v14.4s, v13.4h, v0.h[2] 547 umlal2 v15.4s, v13.8h, v0.h[2] 548 101: ext v12.16b, v6.16b, v7.16b, #6*2 549 ext v13.16b, v7.16b, v8.16b, #0*2 550 umlal v14.4s, v12.4h, v0.h[1] 551 umlal2 v15.4s, v12.8h, v0.h[1] 552 umlal v14.4s, v13.4h, v0.h[1] 553 umlal2 v15.4s, v13.8h, v0.h[1] 554 555 uqrshrn v14.4h, v14.4s, #16 556 uqrshrn2 v14.8h, v15.4s, #16 557 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 558 559 mov v31.16b, v4.16b 560 mov v4.16b, v5.16b 561 mov v5.16b, v6.16b 562 mov v6.16b, v7.16b 563 mov v7.16b, v8.16b 564 mov v8.16b, v9.16b 565 mov v9.16b, v10.16b 566 mov v10.16b, v11.16b 567.endm/*}}}*/ 568 569#define TUNED_LIST4 6, 12, 20 570.macro hconv4_6/*{{{*/ 571 umull v14.4s, v7.4h, v0.h[0] 572 umull2 v15.4s, v7.8h, v0.h[0] 573 574 adr x16, 100f 575 ldrsh x12, [x16, x5, LSL #1] 576 add x12, x12, x16 577 br x12 578 100: .hword -4 579 .hword 101f-100b 580 .hword 102f-100b 581 .hword 103f-100b 582 .hword 104f-100b 583 .hword 105f-100b 584 .hword 106f-100b 585 .align 4 586 106: umlal v14.4s, v4.4h, v0.h[6] 587 umlal2 v15.4s, v4.8h, v0.h[6] 588 umlal v14.4s, v10.4h, v0.h[6] 589 umlal2 v15.4s, v10.8h, v0.h[6] 590 105: umlal2 v14.4s, v4.8h, v0.h[5] 591 umlal v15.4s, v5.4h, v0.h[5] 592 umlal2 v14.4s, v9.8h, v0.h[5] 593 umlal v15.4s, v10.4h, v0.h[5] 594 104: umlal v14.4s, v5.4h, v0.h[4] 595 umlal2 v15.4s, v5.8h, v0.h[4] 596 umlal 
v14.4s, v9.4h, v0.h[4] 597 umlal2 v15.4s, v9.8h, v0.h[4] 598 103: umlal2 v14.4s, v5.8h, v0.h[3] 599 umlal v15.4s, v6.4h, v0.h[3] 600 umlal2 v14.4s, v8.8h, v0.h[3] 601 umlal v15.4s, v9.4h, v0.h[3] 602 102: umlal v14.4s, v6.4h, v0.h[2] 603 umlal2 v15.4s, v6.8h, v0.h[2] 604 umlal v14.4s, v8.4h, v0.h[2] 605 umlal2 v15.4s, v8.8h, v0.h[2] 606 101: umlal2 v14.4s, v6.8h, v0.h[1] 607 umlal v15.4s, v7.4h, v0.h[1] 608 umlal2 v14.4s, v7.8h, v0.h[1] 609 umlal v15.4s, v8.4h, v0.h[1] 610 611 uqrshrn v14.4h, v14.4s, #16 612 uqrshrn2 v14.8h, v15.4s, #16 613 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 614 615 mov v4.16b, v5.16b 616 mov v5.16b, v6.16b 617 mov v6.16b, v7.16b 618 mov v7.16b, v8.16b 619 mov v8.16b, v9.16b 620 mov v9.16b, v10.16b 621 mov v10.16b, v11.16b 622.endm/*}}}*/ 623 624.macro hconv4_12/*{{{*/ 625 umull v14.4s, v4.4h, v0.h[0] 626 umull2 v15.4s, v4.8h, v0.h[0] 627 628 adr x16, 100f 629 ldrsh x12, [x16, x5, LSL #1] 630 add x12, x12, x16 631 br x12 632 100: .hword -4 633 .hword 101f-100b 634 .hword 102f-100b 635 .hword 103f-100b 636 .hword 104f-100b 637 .hword 105f-100b 638 .hword 106f-100b 639 .hword 107f-100b 640 .hword 108f-100b 641 .hword 109f-100b 642 .hword 110f-100b 643 .hword 111f-100b 644 .hword 112f-100b 645 .align 4 646 112: umlal v14.4s, v26.4h, v1.h[4] 647 umlal2 v15.4s, v26.8h, v1.h[4] 648 umlal v14.4s, v10.4h, v1.h[4] 649 umlal2 v15.4s, v10.8h, v1.h[4] 650 111: umlal2 v14.4s, v26.8h, v1.h[3] 651 umlal v15.4s, v27.4h, v1.h[3] 652 umlal2 v14.4s, v9.8h, v1.h[3] 653 umlal v15.4s, v10.4h, v1.h[3] 654 110: umlal v14.4s, v27.4h, v1.h[2] 655 umlal2 v15.4s, v27.8h, v1.h[2] 656 umlal v14.4s, v9.4h, v1.h[2] 657 umlal2 v15.4s, v9.8h, v1.h[2] 658 109: umlal2 v14.4s, v27.8h, v1.h[1] 659 umlal v15.4s, v28.4h, v1.h[1] 660 umlal2 v14.4s, v8.8h, v1.h[1] 661 umlal v15.4s, v9.4h, v1.h[1] 662 108: umlal v14.4s, v28.4h, v1.h[0] 663 umlal2 v15.4s, v28.8h, v1.h[0] 664 umlal v14.4s, v8.4h, v1.h[0] 665 umlal2 v15.4s, v8.8h, v1.h[0] 666 107: umlal2 v14.4s, v28.8h, v0.h[7] 667 umlal 
v15.4s, v29.4h, v0.h[7] 668 umlal2 v14.4s, v7.8h, v0.h[7] 669 umlal v15.4s, v8.4h, v0.h[7] 670 106: umlal v14.4s, v29.4h, v0.h[6] 671 umlal2 v15.4s, v29.8h, v0.h[6] 672 umlal v14.4s, v7.4h, v0.h[6] 673 umlal2 v15.4s, v7.8h, v0.h[6] 674 105: umlal2 v14.4s, v29.8h, v0.h[5] 675 umlal v15.4s, v30.4h, v0.h[5] 676 umlal2 v14.4s, v6.8h, v0.h[5] 677 umlal v15.4s, v7.4h, v0.h[5] 678 104: umlal v14.4s, v30.4h, v0.h[4] 679 umlal2 v15.4s, v30.8h, v0.h[4] 680 umlal v14.4s, v6.4h, v0.h[4] 681 umlal2 v15.4s, v6.8h, v0.h[4] 682 103: umlal2 v14.4s, v30.8h, v0.h[3] 683 umlal v15.4s, v31.4h, v0.h[3] 684 umlal2 v14.4s, v5.8h, v0.h[3] 685 umlal v15.4s, v6.4h, v0.h[3] 686 102: umlal v14.4s, v31.4h, v0.h[2] 687 umlal2 v15.4s, v31.8h, v0.h[2] 688 umlal v14.4s, v5.4h, v0.h[2] 689 umlal2 v15.4s, v5.8h, v0.h[2] 690 101: umlal2 v14.4s, v31.8h, v0.h[1] 691 umlal v15.4s, v4.4h, v0.h[1] 692 umlal2 v14.4s, v4.8h, v0.h[1] 693 umlal v15.4s, v5.4h, v0.h[1] 694 695 uqrshrn v14.4h, v14.4s, #16 696 uqrshrn2 v14.8h, v15.4s, #16 697 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 698 699 mov v26.16b, v27.16b 700 mov v27.16b, v28.16b 701 mov v28.16b, v29.16b 702 mov v29.16b, v30.16b 703 mov v30.16b, v31.16b 704 mov v31.16b, v4.16b 705 mov v4.16b, v5.16b 706 mov v5.16b, v6.16b 707 mov v6.16b, v7.16b 708 mov v7.16b, v8.16b 709 mov v8.16b, v9.16b 710 mov v9.16b, v10.16b 711 mov v10.16b, v11.16b 712.endm/*}}}*/ 713 714.macro hconv4_20/*{{{*/ 715 umull v14.4s, v28.4h, v0.h[0] 716 umull2 v15.4s, v28.8h, v0.h[0] 717 718 adr x16, 100f 719 ldrsh x12, [x16, x5, LSL #1] 720 add x12, x12, x16 721 br x12 722 100: .hword -4 723 .hword 101f-100b 724 .hword 102f-100b 725 .hword 103f-100b 726 .hword 104f-100b 727 .hword 105f-100b 728 .hword 106f-100b 729 .hword 107f-100b 730 .hword 108f-100b 731 .hword 109f-100b 732 .hword 110f-100b 733 .hword 111f-100b 734 .hword 112f-100b 735 .hword 113f-100b 736 .hword 114f-100b 737 .hword 115f-100b 738 .hword 116f-100b 739 .hword 117f-100b 740 .hword 118f-100b 741 .hword 119f-100b 742 .hword 
120f-100b 743 .align 4 744 745 120: umlal v14.4s, v18.4h, v2.h[4] 746 umlal2 v15.4s, v18.8h, v2.h[4] 747 umlal v14.4s, v10.4h, v2.h[4] 748 umlal2 v15.4s, v10.8h, v2.h[4] 749 119: umlal2 v14.4s, v18.8h, v2.h[3] 750 umlal v15.4s, v19.4h, v2.h[3] 751 umlal2 v14.4s, v9.8h, v2.h[3] 752 umlal v15.4s, v10.4h, v2.h[3] 753 118: umlal v14.4s, v19.4h, v2.h[2] 754 umlal2 v15.4s, v19.8h, v2.h[2] 755 umlal v14.4s, v9.4h, v2.h[2] 756 umlal2 v15.4s, v9.8h, v2.h[2] 757 117: umlal2 v14.4s, v19.8h, v2.h[1] 758 umlal v15.4s, v20.4h, v2.h[1] 759 umlal2 v14.4s, v8.8h, v2.h[1] 760 umlal v15.4s, v9.4h, v2.h[1] 761 116: umlal v14.4s, v20.4h, v2.h[0] 762 umlal2 v15.4s, v20.8h, v2.h[0] 763 umlal v14.4s, v8.4h, v2.h[0] 764 umlal2 v15.4s, v8.8h, v2.h[0] 765 115: umlal2 v14.4s, v20.8h, v1.h[7] 766 umlal v15.4s, v21.4h, v1.h[7] 767 umlal2 v14.4s, v7.8h, v1.h[7] 768 umlal v15.4s, v8.4h, v1.h[7] 769 114: umlal v14.4s, v21.4h, v1.h[6] 770 umlal2 v15.4s, v21.8h, v1.h[6] 771 umlal v14.4s, v7.4h, v1.h[6] 772 umlal2 v15.4s, v7.8h, v1.h[6] 773 113: umlal2 v14.4s, v21.8h, v1.h[5] 774 umlal v15.4s, v22.4h, v1.h[5] 775 umlal2 v14.4s, v6.8h, v1.h[5] 776 umlal v15.4s, v7.4h, v1.h[5] 777 112: umlal v14.4s, v22.4h, v1.h[4] 778 umlal2 v15.4s, v22.8h, v1.h[4] 779 umlal v14.4s, v6.4h, v1.h[4] 780 umlal2 v15.4s, v6.8h, v1.h[4] 781 111: umlal2 v14.4s, v22.8h, v1.h[3] 782 umlal v15.4s, v23.4h, v1.h[3] 783 umlal2 v14.4s, v5.8h, v1.h[3] 784 umlal v15.4s, v6.4h, v1.h[3] 785 110: umlal v14.4s, v23.4h, v1.h[2] 786 umlal2 v15.4s, v23.8h, v1.h[2] 787 umlal v14.4s, v5.4h, v1.h[2] 788 umlal2 v15.4s, v5.8h, v1.h[2] 789 109: umlal2 v14.4s, v23.8h, v1.h[1] 790 umlal v15.4s, v24.4h, v1.h[1] 791 umlal2 v14.4s, v4.8h, v1.h[1] 792 umlal v15.4s, v5.4h, v1.h[1] 793 108: umlal v14.4s, v24.4h, v1.h[0] 794 umlal2 v15.4s, v24.8h, v1.h[0] 795 umlal v14.4s, v4.4h, v1.h[0] 796 umlal2 v15.4s, v4.8h, v1.h[0] 797 107: umlal2 v14.4s, v24.8h, v0.h[7] 798 umlal v15.4s, v25.4h, v0.h[7] 799 umlal2 v14.4s, v31.8h, v0.h[7] 800 umlal v15.4s, v4.4h, 
v0.h[7] 801 106: umlal v14.4s, v25.4h, v0.h[6] 802 umlal2 v15.4s, v25.8h, v0.h[6] 803 umlal v14.4s, v31.4h, v0.h[6] 804 umlal2 v15.4s, v31.8h, v0.h[6] 805 105: umlal2 v14.4s, v25.8h, v0.h[5] 806 umlal v15.4s, v26.4h, v0.h[5] 807 umlal2 v14.4s, v30.8h, v0.h[5] 808 umlal v15.4s, v31.4h, v0.h[5] 809 104: umlal v14.4s, v26.4h, v0.h[4] 810 umlal2 v15.4s, v26.8h, v0.h[4] 811 umlal v14.4s, v30.4h, v0.h[4] 812 umlal2 v15.4s, v30.8h, v0.h[4] 813 103: umlal2 v14.4s, v26.8h, v0.h[3] 814 umlal v15.4s, v27.4h, v0.h[3] 815 umlal2 v14.4s, v29.8h, v0.h[3] 816 umlal v15.4s, v30.4h, v0.h[3] 817 102: umlal v14.4s, v27.4h, v0.h[2] 818 umlal2 v15.4s, v27.8h, v0.h[2] 819 umlal v14.4s, v29.4h, v0.h[2] 820 umlal2 v15.4s, v29.8h, v0.h[2] 821 101: umlal2 v14.4s, v27.8h, v0.h[1] 822 umlal v15.4s, v28.4h, v0.h[1] 823 umlal2 v14.4s, v28.8h, v0.h[1] 824 umlal v15.4s, v29.4h, v0.h[1] 825 826 uqrshrn v14.4h, v14.4s, #16 827 uqrshrn2 v14.8h, v15.4s, #16 828 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 829 830 mov v18.16b, v19.16b 831 mov v19.16b, v20.16b 832 mov v20.16b, v21.16b 833 mov v21.16b, v22.16b 834 mov v22.16b, v23.16b 835 mov v23.16b, v24.16b 836 mov v24.16b, v25.16b 837 mov v25.16b, v26.16b 838 mov v26.16b, v27.16b 839 mov v27.16b, v28.16b 840 mov v28.16b, v29.16b 841 mov v29.16b, v30.16b 842 mov v30.16b, v31.16b 843 mov v31.16b, v4.16b 844 mov v4.16b, v5.16b 845 mov v5.16b, v6.16b 846 mov v6.16b, v7.16b 847 mov v7.16b, v8.16b 848 mov v8.16b, v9.16b 849 mov v9.16b, v10.16b 850 mov v10.16b, v11.16b 851.endm/*}}}*/ 852 853.macro hconv4_25/*{{{*/ 854 umull2 v14.4s, v25.8h, v0.h[0] 855 umull v15.4s, v26.4h, v0.h[0] 856 857 adr x16, 100f 858 ldrsh x12, [x16, x5, LSL #1] 859 add x12, x12, x16 860 br x12 861 100: .hword -4 862 .hword 101f-100b 863 .hword 102f-100b 864 .hword 103f-100b 865 .hword 104f-100b 866 .hword 105f-100b 867 .hword 106f-100b 868 .hword 107f-100b 869 .hword 108f-100b 870 .hword 109f-100b 871 .hword 110f-100b 872 .hword 111f-100b 873 .hword 112f-100b 874 .hword 113f-100b 875 
.hword 114f-100b 876 .hword 115f-100b 877 .hword 116f-100b 878 .hword 117f-100b 879 .hword 118f-100b 880 .hword 119f-100b 881 .hword 120f-100b 882 .hword 121f-100b 883 .hword 122f-100b 884 .hword 123f-100b 885 .hword 124f-100b 886 .hword 125f-100b 887 .align 4 888 889 125: ld1 {v12.8h}, [x9] 890 umlal v14.4s, v12.4h, v3.h[1] 891 umlal2 v15.4s, v12.8h, v3.h[1] 892 umlal v14.4s, v10.4h, v3.h[1] 893 umlal2 v15.4s, v10.8h, v3.h[1] 894 124: add x12, x9, #0x08 895 bic x12, x12, #0x40 896 ld1 {v12.4h}, [x12], #8 897 bic x12, x12, #0x40 898 ld1 {v13.4h}, [x12] 899 umlal v14.4s, v12.4h, v3.h[0] 900 umlal v15.4s, v13.4h, v3.h[0] 901 umlal2 v14.4s, v9.8h, v3.h[0] 902 umlal v15.4s, v10.4h, v3.h[0] 903 123: add x12, x9, #0x10 904 bic x12, x12, #0x40 905 ld1 {v12.8h}, [x12] 906 umlal v14.4s, v12.4h, v2.h[7] 907 umlal2 v15.4s, v12.8h, v2.h[7] 908 umlal v14.4s, v9.4h, v2.h[7] 909 umlal2 v15.4s, v9.8h, v2.h[7] 910 122: add x12, x9, #0x18 911 bic x12, x12, #0x40 912 ld1 {v12.4h}, [x12], #8 913 bic x12, x12, #0x40 914 ld1 {v13.4h}, [x12] 915 umlal v14.4s, v12.4h, v2.h[6] 916 umlal v15.4s, v13.4h, v2.h[6] 917 umlal2 v14.4s, v8.8h, v2.h[6] 918 umlal v15.4s, v9.4h, v2.h[6] 919 121: add x12, x9, #0x20 920 bic x12, x12, #0x40 921 ld1 {v12.8h}, [x12] 922 umlal v14.4s, v12.4h, v2.h[5] 923 umlal2 v15.4s, v12.8h, v2.h[5] 924 umlal v14.4s, v8.4h, v2.h[5] 925 umlal2 v15.4s, v8.8h, v2.h[5] 926 120: add x12, x9, #0x28 927 bic x12, x12, #0x40 928 ld1 {v12.4h}, [x12], #8 929 bic x12, x12, #0x40 930 ld1 {v13.4h}, [x12] 931 umlal v14.4s, v12.4h, v2.h[4] 932 umlal v15.4s, v13.4h, v2.h[4] 933 umlal2 v14.4s, v7.8h, v2.h[4] 934 umlal v15.4s, v8.4h, v2.h[4] 935 119: add x12, x9, #0x30 936 bic x12, x12, #0x40 937 ld1 {v12.8h}, [x12] 938 umlal v14.4s, v12.4h, v2.h[3] 939 umlal2 v15.4s, v12.8h, v2.h[3] 940 umlal v14.4s, v7.4h, v2.h[3] 941 umlal2 v15.4s, v7.8h, v2.h[3] 942 118: add x12, x9, #0x38 943 bic x12, x12, #0x40 944 ld1 {v12.4h}, [x12] 945 umlal v14.4s, v12.4h, v2.h[2] 946 umlal v15.4s, v17.4h, 
v2.h[2] 947 umlal2 v14.4s, v6.8h, v2.h[2] 948 umlal v15.4s, v7.4h, v2.h[2] 949 117: umlal v14.4s, v17.4h, v2.h[1] 950 umlal2 v15.4s, v17.8h, v2.h[1] 951 umlal v14.4s, v6.4h, v2.h[1] 952 umlal2 v15.4s, v6.8h, v2.h[1] 953 116: umlal2 v14.4s, v17.8h, v2.h[0] 954 umlal v15.4s, v18.4h, v2.h[0] 955 umlal2 v14.4s, v5.8h, v2.h[0] 956 umlal v15.4s, v6.4h, v2.h[0] 957 115: umlal v14.4s, v18.4h, v1.h[7] 958 umlal2 v15.4s, v18.8h, v1.h[7] 959 umlal v14.4s, v5.4h, v1.h[7] 960 umlal2 v15.4s, v5.8h, v1.h[7] 961 114: umlal2 v14.4s, v18.8h, v1.h[6] 962 umlal v15.4s, v19.4h, v1.h[6] 963 umlal2 v14.4s, v4.8h, v1.h[6] 964 umlal v15.4s, v5.4h, v1.h[6] 965 113: umlal v14.4s, v19.4h, v1.h[5] 966 umlal2 v15.4s, v19.8h, v1.h[5] 967 umlal v14.4s, v4.4h, v1.h[5] 968 umlal2 v15.4s, v4.8h, v1.h[5] 969 112: umlal2 v14.4s, v19.8h, v1.h[4] 970 umlal v15.4s, v20.4h, v1.h[4] 971 umlal2 v14.4s, v31.8h, v1.h[4] 972 umlal v15.4s, v4.4h, v1.h[4] 973 111: umlal v14.4s, v20.4h, v1.h[3] 974 umlal2 v15.4s, v20.8h, v1.h[3] 975 umlal v14.4s, v31.4h, v1.h[3] 976 umlal2 v15.4s, v31.8h, v1.h[3] 977 110: umlal2 v14.4s, v20.8h, v1.h[2] 978 umlal v15.4s, v21.4h, v1.h[2] 979 umlal2 v14.4s, v30.8h, v1.h[2] 980 umlal v15.4s, v31.4h, v1.h[2] 981 109: umlal v14.4s, v21.4h, v1.h[1] 982 umlal2 v15.4s, v21.8h, v1.h[1] 983 umlal v14.4s, v30.4h, v1.h[1] 984 umlal2 v15.4s, v30.8h, v1.h[1] 985 108: umlal2 v14.4s, v21.8h, v1.h[0] 986 umlal v15.4s, v22.4h, v1.h[0] 987 umlal2 v14.4s, v29.8h, v1.h[0] 988 umlal v15.4s, v30.4h, v1.h[0] 989 107: umlal v14.4s, v22.4h, v0.h[7] 990 umlal2 v15.4s, v22.8h, v0.h[7] 991 umlal v14.4s, v29.4h, v0.h[7] 992 umlal2 v15.4s, v29.8h, v0.h[7] 993 106: umlal2 v14.4s, v22.8h, v0.h[6] 994 umlal v15.4s, v23.4h, v0.h[6] 995 umlal2 v14.4s, v28.8h, v0.h[6] 996 umlal v15.4s, v29.4h, v0.h[6] 997 105: umlal v14.4s, v23.4h, v0.h[5] 998 umlal2 v15.4s, v23.8h, v0.h[5] 999 umlal v14.4s, v28.4h, v0.h[5] 1000 umlal2 v15.4s, v28.8h, v0.h[5] 1001 104: umlal2 v14.4s, v23.8h, v0.h[4] 1002 umlal v15.4s, v24.4h, v0.h[4] 
1003 umlal2 v14.4s, v27.8h, v0.h[4] 1004 umlal v15.4s, v28.4h, v0.h[4] 1005 103: umlal v14.4s, v24.4h, v0.h[3] 1006 umlal2 v15.4s, v24.8h, v0.h[3] 1007 umlal v14.4s, v27.4h, v0.h[3] 1008 umlal2 v15.4s, v27.8h, v0.h[3] 1009 102: umlal2 v14.4s, v24.8h, v0.h[2] 1010 umlal v15.4s, v25.4h, v0.h[2] 1011 umlal2 v14.4s, v26.8h, v0.h[2] 1012 umlal v15.4s, v27.4h, v0.h[2] 1013 101: umlal v14.4s, v25.4h, v0.h[1] 1014 umlal2 v15.4s, v25.8h, v0.h[1] 1015 umlal v14.4s, v26.4h, v0.h[1] 1016 umlal2 v15.4s, v26.8h, v0.h[1] 1017 1018 uqrshrn v14.4h, v14.4s, #16 1019 uqrshrn2 v14.8h, v15.4s, #16 1020 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 1021 1022 st1 {v17.16b}, [x9], #16 1023 bic x9, x9, #0x40 1024 mov v17.16b, v18.16b 1025 mov v18.16b, v19.16b 1026 mov v19.16b, v20.16b 1027 mov v20.16b, v21.16b 1028 mov v21.16b, v22.16b 1029 mov v22.16b, v23.16b 1030 mov v23.16b, v24.16b 1031 mov v24.16b, v25.16b 1032 mov v25.16b, v26.16b 1033 mov v26.16b, v27.16b 1034 mov v27.16b, v28.16b 1035 mov v28.16b, v29.16b 1036 mov v29.16b, v30.16b 1037 mov v30.16b, v31.16b 1038 mov v31.16b, v4.16b 1039 mov v4.16b, v5.16b 1040 mov v5.16b, v6.16b 1041 mov v6.16b, v7.16b 1042 mov v7.16b, v8.16b 1043 mov v8.16b, v9.16b 1044 mov v9.16b, v10.16b 1045 mov v10.16b, v11.16b 1046.endm/*}}}*/ 1047 1048/* Dedicated function wrapper for the fetch macro, for the cases where 1049 * performance isn't that important, to keep code size down. 1050 */ 1051PRIVATE(fetch_generic_asm) 1052 stp x10, x11, [sp, #-16]! 1053 fetch 1054 ldp x10, x11, [sp], #16 1055 ret 1056END(fetch_generic_asm) 1057 1058/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value 1059 * across to fill the rest of the register pair. Used for filling the right 1060 * hand edge of the window when starting too close to the right hand edge of 1061 * the image. 
*/
PRIVATE(prefetch_clamp1)
            /* Negate x11 so the tbz tests below decompose the shortfall
             * (16 - (x11 & 15)) bit by bit.  x15/x19 are rebased relative to
             * x1 so the x1 adjustments below carry over to them on exit.
             */
            sub         x11, xzr, x11
            sub         x15, x15, x1
            sub         x19, x19, x1
            tbz         x11, #3, 1f
            mov         v11.16b, v10.16b
            sub         x1, x1, #16
1:          mov         v12.16b, v11.16b
            /* v13 accumulates a byte mask selecting which lanes keep their
             * original data (bif below); each taken branch shifts more mask
             * bits in and rotates the edge value into place with ext.
             */
            movi        v13.8b, #0xff
            tbz         x11, #2, 1f
            ext         v12.16b, v12.16b, v12.16b, #4*2
            sub         x1, x1, #8
            shl         v13.2d, v13.2d, #32
1:          tbz         x11, #1, 1f
            ext         v12.16b, v12.16b, v12.16b, #6*2
            sub         x1, x1, #4
            shl         v13.2d, v13.2d, #16
1:          tbz         x11, #0, 1f
            ext         v12.16b, v12.16b, v12.16b, #7*2
            sub         x1, x1, #2
            shl         v13.2d, v13.2d, #8
1:          /* Broadcast the edge column and merge it into the lanes the mask
             * marks as padding.
             */
            dup         v12.8h, v12.h[6]
            sxtl        v13.8h, v13.8b
            bif         v11.16b, v12.16b, v13.16b
1:          tbz         x11, #3, 1f
            mov         v10.16b, v11.16b
            mov         v11.16b, v12.16b
1:          /* Undo the negation and rebase the row pointers onto the
             * (possibly rewound) src pointer.
             */
            sub         x11, xzr, x11
            add         x15, x15, x1
            add         x19, x19, x1
            ret
END(prefetch_clamp1)

/* As prefetch_clamp1, but for 4-byte (uchar4) pixels: clamping happens at
 * 64-bit (d-lane) granularity, so only bits #3 and #2 of the index matter.
 */
PRIVATE(prefetch_clamp4)
            sub         x11, xzr, x11
            sub         x15, x15, x1
            sub         x19, x19, x1
            tbz         x11, #3, 1f
            sub         x1, x1, #16         // what's this?
            mov         v11.16b, v10.16b
1:          dup         v12.2d, v11.d[1]
            tbz         x11, #2, 1f
            dup         v12.2d, v11.d[0]
            sub         x1, x1, #8
            dup         v11.2d, v11.d[0]
1:          tbz         x11, #3, 1f
            mov         v10.16b, v11.16b
            mov         v11.16b, v12.16b
1:          sub         x11, xzr, x11
            add         x15, x15, x1
            add         x19, x19, x1
            ret
END(prefetch_clamp4)


/* Helpers for prefetch, below.
*/

/* Route one 16-byte pair of prefetched columns to its destination:
 *   store == 2: both halves go to the wrap-around buffer at x9;
 *   store == 1: first half to the buffer (wrapping at 64 bytes via bic),
 *               second half into register \qb;
 *   store == 0: both halves into registers \qa/\qb.
 * \qsb_hi is accepted for interface symmetry but unused here.
 */
.macro prefetch_out qa, qb, store, qsa, qsb, qsb_hi
    .if \store == 2
        .ifc \qsa,\qsb
            st1         {\qsa}, [x9], #16
            st1         {\qsb}, [x9], #16
        .else
            st1         {\qsa,\qsb}, [x9], #32
        .endif
    .elseif \store == 1
            bic         x9, x9, #0x40
            st1         {\qsa}, [x9], #16
            mov         \qb, \qsb
    .elseif \store == 0
            mov         \qa, \qsa
            mov         \qb, \qsb
    .endif
.endm

/* One 16-byte step of the prefetch fill.  Expansions are chained: numeric
 * labels 1/2/3/4 act as a four-state machine where each state's handler
 * branches forward (1f/2f/3f/4f) into the SAME state of the next expansion:
 *   state 1: still left of the window start -- emit left padding (v9);
 *   state 2: inside the image -- emit fetched data and fetch the next 16;
 *   state 3: ran off the right edge -- clamp, then switch to state 4;
 *   state 4: emit right padding (v12, set up by prefetch_clamp\step).
 * `b 4f+4` skips the next expansion's own `b 4f+4` (4 bytes), landing
 * directly on its padding prefetch_out.  The final expansion (\rem == 0)
 * defines all four labels at a single nop so every chain terminates.
 * \c is accepted for interface symmetry but unused here.
 */
.macro prefetch_one qa, qb, rem, c, store=0, step=1
.set i, (need - 16) - \rem
.if i >= 0
1:          cmp         x10, #i+16
            blo         2f
            prefetch_out \qa, \qb, \store, v9.16b, v9.16b, v9.d[1]
            b           1f
2:          cmp         x11, #i+16
            bls         3f
            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
            bl          fetch_generic_asm
            b           2f
3:          bl          prefetch_clamp\step
            prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
4:          b           4f+4
            //v12 contains pad word from prefetch_clamp call
            prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
    .if \rem > 0
            b           4f+4
    .else
1:
2:
3:
4:          nop
    .endif
.endif
.endm

/* Fill the convolution window with context data.  The aim here is to load
 * exactly rlf + rrt columns, and in the main loop to read as many columns as
 * will be written.  This is complicated by the need to handle cases when the
 * input starts very close to the left or right (or both) edges of the image,
 * and where these do not fall on 16-byte boundaries.
*
 * Input:
 *      x1 -- src
 *      x2 -- pitch
 *      x3 -- count
 *      x4 -- inlen
 *      x5 -- r
 *      x6 -- rup
 *      x7 -- rdn
 *      x8 -- rlf
 *      x9 -- buffer (if needed)
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Output:
 *      x1 += rlf + min(count, rrt)
 * Modifies:
 *      x10 -- fill start index in the window
 *      x11 -- fill stop index in the window
 *      x12 -- scratch
 */
.macro prefetch step=1, max_r=25
/* Window size in bytes, rounded up to a whole number of 16-byte chunks. */
.set need, ((\max_r + \max_r) * \step + 15) & ~15
    .if \step == 1
            mov         x10, #need - (\max_r * \step)
            sub         x10, x10, x8
    .else
            /* rlf (x8) is in pixels; scale to bytes for 4-byte pixels. */
            mov         x10, #need - (\max_r * \step)
            sub         x10, x10, x8, LSL #2
    .endif
            /* x11 = min(x10 + inlen, need): fill stop index within window. */
            add         x11, x10, x4
            subs        x11, x11, #need
            csel        x11, xzr, x11, hi
            add         x11, x11, #need

            bl          fetch_generic_asm
            /* v9 = left-edge padding value, replicated across the register. */
    .if \step == 1
            dup         v9.8h, v10.h[0]
    .else
            dup         v9.2d, v10.d[0]
    .endif
            /* If the fill start isn't 16-byte aligned, rotate padding into
             * the first fetched chunk so subsequent steps stay aligned.
             */
            tst         x10, #15
            beq         2f
            sub         x12, xzr, x10
            tbz         x10, #3, 1f
            mov         v11.16b, v10.16b
            mov         v10.16b, v9.16b
1:          tbz         x12, #2, 1f
            ext         v11.16b, v10.16b, v11.16b, #4*2
            ext         v10.16b, v9.16b, v10.16b, #4*2
    .if \step == 1
1:          tbz         x12, #1, 1f
            ext         v11.16b, v10.16b, v11.16b, #2*2
            ext         v10.16b, v9.16b, v10.16b, #2*2
1:          tbz         x12, #0, 1f
            ext         v11.16b, v10.16b, v11.16b, #1*2
            ext         v10.16b, v9.16b, v10.16b, #1*2
    .endif
1:          /* Round x10 down to a chunk boundary and move src and the two
             * row pointers back by the sub-chunk remainder.
             */
            sub         x1, x1, x10
            sub         x15, x15, x10
            sub         x19, x19, x10
            bic         x10, x10, #15
            add         x1, x1, x10
            add         x15, x15, x10
            add         x19, x19, x10
2:
    .if \step > 1
            /* it's only in the uchar2 and uchar4 cases where the register file
             * is insufficient (given MAX_R <= 25).
*/
            /* First 48 bytes overflow into the stack-side buffer at x9;
             * the 160-offset chunk is split between buffer and v17.
             */
            prefetch_one xx, xx, 192, c=\max_r, step=\step, store=2
            prefetch_one xx, xx, 176, c=\max_r, step=\step, store=2
            prefetch_one xx, v17.16b, 160, c=\max_r, step=\step, store=1
            prefetch_one v18.16b, v19.16b, 144, c=\max_r, step=\step, store=0
            prefetch_one v20.16b, v21.16b, 128, c=\max_r, step=\step, store=0
            prefetch_one v22.16b, v23.16b, 112, c=\max_r, step=\step, store=0
            prefetch_one v24.16b, v25.16b,  96, c=\max_r, step=\step, store=0
            prefetch_one v26.16b, v27.16b,  80, c=\max_r, step=\step, store=0
            prefetch_one v28.16b, v29.16b,  64, c=\max_r, step=\step, store=0
    .endif
            prefetch_one v30.16b, v31.16b,  48, c=\max_r, step=\step, store=0
            prefetch_one v4.16b,  v5.16b,   32, c=\max_r, step=\step, store=0
            prefetch_one v6.16b,  v7.16b,   16, c=\max_r, step=\step, store=0
            prefetch_one v8.16b,  v9.16b,    0, c=\max_r, step=\step, store=0

            /* x4 (inlen) -= rlf + max_r, clamped at zero: the columns already
             * consumed by the prefill.
             */
    .if \step == 1
            add         x10, x8, #\max_r * \step
    .else
            lsl         x10, x8, #2
            add         x10, x10, #\max_r * \step
    .endif
            subs        x4, x4, x10
            csel        x4, xzr, x4, lo
.endm

/* The main loop.
 *
 * Input:
 *      x0 = dst
 *      x1 = src
 *      x2 = pitch
 *      x3 = count
 *      x4 = inlen
 *      x5 = r
 *      x6 = rup
 *      x7 = rdn
 *      x9 = buffer
 *      x13 = -pitch
 *      x15 = top-row in
 *      x19 = bottom-row in
 * Modifies
 *      x8 = fetch code pointer
 */
.macro mainloop core, step=1, max_r=25, labelc="", labelnc=""
            /* Compute the branch-target inside the non-clamping fetch loop for
             * this radius: labelnc - r*40 (each tap is 10 instructions there;
             * mirrors the ifcc address arithmetic in the fetch macro).
             */
            adrp        x8, \labelnc
            add         x8, x8, #:lo12:\labelnc
            sub         x8, x8, x5, LSL #5
            sub         x8, x8, x5, LSL #3
            cmp         x5, x6
            ccmp        x5, x7, #0, eq
            beq         5f

            /* if (r != rup || r != rdn) then the address-clamping table should
             * be used rather than the short-cut version.
*/
            /* Clamping variant: taps are 14 instructions, so the offset is
             * labelc - r*64 + r*8 = labelc - r*56.
             */
            adrp        x8, \labelc
            add         x8, x8, #:lo12:\labelc
            sub         x8, x8, x5, LSL #6
            add         x8, x8, x5, LSL #3
            b           5f
            .align 4
3:          fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8

            /* For each call to fetch two are made to \core.  It would be
             * preferable to have twice the work done in \core.
             */
            \core
            st1         {v15.8b}, [x0], #8
            \core
            st1         {v15.8b}, [x0], #8

            sub         x3, x3, #16
5:          subs        x4, x4, #16
            bhs         3b
            adds        x4, x4, #16
            bne         1f
            /* Input exhausted exactly on a chunk boundary: synthesise two
             * chunks of right-edge padding.
             */
    .if \step==1
            dup         v10.8h, v9.h[7]
            dup         v11.8h, v9.h[7]
    .else
            dup         v10.2d, v9.d[1]
            dup         v11.2d, v9.d[1]
    .endif
            b           4f

1:          /* Partial final chunk: rewind so the last fetch reads the final
             * 16 in-bounds bytes, then rotate padding in behind them.
             */
            sub         x1, x1, #16
            sub         x15, x15, #16
            sub         x19, x19, #16
            add         x1, x1, x4
            add         x15, x15, x4
            add         x19, x19, x4
            bl          fetch_generic_asm

    .if \step==1
            dup         v12.8h, v11.h[7]
    .else
            dup         v12.2d, v11.d[1]
    .endif
            sub         x4, xzr, x4
            tbz         x4, #3, 1f
            mov         v10.16b, v11.16b
            mov         v11.16b, v12.16b
1:          tbz         x4, #2, 1f
            ext         v10.16b, v10.16b, v11.16b, #4*2
            ext         v11.16b, v11.16b, v12.16b, #4*2
1:          tbz         x4, #1, 1f
            ext         v10.16b, v10.16b, v11.16b, #2*2
            ext         v11.16b, v11.16b, v12.16b, #2*2
1:          tbz         x4, #0, 4f
            ext         v10.16b, v10.16b, v11.16b, #1*2
            ext         v11.16b, v11.16b, v12.16b, #1*2
4:          cbz         x3, 5f
            /* Drain loop: keep convolving padding until count runs out. */
3:          \core
    .if \step==1
            dup         v11.8h, v11.h[7]
    .else
            dup         v11.2d, v11.d[1]
    .endif
            subs        x3, x3, #8
            blo         4f
            st1         {v15.8b}, [x0], #8
            beq         5f
            b           3b
            /* Tail store: 4, 2, then 1 byte according to the low bits of the
             * remaining count.
             */
4:          tbz         x3, #2, 1f
            st1         {v15.s}[0], [x0], #4
            ext         v15.16b, v15.16b, v15.16b, #4*2
1:          tbz         x3, #1, 1f
            st1         {v15.h}[0], [x0], #2
            ext         v15.16b, v15.16b, v15.16b, #2*2
1:          tbz         x3, #0, 5f
            st1         {v15.b}[0], [x0], #1
            ext         v15.16b, v15.16b, v15.16b, #1*2
5:          nop
.endm

/* One entry point per tuned radius for the 1-byte-per-pixel case. */
.irep r, TUNED_LIST1, 25
PRIVATE(convolve1_\r)
            stp         x29,x30, [sp, #-16]!

            prefetch    step=1, max_r=\r

            mainloop    core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r

            ldp         x29,x30, [sp], #16
            ret
END(convolve1_\r)
.endr

/* One entry point per tuned radius for the 4-byte-per-pixel case.  These
 * need a 64-byte spill buffer for window columns that don't fit in the
 * register file.
 */
.irep r, TUNED_LIST4, 25
PRIVATE(convolve4_\r)
            sub         x12, sp, #0x040
            bic         x9, x12, #0x07f
            mov         sp, x9
            stp         x12,x30, [sp, #-16]!

            /* x9 now points to a buffer on the stack whose address has the low
             * 7 bits clear.  This allows easy address calculation in the
             * wrap-around cases.
             */


            prefetch    step=4, max_r=\r

            mainloop    core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r

            /* Original sp was saved in x12's stack slot; restore it. */
            ldp         x12,x30, [sp]
            add         sp, x12, #0x40
            ret
END(convolve4_\r)
.endr

/* void rsdIntrinsicBlurU1_K(
 *                  void *out,      // x0
 *                  void *in,       // x1
 *                  size_t w,       // x2
 *                  size_t h,       // x3
 *                  size_t p,       // x4
 *                  size_t x,       // x5
 *                  size_t y,       // x6
 *                  size_t count,   // x7
 *                  size_t r,       // [sp]
 *                  uint16_t *tab); // [sp,#8]
 */
ENTRY(rsdIntrinsicBlurU1_K)
            stp         x19,x30, [sp, #-16]!
1426 sub x8, sp, #32 1427 sub sp, sp, #64 1428 st1 {v8.1d - v11.1d}, [sp] 1429 st1 {v12.1d - v15.1d}, [x8] 1430 mov x8, x5 // x 1431 ldr w5, [sp,#80] // r 1432 sub x9, x2, x8 1433 sub x10, x3, x6 1434 mov x2, x4 // pitch 1435 mov x3, x7 // count 1436 sub x7, x10, #1 1437 sub x9, x9, x3 1438 1439 ldr x12, [sp, #88] // tab 1440 1441 add x0, x0, x8 1442 add x1, x1, x8 1443 1444 cmp x6, x5 1445 csel x6, x5, x6, hs 1446 cmp x7, x5 1447 csel x7, x5, x7, hs 1448 cmp x8, x5 1449 csel x8, x5, x8, hs 1450 cmp x9, x5 1451 csel x9, x5, x8, hs 1452 1453 add x4, x8, x9 1454 add x4, x4, x3 1455 1456 sub x1, x1, x8 1457 1458 sub x13, xzr, x2 1459 msub x15, x2, x6, x1 1460 madd x19, x2, x7, x1 1461 1462 ld1 {v0.8h,v1.8h}, [x12], #32 1463 ld1 {v2.8h,v3.8h}, [x12], #32 1464 1465 adr x30, 1f 1466 .irep r, TUNED_LIST1 1467 cmp x5, #\r 1468 bls convolve1_\r 1469 .endr 1470 b convolve1_25 1471 14721: ld1 {v8.1d - v11.1d}, [sp], #32 1473 ld1 {v12.1d - v15.1d}, [sp], #32 1474 ldp x19,x30, [sp], #16 1475 ret 1476END(rsdIntrinsicBlurU1_K) 1477 1478/* void rsdIntrinsicBlurU4_K( 1479 * void *out, // x0 1480 * void *in, // x1 1481 * size_t w, // x2 1482 * size_t h, // x3 1483 * size_t p, // x4 1484 * size_t x, // x5 1485 * size_t y, // x6 1486 * size_t count, // x7 1487 * size_t r, // [sp] 1488 * uint16_t *tab); // [sp,#8] 1489 */ 1490ENTRY(rsdIntrinsicBlurU4_K) 1491 stp x19,x30, [sp, #-16]! 
            /* Save callee-saved NEON registers v8-v15 (low 64 bits, per
             * AAPCS64) in two 32-byte groups below sp.
             */
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]
            mov         x8, x5          // x
            ldr         w5, [sp,#80]    // r
            sub         x9, x2, x8      // x9 = w - x
            sub         x10, x3, x6     // x10 = h - y
            mov         x2, x4          // pitch
            mov         x3, x7          // count
            sub         x7, x10, #1     // rdn limit = h - y - 1
            sub         x9, x9, x3      // x9 = w - x - count (right margin)

            ldr         x12, [sp, #88]  // tab

            /* 4 bytes per pixel: scale x offsets by 4. */
            add         x0, x0, x8, LSL #2
            add         x1, x1, x8, LSL #2

            /* Clamp the four edge distances (rup, rdn, rlf, rrt) to r. */
            cmp         x6, x5
            csel        x6, x5, x6, hs
            cmp         x7, x5
            csel        x7, x5, x7, hs
            cmp         x8, x5
            csel        x8, x5, x8, hs
            cmp         x9, x5
            csel        x9, x5, x9, hs

            /* inlen (bytes) = (count + rlf + rrt) * 4 */
            lsl         x3, x3, #2
            add         x4, x8, x9
            add         x4, x3, x4, LSL #2

            sub         x1, x1, x8, LSL #2  // rewind src to window start

            sub         x13, xzr, x2    // x13 = -pitch
            msub        x15, x2, x6, x1 // top-row pointer = src - rup*pitch
            madd        x19, x2, x7, x1 // bottom-row pointer = src + rdn*pitch

            ld1         {v0.8h,v1.8h}, [x12], #32   // coefficient table
            ld1         {v2.8h,v3.8h}, [x12], #32

            /* Tail-call the tuned convolver; it returns to 1f via x30. */
            adr         x30, 1f
  .irep r, TUNED_LIST4
            cmp         x5, #\r
            bls         convolve4_\r
  .endr
            b           convolve4_25

1:          ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ldp         x19,x30, [sp], #16
            ret
END(rsdIntrinsicBlurU4_K)