1/* 2 * Copyright (C) 2014 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: 18#define PRIVATE(f) .text; .align 4; .type f,#function; f: 19#define END(f) .size f, .-f; 20 21//#define ARCH_ARM64_USE_BLUR_PRELOAD 22 23/* Number of fractional bits to preserve in intermediate results. The 24 * intermediate storage is 16-bit, and we started with 8 bit data (the integer 25 * part), so this should be between 0 and 8. 26 */ 27.set FRACTION_BITS, 7 28.set MAX_R, 25 29 30 31/* A quick way of making a line of code conditional on some other condition. 32 * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with 33 * `ifcc`: 34 */ 35.macro ifcc zzz:vararg 36.if cc 37 \zzz 38.endif 39.endm 40 41/* It's not always clear that prefetching is beneficial and this needs further 42 * testing on different cores, so it's made switchable here. 43 */ 44#if defined(ARCH_ARM64_USE_BLUR_PRELOAD) 45#define VERTPLD(...) prfm PLDL1KEEP, [__VA_ARGS__] 46#else 47#define VERTPLD(...) nop 48#endif 49 50/* Fetch 16 columns of bytes (regardless of image format), convolve these 51 * vertically, and leave them in the register file. If working near the top or 52 * bottom of an image then clamp the addressing while loading the data in. 53 * 54 * The convolution is fully unrolled for windows up to max_r, with the 55 * outermost edges calculated first. This way it's possible to branch directly 56 * into the relevant part of the code for an arbitrary convolution radius. Two 57 * variants of the loop are produced; one eliminates the clamping code for a 58 * slight speed advantage. 59 * 60 * Where the macro is called with reg=x, the specified register is taken to 61 * contain a pre-calculated pointer into one of the two loops. 62 * 63 * Input: 64 * x1 -- src 65 * x2 -- pitch 66 * x5 -- r 67 * x6 -- rup (r, unless clipped to top of source image) 68 * x7 -- rdn (r, unless clipped to bottom of source image) 69 * x12 -- switch index 70 * v0-v3 -- coefficient table 71 * x13 = -pitch 72 * x15 = top-row in 73 * x19 = bottom-row in 74 * Output: 75 * x1 += 16 76 * v10,v11 -- 16 convolved columns 77 * Modifies: 78 * x10 = upper row pointer 79 * x11 = lower row pointer 80 * v12-v15 = temporary sums 81 */ 82.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/ 83 .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif 84 85 ld1 {v15.16b}, [x1], #16 86 mov x10, x15 87 88 uxtl v14.8h, v15.8b 89 VERTPLD(x1, #16) 90 uxtl2 v15.8h, v15.16b 91 .if \max_r < 16 // approximate 92 ifcc adr \reg, 1f 93 .else 94 ifcc adrp \reg, 1f 95 ifcc add \reg, \reg, #:lo12:1f 96 .endif 97 98 umull v12.4s, v14.4h, v0.h[0] 99 ifcc sub \reg, \reg, x5, LSL #6 100 umull2 v13.4s, v14.8h, v0.h[0] 101 mov x11, x19 102 umull v14.4s, v15.4h, v0.h[0] 103 ifcc add \reg, \reg, x5, LSL #3 104 umull2 v15.4s, v15.8h, v0.h[0] 105 br \reg 106 107 /* This version of the vertical fetch loop body is used away from the edges 108 * of the source image. The pointers start at the top and bottom source rows 109 * and work their way towards the centre on each iteration. This way the 110 * number of taps used can be controlled by jumping directly into the middle 111 * of the loop and running to completion. 112 * If the loop body changes size then the code which caculates the address of 113 * the initial iteration must be updated to accordingly. 114 */ 115 .macro vertfetch_noclamp i, dreg 116 .if 0 < \i && \i <= \max_r 117 ld1 {v10.16b}, [x10], x2 118 ld1 {v11.16b}, [x11], x13 119 uaddl v16.8h, v10.8b, v11.8b 120 uaddl2 v11.8h, v10.16b, v11.16b 121 umlal v12.4s, v16.4h, \dreg 122 umlal2 v13.4s, v16.8h, \dreg 123 VERTPLD(x10, #32) 124 umlal v14.4s, v11.4h, \dreg 125 VERTPLD(x11, #32) 126 umlal2 v15.4s, v11.8h, \dreg 127 .endif 128 .endm 129 130 /* This version of the vertical fetch loop body is used near the edges of the 131 * source image, where one or both of the accesses may start with a clamped 132 * value, and the row addresses only begin to change after some number of 133 * iterations before the end. 134 * If the loop body changes size then the code which caculates the address of 135 * the initial iteration must be updated to accordingly. 136 */ 137 .macro vertfetch_clamped i, dreg 138 .if 0 < \i && \i <= \max_r 139 ld1 {v10.16b}, [x10], x2 140 cmp x6, #\i 141 ld1 {v11.16b}, [x11], x13 142 csel x10, x15, x10, lo 143 uaddl v16.8h, v10.8b, v11.8b 144 cmp x7, #\i 145 uaddl2 v11.8h, v10.16b, v11.16b 146 csel x11, x19, x11, lo 147 umlal v12.4s, v16.4h, \dreg 148 umlal2 v13.4s, v16.8h, \dreg 149 VERTPLD(x10, #32) 150 umlal v14.4s, v11.4h, \dreg 151 VERTPLD(x11, #32) 152 umlal2 v15.4s, v11.8h, \dreg 153 .endif 154 .endm 155 156 /* Entry into this unrolled loop is computed as a negative index from 157 * \labelc at the end of the block. 158 */ 159 .align 4 160 vertfetch_clamped 27, v3.h[3] 161 vertfetch_clamped 26, v3.h[2] 162 vertfetch_clamped 25, v3.h[1] 163 vertfetch_clamped 24, v3.h[0] 164 vertfetch_clamped 23, v2.h[7] 165 vertfetch_clamped 22, v2.h[6] 166 vertfetch_clamped 21, v2.h[5] 167 vertfetch_clamped 20, v2.h[4] 168 vertfetch_clamped 19, v2.h[3] 169 vertfetch_clamped 18, v2.h[2] 170 vertfetch_clamped 17, v2.h[1] 171 vertfetch_clamped 16, v2.h[0] 172 vertfetch_clamped 15, v1.h[7] 173 vertfetch_clamped 14, v1.h[6] 174 vertfetch_clamped 13, v1.h[5] 175 vertfetch_clamped 12, v1.h[4] 176 vertfetch_clamped 11, v1.h[3] 177 vertfetch_clamped 10, v1.h[2] 178 vertfetch_clamped 9, v1.h[1] 179 vertfetch_clamped 8, v1.h[0] 180 vertfetch_clamped 7, v0.h[7] 181 vertfetch_clamped 6, v0.h[6] 182 vertfetch_clamped 5, v0.h[5] 183 vertfetch_clamped 4, v0.h[4] 184 vertfetch_clamped 3, v0.h[3] 185 vertfetch_clamped 2, v0.h[2] 186 vertfetch_clamped 1, v0.h[1] 187 vertfetch_clamped 0, v0.h[0] 188 1: 189 \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */ 190 191 /* Entry into this unrolled loop is computed as a negative index from 192 * \labelnc at the end of the block. 193 */ 194 .align 4 195 vertfetch_noclamp 27, v3.h[3] 196 vertfetch_noclamp 26, v3.h[2] 197 vertfetch_noclamp 25, v3.h[1] 198 vertfetch_noclamp 24, v3.h[0] 199 vertfetch_noclamp 23, v2.h[7] 200 vertfetch_noclamp 22, v2.h[6] 201 vertfetch_noclamp 21, v2.h[5] 202 vertfetch_noclamp 20, v2.h[4] 203 vertfetch_noclamp 19, v2.h[3] 204 vertfetch_noclamp 18, v2.h[2] 205 vertfetch_noclamp 17, v2.h[1] 206 vertfetch_noclamp 16, v2.h[0] 207 vertfetch_noclamp 15, v1.h[7] 208 vertfetch_noclamp 14, v1.h[6] 209 vertfetch_noclamp 13, v1.h[5] 210 vertfetch_noclamp 12, v1.h[4] 211 vertfetch_noclamp 11, v1.h[3] 212 vertfetch_noclamp 10, v1.h[2] 213 vertfetch_noclamp 9, v1.h[1] 214 vertfetch_noclamp 8, v1.h[0] 215 vertfetch_noclamp 7, v0.h[7] 216 vertfetch_noclamp 6, v0.h[6] 217 vertfetch_noclamp 5, v0.h[5] 218 vertfetch_noclamp 4, v0.h[4] 219 vertfetch_noclamp 3, v0.h[3] 220 vertfetch_noclamp 2, v0.h[2] 221 vertfetch_noclamp 1, v0.h[1] 222 vertfetch_noclamp 0, v0.h[0] 223 \labelnc : 224 225 .purgem vertfetch_clamped 226 .purgem vertfetch_noclamp 227 228 2: uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS 229 add x15, x15, #16 230 uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS 231 add x19, x19, #16 232 uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS 233 uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS 234.endm /*}}}*/ 235 236/* Some portion of the convolution window (as much as will fit, and all of it 237 * for the uchar1 cases) is kept in the register file to avoid unnecessary 238 * memory accesses. This forces the horizontal loops to be unrolled because 239 * there's no indexed addressing into the register file. 240 * 241 * As in the fetch macro, the operations are ordered from outside to inside, so 242 * that jumping into the middle of the block bypasses the unwanted window taps. 243 * 244 * There are several variants of the macro because of the fixed offets of the 245 * taps -- the wider the maximum radius the further the centre tap is from the 246 * most recently fetched data. This means that pre-filling the window requires 247 * more data that won't be used and it means that rotating the window involves 248 * more mov operations. 249 * 250 * When the buffer gets too big the buffer at [x9] is used. 251 * 252 * Input: 253 * v16-v31,v4-v11 -- convoltion window 254 * x9 -- pointer to additional convolution window data 255 * Output: 256 * x9 -- updated buffer pointer (if used) 257 * d31 -- result to be stored 258 * Modifies: 259 * x12 -- temp buffer pointer 260 * v12-v13 -- temporaries for load and vext operations. 261 * v14-v15 -- intermediate sums 262 */ 263#define TUNED_LIST1 8, 16 264.macro hconv1_8/*{{{*/ 265 umull v14.4s, v9.4h, v0.h[0] 266 umull2 v15.4s, v9.8h, v0.h[0] 267 268 adr x16, 100f 269 ldrsh x12, [x16, x5, LSL #1] 270 add x12, x12, x16 271 br x12 272 100: .hword -4 273 .hword 101f-100b 274 .hword 102f-100b 275 .hword 103f-100b 276 .hword 104f-100b 277 .hword 105f-100b 278 .hword 106f-100b 279 .hword 107f-100b 280 .hword 108f-100b 281 .align 4 282 108: umlal v14.4s, v8.4h, v1.h[0] 283 umlal2 v15.4s, v8.8h, v1.h[0] 284 umlal v14.4s, v10.4h, v1.h[0] 285 umlal2 v15.4s, v10.8h, v1.h[0] 286 107: ext v12.16b, v8.16b, v9.16b, #1*2 287 ext v13.16b, v9.16b, v10.16b, #7*2 288 umlal v14.4s, v12.4h, v0.h[7] 289 umlal2 v15.4s, v12.8h, v0.h[7] 290 umlal v14.4s, v13.4h, v0.h[7] 291 umlal2 v15.4s, v13.8h, v0.h[7] 292 106: ext v12.16b, v8.16b, v9.16b, #2*2 293 ext v13.16b, v9.16b, v10.16b, #6*2 294 umlal v14.4s, v12.4h, v0.h[6] 295 umlal2 v15.4s, v12.8h, v0.h[6] 296 umlal v14.4s, v13.4h, v0.h[6] 297 umlal2 v15.4s, v13.8h, v0.h[6] 298 105: ext v12.16b, v8.16b, v9.16b, #3*2 299 ext v13.16b, v9.16b, v10.16b, #5*2 300 umlal v14.4s, v12.4h, v0.h[5] 301 umlal2 v15.4s, v12.8h, v0.h[5] 302 umlal v14.4s, v13.4h, v0.h[5] 303 umlal2 v15.4s, v13.8h, v0.h[5] 304 104: //ext v12.16b, v8.16b, v9.16b, #4*2 305 //ext v13.16b, v9.16b, v10.16b, #4*2 306 umlal2 v14.4s, v8.8h, v0.h[4] 307 umlal v15.4s, v9.4h, v0.h[4] 308 umlal2 v14.4s, v9.8h, v0.h[4] 309 umlal v15.4s, v10.4h, v0.h[4] 310 103: ext v12.16b, v8.16b, v9.16b, #5*2 311 ext v13.16b, v9.16b, v10.16b, #3*2 312 umlal v14.4s, v12.4h, v0.h[3] 313 umlal2 v15.4s, v12.8h, v0.h[3] 314 umlal v14.4s, v13.4h, v0.h[3] 315 umlal2 v15.4s, v13.8h, v0.h[3] 316 102: ext v12.16b, v8.16b, v9.16b, #6*2 317 ext v13.16b, v9.16b, v10.16b, #2*2 318 umlal v14.4s, v12.4h, v0.h[2] 319 umlal2 v15.4s, v12.8h, v0.h[2] 320 umlal v14.4s, v13.4h, v0.h[2] 321 umlal2 v15.4s, v13.8h, v0.h[2] 322 101: ext v12.16b, v8.16b, v9.16b, #7*2 323 ext v13.16b, v9.16b, v10.16b, #1*2 324 umlal v14.4s, v12.4h, v0.h[1] 325 umlal2 v15.4s, v12.8h, v0.h[1] 326 umlal v14.4s, v13.4h, v0.h[1] 327 umlal2 v15.4s, v13.8h, v0.h[1] 328 329 uqrshrn v14.4h, v14.4s, #16 330 uqrshrn2 v14.8h, v15.4s, #16 331 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 332 333 mov v8.16b, v9.16b 334 mov v9.16b, v10.16b 335 mov v10.16b, v11.16b 336.endm/*}}}*/ 337 338.macro hconv1_16/*{{{*/ 339 umull v14.4s, v8.4h, v0.h[0] 340 umull2 v15.4s, v8.8h, v0.h[0] 341 342 adr x16, 100f 343 ldrsh x12, [x16, x5, LSL #1] 344 add x12, x12, x16 345 br x12 346 100: .hword -4 347 .hword 101f-100b 348 .hword 102f-100b 349 .hword 103f-100b 350 .hword 104f-100b 351 .hword 105f-100b 352 .hword 106f-100b 353 .hword 107f-100b 354 .hword 108f-100b 355 .hword 109f-100b 356 .hword 110f-100b 357 .hword 111f-100b 358 .hword 112f-100b 359 .hword 113f-100b 360 .hword 114f-100b 361 .hword 115f-100b 362 .hword 116f-100b 363 .align 4 364 116: //ext v12.16b, v6.16b, v7.16b, #0*2 365 //ext v13.16b, v10.16b, v11.16b, #0*2 366 umlal v14.4s, v6.4h, v2.h[0] 367 umlal2 v15.4s, v6.8h, v2.h[0] 368 umlal v14.4s, v10.4h, v2.h[0] 369 umlal2 v15.4s, v10.8h, v2.h[0] 370 115: ext v12.16b, v6.16b, v7.16b, #1*2 371 ext v13.16b, v9.16b, v10.16b, #7*2 372 umlal v14.4s, v12.4h, v1.h[7] 373 umlal2 v15.4s, v12.8h, v1.h[7] 374 umlal v14.4s, v13.4h, v1.h[7] 375 umlal2 v15.4s, v13.8h, v1.h[7] 376 114: ext v12.16b, v6.16b, v7.16b, #2*2 377 ext v13.16b, v9.16b, v10.16b, #6*2 378 umlal v14.4s, v12.4h, v1.h[6] 379 umlal2 v15.4s, v12.8h, v1.h[6] 380 umlal v14.4s, v13.4h, v1.h[6] 381 umlal2 v15.4s, v13.8h, v1.h[6] 382 113: ext v12.16b, v6.16b, v7.16b, #3*2 383 ext v13.16b, v9.16b, v10.16b, #5*2 384 umlal v14.4s, v12.4h, v1.h[5] 385 umlal2 v15.4s, v12.8h, v1.h[5] 386 umlal v14.4s, v13.4h, v1.h[5] 387 umlal2 v15.4s, v13.8h, v1.h[5] 388 112: //ext v12.16b, v6.16b, v7.16b, #4*2 389 //ext v13.16b, v9.16b, v10.16b, #4*2 390 umlal2 v14.4s, v6.8h, v1.h[4] 391 umlal v15.4s, v7.4h, v1.h[4] 392 umlal2 v14.4s, v9.8h, v1.h[4] 393 umlal v15.4s, v10.4h, v1.h[4] 394 111: ext v12.16b, v6.16b, v7.16b, #5*2 395 ext v13.16b, v9.16b, v10.16b, #3*2 396 umlal v14.4s, v12.4h, v1.h[3] 397 umlal2 v15.4s, v12.8h, v1.h[3] 398 umlal v14.4s, v13.4h, v1.h[3] 399 umlal2 v15.4s, v13.8h, v1.h[3] 400 110: ext v12.16b, v6.16b, v7.16b, #6*2 401 ext v13.16b, v9.16b, v10.16b, #2*2 402 umlal v14.4s, v12.4h, v1.h[2] 403 umlal2 v15.4s, v12.8h, v1.h[2] 404 umlal v14.4s, v13.4h, v1.h[2] 405 umlal2 v15.4s, v13.8h, v1.h[2] 406 109: ext v12.16b, v6.16b, v7.16b, #7*2 407 ext v13.16b, v9.16b, v10.16b, #1*2 408 umlal v14.4s, v12.4h, v1.h[1] 409 umlal2 v15.4s, v12.8h, v1.h[1] 410 umlal v14.4s, v13.4h, v1.h[1] 411 umlal2 v15.4s, v13.8h, v1.h[1] 412 108: //ext v12.16b, v7.16b, v8.16b, #0*2 413 //ext v13.16b, v9.16b, v10.16b, #0*2 414 umlal v14.4s, v7.4h, v1.h[0] 415 umlal2 v15.4s, v7.8h, v1.h[0] 416 umlal v14.4s, v9.4h, v1.h[0] 417 umlal2 v15.4s, v9.8h, v1.h[0] 418 107: ext v12.16b, v7.16b, v8.16b, #1*2 419 ext v13.16b, v8.16b, v9.16b, #7*2 420 umlal v14.4s, v12.4h, v0.h[7] 421 umlal2 v15.4s, v12.8h, v0.h[7] 422 umlal v14.4s, v13.4h, v0.h[7] 423 umlal2 v15.4s, v13.8h, v0.h[7] 424 106: ext v12.16b, v7.16b, v8.16b, #2*2 425 ext v13.16b, v8.16b, v9.16b, #6*2 426 umlal v14.4s, v12.4h, v0.h[6] 427 umlal2 v15.4s, v12.8h, v0.h[6] 428 umlal v14.4s, v13.4h, v0.h[6] 429 umlal2 v15.4s, v13.8h, v0.h[6] 430 105: ext v12.16b, v7.16b, v8.16b, #3*2 431 ext v13.16b, v8.16b, v9.16b, #5*2 432 umlal v14.4s, v12.4h, v0.h[5] 433 umlal2 v15.4s, v12.8h, v0.h[5] 434 umlal v14.4s, v13.4h, v0.h[5] 435 umlal2 v15.4s, v13.8h, v0.h[5] 436 104: //ext v12.16b, v7.16b, v8.16b, #4*2 437 //ext v13.16b, v8.16b, v9.16b, #4*2 438 umlal2 v14.4s, v7.8h, v0.h[4] 439 umlal v15.4s, v8.4h, v0.h[4] 440 umlal2 v14.4s, v8.8h, v0.h[4] 441 umlal v15.4s, v9.4h, v0.h[4] 442 103: ext v12.16b, v7.16b, v8.16b, #5*2 443 ext v13.16b, v8.16b, v9.16b, #3*2 444 umlal v14.4s, v12.4h, v0.h[3] 445 umlal2 v15.4s, v12.8h, v0.h[3] 446 umlal v14.4s, v13.4h, v0.h[3] 447 umlal2 v15.4s, v13.8h, v0.h[3] 448 102: ext v12.16b, v7.16b, v8.16b, #6*2 449 ext v13.16b, v8.16b, v9.16b, #2*2 450 umlal v14.4s, v12.4h, v0.h[2] 451 umlal2 v15.4s, v12.8h, v0.h[2] 452 umlal v14.4s, v13.4h, v0.h[2] 453 umlal2 v15.4s, v13.8h, v0.h[2] 454 101: ext v12.16b, v7.16b, v8.16b, #7*2 455 ext v13.16b, v8.16b, v9.16b, #1*2 456 umlal v14.4s, v12.4h, v0.h[1] 457 umlal2 v15.4s, v12.8h, v0.h[1] 458 umlal v14.4s, v13.4h, v0.h[1] 459 umlal2 v15.4s, v13.8h, v0.h[1] 460 461 uqrshrn v14.4h, v14.4s, #16 462 uqrshrn2 v14.8h, v15.4s, #16 463 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 464 465 mov v6.16b, v7.16b 466 mov v7.16b, v8.16b 467 mov v8.16b, v9.16b 468 mov v9.16b, v10.16b 469 mov v10.16b, v11.16b 470.endm/*}}}*/ 471 472.macro hconv1_25/*{{{*/ 473 ext v12.16b, v6.16b, v7.16b, #7*2 474 umull v14.4s, v12.4h, v0.h[0] 475 umull2 v15.4s, v12.8h, v0.h[0] 476 477 adr x16, 100f 478 ldrsh x12, [x16, x5, LSL #1] 479 add x12, x12, x16 480 br x12 481 100: .hword -4 482 .hword 101f-100b 483 .hword 102f-100b 484 .hword 103f-100b 485 .hword 104f-100b 486 .hword 105f-100b 487 .hword 106f-100b 488 .hword 107f-100b 489 .hword 108f-100b 490 .hword 109f-100b 491 .hword 110f-100b 492 .hword 111f-100b 493 .hword 112f-100b 494 .hword 113f-100b 495 .hword 114f-100b 496 .hword 115f-100b 497 .hword 116f-100b 498 .hword 117f-100b 499 .hword 118f-100b 500 .hword 119f-100b 501 .hword 120f-100b 502 .hword 121f-100b 503 .hword 122f-100b 504 .hword 123f-100b 505 .hword 124f-100b 506 .hword 125f-100b 507 .align 4 508 125: ext v12.16b, v31.16b, v4.16b, #6*2 509 ext v13.16b, v10.16b, v11.16b, #0*2 510 umlal v14.4s, v12.4h, v3.h[1] 511 umlal2 v15.4s, v12.8h, v3.h[1] 512 umlal v14.4s, v13.4h, v3.h[1] 513 umlal2 v15.4s, v13.8h, v3.h[1] 514 124: ext v12.16b, v31.16b, v4.16b, #7*2 515 ext v13.16b, v9.16b, v10.16b, #7*2 516 umlal v14.4s, v12.4h, v3.h[0] 517 umlal2 v15.4s, v12.8h, v3.h[0] 518 umlal v14.4s, v13.4h, v3.h[0] 519 umlal2 v15.4s, v13.8h, v3.h[0] 520 123: ext v12.16b, v4.16b, v5.16b, #0*2 521 ext v13.16b, v9.16b, v10.16b, #6*2 522 umlal v14.4s, v12.4h, v2.h[7] 523 umlal2 v15.4s, v12.8h, v2.h[7] 524 umlal v14.4s, v13.4h, v2.h[7] 525 umlal2 v15.4s, v13.8h, v2.h[7] 526 122: ext v12.16b, v4.16b, v5.16b, #1*2 527 ext v13.16b, v9.16b, v10.16b, #5*2 528 umlal v14.4s, v12.4h, v2.h[6] 529 umlal2 v15.4s, v12.8h, v2.h[6] 530 umlal v14.4s, v13.4h, v2.h[6] 531 umlal2 v15.4s, v13.8h, v2.h[6] 532 121: ext v12.16b, v4.16b, v5.16b, #2*2 533 ext v13.16b, v9.16b, v10.16b, #4*2 534 umlal v14.4s, v12.4h, v2.h[5] 535 umlal2 v15.4s, v12.8h, v2.h[5] 536 umlal v14.4s, v13.4h, v2.h[5] 537 umlal2 v15.4s, v13.8h, v2.h[5] 538 120: ext v12.16b, v4.16b, v5.16b, #3*2 539 ext v13.16b, v9.16b, v10.16b, #3*2 540 umlal v14.4s, v12.4h, v2.h[4] 541 umlal2 v15.4s, v12.8h, v2.h[4] 542 umlal v14.4s, v13.4h, v2.h[4] 543 umlal2 v15.4s, v13.8h, v2.h[4] 544 119: ext v12.16b, v4.16b, v5.16b, #4*2 545 ext v13.16b, v9.16b, v10.16b, #2*2 546 umlal v14.4s, v12.4h, v2.h[3] 547 umlal2 v15.4s, v12.8h, v2.h[3] 548 umlal v14.4s, v13.4h, v2.h[3] 549 umlal2 v15.4s, v13.8h, v2.h[3] 550 118: ext v12.16b, v4.16b, v5.16b, #5*2 551 ext v13.16b, v9.16b, v10.16b, #1*2 552 umlal v14.4s, v12.4h, v2.h[2] 553 umlal2 v15.4s, v12.8h, v2.h[2] 554 umlal v14.4s, v13.4h, v2.h[2] 555 umlal2 v15.4s, v13.8h, v2.h[2] 556 117: ext v12.16b, v4.16b, v5.16b, #6*2 557 ext v13.16b, v9.16b, v10.16b, #0*2 558 umlal v14.4s, v12.4h, v2.h[1] 559 umlal2 v15.4s, v12.8h, v2.h[1] 560 umlal v14.4s, v13.4h, v2.h[1] 561 umlal2 v15.4s, v13.8h, v2.h[1] 562 116: ext v12.16b, v4.16b, v5.16b, #7*2 563 ext v13.16b, v8.16b, v9.16b, #7*2 564 umlal v14.4s, v12.4h, v2.h[0] 565 umlal2 v15.4s, v12.8h, v2.h[0] 566 umlal v14.4s, v13.4h, v2.h[0] 567 umlal2 v15.4s, v13.8h, v2.h[0] 568 115: ext v12.16b, v5.16b, v6.16b, #0*2 569 ext v13.16b, v8.16b, v9.16b, #6*2 570 umlal v14.4s, v12.4h, v1.h[7] 571 umlal2 v15.4s, v12.8h, v1.h[7] 572 umlal v14.4s, v13.4h, v1.h[7] 573 umlal2 v15.4s, v13.8h, v1.h[7] 574 114: ext v12.16b, v5.16b, v6.16b, #1*2 575 ext v13.16b, v8.16b, v9.16b, #5*2 576 umlal v14.4s, v12.4h, v1.h[6] 577 umlal2 v15.4s, v12.8h, v1.h[6] 578 umlal v14.4s, v13.4h, v1.h[6] 579 umlal2 v15.4s, v13.8h, v1.h[6] 580 113: ext v12.16b, v5.16b, v6.16b, #2*2 581 ext v13.16b, v8.16b, v9.16b, #4*2 582 umlal v14.4s, v12.4h, v1.h[5] 583 umlal2 v15.4s, v12.8h, v1.h[5] 584 umlal v14.4s, v13.4h, v1.h[5] 585 umlal2 v15.4s, v13.8h, v1.h[5] 586 112: ext v12.16b, v5.16b, v6.16b, #3*2 587 ext v13.16b, v8.16b, v9.16b, #3*2 588 umlal v14.4s, v12.4h, v1.h[4] 589 umlal2 v15.4s, v12.8h, v1.h[4] 590 umlal v14.4s, v13.4h, v1.h[4] 591 umlal2 v15.4s, v13.8h, v1.h[4] 592 111: ext v12.16b, v5.16b, v6.16b, #4*2 593 ext v13.16b, v8.16b, v9.16b, #2*2 594 umlal v14.4s, v12.4h, v1.h[3] 595 umlal2 v15.4s, v12.8h, v1.h[3] 596 umlal v14.4s, v13.4h, v1.h[3] 597 umlal2 v15.4s, v13.8h, v1.h[3] 598 110: ext v12.16b, v5.16b, v6.16b, #5*2 599 ext v13.16b, v8.16b, v9.16b, #1*2 600 umlal v14.4s, v12.4h, v1.h[2] 601 umlal2 v15.4s, v12.8h, v1.h[2] 602 umlal v14.4s, v13.4h, v1.h[2] 603 umlal2 v15.4s, v13.8h, v1.h[2] 604 109: ext v12.16b, v5.16b, v6.16b, #6*2 605 ext v13.16b, v8.16b, v9.16b, #0*2 606 umlal v14.4s, v12.4h, v1.h[1] 607 umlal2 v15.4s, v12.8h, v1.h[1] 608 umlal v14.4s, v13.4h, v1.h[1] 609 umlal2 v15.4s, v13.8h, v1.h[1] 610 108: ext v12.16b, v5.16b, v6.16b, #7*2 611 ext v13.16b, v7.16b, v8.16b, #7*2 612 umlal v14.4s, v12.4h, v1.h[0] 613 umlal2 v15.4s, v12.8h, v1.h[0] 614 umlal v14.4s, v13.4h, v1.h[0] 615 umlal2 v15.4s, v13.8h, v1.h[0] 616 107: ext v12.16b, v6.16b, v7.16b, #0*2 617 ext v13.16b, v7.16b, v8.16b, #6*2 618 umlal v14.4s, v12.4h, v0.h[7] 619 umlal2 v15.4s, v12.8h, v0.h[7] 620 umlal v14.4s, v13.4h, v0.h[7] 621 umlal2 v15.4s, v13.8h, v0.h[7] 622 106: ext v12.16b, v6.16b, v7.16b, #1*2 623 ext v13.16b, v7.16b, v8.16b, #5*2 624 umlal v14.4s, v12.4h, v0.h[6] 625 umlal2 v15.4s, v12.8h, v0.h[6] 626 umlal v14.4s, v13.4h, v0.h[6] 627 umlal2 v15.4s, v13.8h, v0.h[6] 628 105: ext v12.16b, v6.16b, v7.16b, #2*2 629 ext v13.16b, v7.16b, v8.16b, #4*2 630 umlal v14.4s, v12.4h, v0.h[5] 631 umlal2 v15.4s, v12.8h, v0.h[5] 632 umlal v14.4s, v13.4h, v0.h[5] 633 umlal2 v15.4s, v13.8h, v0.h[5] 634 104: ext v12.16b, v6.16b, v7.16b, #3*2 635 ext v13.16b, v7.16b, v8.16b, #3*2 636 umlal v14.4s, v12.4h, v0.h[4] 637 umlal2 v15.4s, v12.8h, v0.h[4] 638 umlal v14.4s, v13.4h, v0.h[4] 639 umlal2 v15.4s, v13.8h, v0.h[4] 640 103: ext v12.16b, v6.16b, v7.16b, #4*2 641 ext v13.16b, v7.16b, v8.16b, #2*2 642 umlal v14.4s, v12.4h, v0.h[3] 643 umlal2 v15.4s, v12.8h, v0.h[3] 644 umlal v14.4s, v13.4h, v0.h[3] 645 umlal2 v15.4s, v13.8h, v0.h[3] 646 102: ext v12.16b, v6.16b, v7.16b, #5*2 647 ext v13.16b, v7.16b, v8.16b, #1*2 648 umlal v14.4s, v12.4h, v0.h[2] 649 umlal2 v15.4s, v12.8h, v0.h[2] 650 umlal v14.4s, v13.4h, v0.h[2] 651 umlal2 v15.4s, v13.8h, v0.h[2] 652 101: ext v12.16b, v6.16b, v7.16b, #6*2 653 ext v13.16b, v7.16b, v8.16b, #0*2 654 umlal v14.4s, v12.4h, v0.h[1] 655 umlal2 v15.4s, v12.8h, v0.h[1] 656 umlal v14.4s, v13.4h, v0.h[1] 657 umlal2 v15.4s, v13.8h, v0.h[1] 658 659 uqrshrn v14.4h, v14.4s, #16 660 uqrshrn2 v14.8h, v15.4s, #16 661 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 662 663 mov v31.16b, v4.16b 664 mov v4.16b, v5.16b 665 mov v5.16b, v6.16b 666 mov v6.16b, v7.16b 667 mov v7.16b, v8.16b 668 mov v8.16b, v9.16b 669 mov v9.16b, v10.16b 670 mov v10.16b, v11.16b 671.endm/*}}}*/ 672 673#define TUNED_LIST4 6, 12, 20 674.macro hconv4_6/*{{{*/ 675 umull v14.4s, v7.4h, v0.h[0] 676 umull2 v15.4s, v7.8h, v0.h[0] 677 678 adr x16, 100f 679 ldrsh x12, [x16, x5, LSL #1] 680 add x12, x12, x16 681 br x12 682 100: .hword -4 683 .hword 101f-100b 684 .hword 102f-100b 685 .hword 103f-100b 686 .hword 104f-100b 687 .hword 105f-100b 688 .hword 106f-100b 689 .align 4 690 106: umlal v14.4s, v4.4h, v0.h[6] 691 umlal2 v15.4s, v4.8h, v0.h[6] 692 umlal v14.4s, v10.4h, v0.h[6] 693 umlal2 v15.4s, v10.8h, v0.h[6] 694 105: umlal2 v14.4s, v4.8h, v0.h[5] 695 umlal v15.4s, v5.4h, v0.h[5] 696 umlal2 v14.4s, v9.8h, v0.h[5] 697 umlal v15.4s, v10.4h, v0.h[5] 698 104: umlal v14.4s, v5.4h, v0.h[4] 699 umlal2 v15.4s, v5.8h, v0.h[4] 700 umlal v14.4s, v9.4h, v0.h[4] 701 umlal2 v15.4s, v9.8h, v0.h[4] 702 103: umlal2 v14.4s, v5.8h, v0.h[3] 703 umlal v15.4s, v6.4h, v0.h[3] 704 umlal2 v14.4s, v8.8h, v0.h[3] 705 umlal v15.4s, v9.4h, v0.h[3] 706 102: umlal v14.4s, v6.4h, v0.h[2] 707 umlal2 v15.4s, v6.8h, v0.h[2] 708 umlal v14.4s, v8.4h, v0.h[2] 709 umlal2 v15.4s, v8.8h, v0.h[2] 710 101: umlal2 v14.4s, v6.8h, v0.h[1] 711 umlal v15.4s, v7.4h, v0.h[1] 712 umlal2 v14.4s, v7.8h, v0.h[1] 713 umlal v15.4s, v8.4h, v0.h[1] 714 715 uqrshrn v14.4h, v14.4s, #16 716 uqrshrn2 v14.8h, v15.4s, #16 717 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 718 719 mov v4.16b, v5.16b 720 mov v5.16b, v6.16b 721 mov v6.16b, v7.16b 722 mov v7.16b, v8.16b 723 mov v8.16b, v9.16b 724 mov v9.16b, v10.16b 725 mov v10.16b, v11.16b 726.endm/*}}}*/ 727 728.macro hconv4_12/*{{{*/ 729 umull v14.4s, v4.4h, v0.h[0] 730 umull2 v15.4s, v4.8h, v0.h[0] 731 732 adr x16, 100f 733 ldrsh x12, [x16, x5, LSL #1] 734 add x12, x12, x16 735 br x12 736 100: .hword -4 737 .hword 101f-100b 738 .hword 102f-100b 739 .hword 103f-100b 740 .hword 104f-100b 741 .hword 105f-100b 742 .hword 106f-100b 743 .hword 107f-100b 744 .hword 108f-100b 745 .hword 109f-100b 746 .hword 110f-100b 747 .hword 111f-100b 748 .hword 112f-100b 749 .align 4 750 112: umlal v14.4s, v26.4h, v1.h[4] 751 umlal2 v15.4s, v26.8h, v1.h[4] 752 umlal v14.4s, v10.4h, v1.h[4] 753 umlal2 v15.4s, v10.8h, v1.h[4] 754 111: umlal2 v14.4s, v26.8h, v1.h[3] 755 umlal v15.4s, v27.4h, v1.h[3] 756 umlal2 v14.4s, v9.8h, v1.h[3] 757 umlal v15.4s, v10.4h, v1.h[3] 758 110: umlal v14.4s, v27.4h, v1.h[2] 759 umlal2 v15.4s, v27.8h, v1.h[2] 760 umlal v14.4s, v9.4h, v1.h[2] 761 umlal2 v15.4s, v9.8h, v1.h[2] 762 109: umlal2 v14.4s, v27.8h, v1.h[1] 763 umlal v15.4s, v28.4h, v1.h[1] 764 umlal2 v14.4s, v8.8h, v1.h[1] 765 umlal v15.4s, v9.4h, v1.h[1] 766 108: umlal v14.4s, v28.4h, v1.h[0] 767 umlal2 v15.4s, v28.8h, v1.h[0] 768 umlal v14.4s, v8.4h, v1.h[0] 769 umlal2 v15.4s, v8.8h, v1.h[0] 770 107: umlal2 v14.4s, v28.8h, v0.h[7] 771 umlal v15.4s, v29.4h, v0.h[7] 772 umlal2 v14.4s, v7.8h, v0.h[7] 773 umlal v15.4s, v8.4h, v0.h[7] 774 106: umlal v14.4s, v29.4h, v0.h[6] 775 umlal2 v15.4s, v29.8h, v0.h[6] 776 umlal v14.4s, v7.4h, v0.h[6] 777 umlal2 v15.4s, v7.8h, v0.h[6] 778 105: umlal2 v14.4s, v29.8h, v0.h[5] 779 umlal v15.4s, v30.4h, v0.h[5] 780 umlal2 v14.4s, v6.8h, v0.h[5] 781 umlal v15.4s, v7.4h, v0.h[5] 782 104: umlal v14.4s, v30.4h, v0.h[4] 783 umlal2 v15.4s, v30.8h, v0.h[4] 784 umlal v14.4s, v6.4h, v0.h[4] 785 umlal2 v15.4s, v6.8h, v0.h[4] 786 103: umlal2 v14.4s, v30.8h, v0.h[3] 787 umlal v15.4s, v31.4h, v0.h[3] 788 umlal2 v14.4s, v5.8h, v0.h[3] 789 umlal v15.4s, v6.4h, v0.h[3] 790 102: umlal v14.4s, v31.4h, v0.h[2] 791 umlal2 v15.4s, v31.8h, v0.h[2] 792 umlal v14.4s, v5.4h, v0.h[2] 793 umlal2 v15.4s, v5.8h, v0.h[2] 794 101: umlal2 v14.4s, v31.8h, v0.h[1] 795 umlal v15.4s, v4.4h, v0.h[1] 796 umlal2 v14.4s, v4.8h, v0.h[1] 797 umlal v15.4s, v5.4h, v0.h[1] 798 799 uqrshrn v14.4h, v14.4s, #16 800 uqrshrn2 v14.8h, v15.4s, #16 801 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 802 803 mov v26.16b, v27.16b 804 mov v27.16b, v28.16b 805 mov v28.16b, v29.16b 806 mov v29.16b, v30.16b 807 mov v30.16b, v31.16b 808 mov v31.16b, v4.16b 809 mov v4.16b, v5.16b 810 mov v5.16b, v6.16b 811 mov v6.16b, v7.16b 812 mov v7.16b, v8.16b 813 mov v8.16b, v9.16b 814 mov v9.16b, v10.16b 815 mov v10.16b, v11.16b 816.endm/*}}}*/ 817 818.macro hconv4_20/*{{{*/ 819 umull v14.4s, v28.4h, v0.h[0] 820 umull2 v15.4s, v28.8h, v0.h[0] 821 822 adr x16, 100f 823 ldrsh x12, [x16, x5, LSL #1] 824 add x12, x12, x16 825 br x12 826 100: .hword -4 827 .hword 101f-100b 828 .hword 102f-100b 829 .hword 103f-100b 830 .hword 104f-100b 831 .hword 105f-100b 832 .hword 106f-100b 833 .hword 107f-100b 834 .hword 108f-100b 835 .hword 109f-100b 836 .hword 110f-100b 837 .hword 111f-100b 838 .hword 112f-100b 839 .hword 113f-100b 840 .hword 114f-100b 841 .hword 115f-100b 842 .hword 116f-100b 843 .hword 117f-100b 844 .hword 118f-100b 845 .hword 119f-100b 846 .hword 120f-100b 847 .align 4 848 849 120: umlal v14.4s, v18.4h, v2.h[4] 850 umlal2 v15.4s, v18.8h, v2.h[4] 851 umlal v14.4s, v10.4h, v2.h[4] 852 umlal2 v15.4s, v10.8h, v2.h[4] 853 119: umlal2 v14.4s, v18.8h, v2.h[3] 854 umlal v15.4s, v19.4h, v2.h[3] 855 umlal2 v14.4s, v9.8h, v2.h[3] 856 umlal v15.4s, v10.4h, v2.h[3] 857 118: umlal v14.4s, v19.4h, v2.h[2] 858 umlal2 v15.4s, v19.8h, v2.h[2] 859 umlal v14.4s, v9.4h, v2.h[2] 860 umlal2 v15.4s, v9.8h, v2.h[2] 861 117: umlal2 v14.4s, v19.8h, v2.h[1] 862 umlal v15.4s, v20.4h, v2.h[1] 863 umlal2 v14.4s, v8.8h, v2.h[1] 864 umlal v15.4s, v9.4h, v2.h[1] 865 116: umlal v14.4s, v20.4h, v2.h[0] 866 umlal2 v15.4s, v20.8h, v2.h[0] 867 umlal v14.4s, v8.4h, v2.h[0] 868 umlal2 v15.4s, v8.8h, v2.h[0] 869 115: umlal2 v14.4s, v20.8h, v1.h[7] 870 umlal v15.4s, v21.4h, v1.h[7] 871 umlal2 v14.4s, v7.8h, v1.h[7] 872 umlal v15.4s, v8.4h, v1.h[7] 873 114: umlal v14.4s, v21.4h, v1.h[6] 874 umlal2 v15.4s, v21.8h, v1.h[6] 875 umlal v14.4s, v7.4h, v1.h[6] 876 umlal2 v15.4s, v7.8h, v1.h[6] 877 113: umlal2 v14.4s, v21.8h, v1.h[5] 878 umlal v15.4s, v22.4h, v1.h[5] 879 umlal2 v14.4s, v6.8h, v1.h[5] 880 umlal v15.4s, v7.4h, v1.h[5] 881 112: umlal v14.4s, v22.4h, v1.h[4] 882 umlal2 v15.4s, v22.8h, v1.h[4] 883 umlal v14.4s, v6.4h, v1.h[4] 884 umlal2 v15.4s, v6.8h, v1.h[4] 885 111: umlal2 v14.4s, v22.8h, v1.h[3] 886 umlal v15.4s, v23.4h, v1.h[3] 887 umlal2 v14.4s, v5.8h, v1.h[3] 888 umlal v15.4s, v6.4h, v1.h[3] 889 110: umlal v14.4s, v23.4h, v1.h[2] 890 umlal2 v15.4s, v23.8h, v1.h[2] 891 umlal v14.4s, v5.4h, v1.h[2] 892 umlal2 v15.4s, v5.8h, v1.h[2] 893 109: umlal2 v14.4s, v23.8h, v1.h[1] 894 umlal v15.4s, v24.4h, v1.h[1] 895 umlal2 v14.4s, v4.8h, v1.h[1] 896 umlal v15.4s, v5.4h, v1.h[1] 897 108: umlal v14.4s, v24.4h, v1.h[0] 898 umlal2 v15.4s, v24.8h, v1.h[0] 899 umlal v14.4s, v4.4h, v1.h[0] 900 umlal2 v15.4s, v4.8h, v1.h[0] 901 107: umlal2 v14.4s, v24.8h, v0.h[7] 902 umlal v15.4s, v25.4h, v0.h[7] 903 umlal2 v14.4s, v31.8h, v0.h[7] 904 umlal v15.4s, v4.4h, v0.h[7] 905 106: umlal v14.4s, v25.4h, v0.h[6] 906 umlal2 v15.4s, v25.8h, v0.h[6] 907 umlal v14.4s, v31.4h, v0.h[6] 908 umlal2 v15.4s, v31.8h, v0.h[6] 909 105: umlal2 v14.4s, v25.8h, v0.h[5] 910 umlal v15.4s, v26.4h, v0.h[5] 911 umlal2 v14.4s, v30.8h, v0.h[5] 912 umlal v15.4s, v31.4h, v0.h[5] 913 104: umlal v14.4s, v26.4h, v0.h[4] 914 umlal2 v15.4s, v26.8h, v0.h[4] 915 umlal v14.4s, v30.4h, v0.h[4] 916 umlal2 v15.4s, v30.8h, v0.h[4] 917 103: umlal2 v14.4s, v26.8h, v0.h[3] 918 umlal v15.4s, v27.4h, v0.h[3] 919 umlal2 v14.4s, v29.8h, v0.h[3] 920 umlal v15.4s, v30.4h, v0.h[3] 921 102: umlal v14.4s, v27.4h, v0.h[2] 922 umlal2 v15.4s, v27.8h, v0.h[2] 923 umlal v14.4s, v29.4h, v0.h[2] 924 umlal2 v15.4s, v29.8h, v0.h[2] 925 101: umlal2 v14.4s, v27.8h, v0.h[1] 926 umlal v15.4s, v28.4h, v0.h[1] 927 umlal2 v14.4s, v28.8h, v0.h[1] 928 umlal v15.4s, v29.4h, v0.h[1] 929 930 uqrshrn v14.4h, v14.4s, #16 931 uqrshrn2 v14.8h, v15.4s, #16 932 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 933 934 mov v18.16b, v19.16b 935 mov v19.16b, v20.16b 936 mov v20.16b, v21.16b 937 mov v21.16b, v22.16b 938 mov v22.16b, v23.16b 939 mov v23.16b, v24.16b 940 mov v24.16b, v25.16b 941 mov v25.16b, v26.16b 942 mov v26.16b, v27.16b 943 mov v27.16b, v28.16b 944 mov v28.16b, v29.16b 945 mov v29.16b, v30.16b 946 mov v30.16b, v31.16b 947 mov v31.16b, v4.16b 948 mov v4.16b, v5.16b 949 mov v5.16b, v6.16b 950 mov v6.16b, v7.16b 951 mov v7.16b, v8.16b 952 mov v8.16b, v9.16b 953 mov v9.16b, v10.16b 954 mov v10.16b, v11.16b 955.endm/*}}}*/ 956 957.macro hconv4_25/*{{{*/ 958 umull2 v14.4s, v25.8h, v0.h[0] 959 umull v15.4s, v26.4h, v0.h[0] 960 961 adr x16, 100f 962 ldrsh x12, [x16, x5, LSL #1] 963 add x12, x12, x16 964 br x12 965 100: .hword -4 966 .hword 101f-100b 967 .hword 102f-100b 968 .hword 103f-100b 969 .hword 104f-100b 970 .hword 105f-100b 971 .hword 106f-100b 972 .hword 107f-100b 973 .hword 108f-100b 974 .hword 109f-100b 975 .hword 110f-100b 976 .hword 111f-100b 977 .hword 112f-100b 978 .hword 113f-100b 979 .hword 114f-100b 980 .hword 115f-100b 981 .hword 116f-100b 982 .hword 117f-100b 983 .hword 118f-100b 984 .hword 119f-100b 985 .hword 120f-100b 986 .hword 121f-100b 987 .hword 122f-100b 988 .hword 123f-100b 989 .hword 124f-100b 990 .hword 125f-100b 991 .align 4 992 993 125: ld1 {v12.8h}, [x9] 994 umlal v14.4s, v12.4h, v3.h[1] 995 umlal2 v15.4s, v12.8h, v3.h[1] 996 umlal v14.4s, v10.4h, v3.h[1] 997 umlal2 v15.4s, v10.8h, v3.h[1] 998 124: add x12, x9, #0x08 999 bic x12, x12, #0x40 1000 ld1 {v12.4h}, [x12], #8 1001 bic x12, x12, #0x40 1002 ld1 {v13.4h}, [x12] 1003 umlal v14.4s, v12.4h, v3.h[0] 1004 umlal v15.4s, v13.4h, v3.h[0] 1005 umlal2 v14.4s, v9.8h, v3.h[0] 1006 umlal v15.4s, v10.4h, v3.h[0] 1007 123: add x12, x9, #0x10 1008 bic x12, x12, #0x40 1009 ld1 {v12.8h}, [x12] 1010 umlal v14.4s, v12.4h, v2.h[7] 1011 umlal2 v15.4s, v12.8h, v2.h[7] 1012 umlal v14.4s, v9.4h, v2.h[7] 1013 umlal2 v15.4s, v9.8h, v2.h[7] 1014 122: add x12, x9, #0x18 1015 bic x12, x12, #0x40 1016 ld1 {v12.4h}, [x12], #8 1017 bic x12, x12, #0x40 1018 ld1 {v13.4h}, [x12] 1019 umlal v14.4s, v12.4h, v2.h[6] 1020 umlal v15.4s, v13.4h, v2.h[6] 1021 umlal2 v14.4s, v8.8h, v2.h[6] 1022 umlal v15.4s, v9.4h, v2.h[6] 1023 121: add x12, x9, #0x20 1024 bic x12, x12, #0x40 1025 ld1 {v12.8h}, [x12] 1026 umlal v14.4s, v12.4h, v2.h[5] 1027 umlal2 v15.4s, v12.8h, v2.h[5] 1028 umlal v14.4s, v8.4h, v2.h[5] 1029 umlal2 v15.4s, v8.8h, v2.h[5] 1030 120: add x12, x9, #0x28 1031 bic x12, x12, #0x40 1032 ld1 {v12.4h}, [x12], #8 1033 bic x12, x12, #0x40 1034 ld1 {v13.4h}, [x12] 1035 umlal v14.4s, v12.4h, v2.h[4] 1036 umlal v15.4s, v13.4h, v2.h[4] 1037 umlal2 v14.4s, v7.8h, v2.h[4] 1038 umlal v15.4s, v8.4h, v2.h[4] 1039 119: add x12, x9, #0x30 1040 bic x12, x12, #0x40 1041 ld1 {v12.8h}, [x12] 1042 umlal v14.4s, v12.4h, v2.h[3] 1043 umlal2 v15.4s, v12.8h, v2.h[3] 1044 umlal v14.4s, v7.4h, v2.h[3] 1045 umlal2 v15.4s, v7.8h, v2.h[3] 1046 118: add x12, x9, #0x38 1047 bic x12, x12, #0x40 1048 ld1 {v12.4h}, [x12] 1049 umlal v14.4s, v12.4h, v2.h[2] 1050 umlal v15.4s, v17.4h, v2.h[2] 1051 umlal2 v14.4s, v6.8h, v2.h[2] 1052 umlal v15.4s, v7.4h, v2.h[2] 1053 117: umlal v14.4s, v17.4h, v2.h[1] 1054 umlal2 v15.4s, v17.8h, v2.h[1] 1055 umlal v14.4s, v6.4h, v2.h[1] 1056 umlal2 v15.4s, v6.8h, v2.h[1] 1057 116: umlal2 v14.4s, v17.8h, v2.h[0] 1058 umlal v15.4s, v18.4h, v2.h[0] 1059 umlal2 v14.4s, v5.8h, v2.h[0] 1060 umlal v15.4s, v6.4h, v2.h[0] 1061 115: umlal v14.4s, v18.4h, v1.h[7] 1062 umlal2 v15.4s, v18.8h, v1.h[7] 1063 umlal v14.4s, v5.4h, v1.h[7] 1064 umlal2 v15.4s, v5.8h, v1.h[7] 1065 114: umlal2 v14.4s, v18.8h, v1.h[6] 1066 umlal v15.4s, v19.4h, v1.h[6] 1067 umlal2 v14.4s, v4.8h, v1.h[6] 1068 umlal v15.4s, v5.4h, v1.h[6] 1069 113: umlal v14.4s, v19.4h, v1.h[5] 1070 umlal2 v15.4s, v19.8h, v1.h[5] 1071 umlal v14.4s, v4.4h, v1.h[5] 1072 umlal2 v15.4s, v4.8h, v1.h[5] 1073 112: umlal2 v14.4s, v19.8h, v1.h[4] 1074 umlal v15.4s, v20.4h, v1.h[4] 1075 umlal2 v14.4s, v31.8h, v1.h[4] 1076 umlal v15.4s, v4.4h, v1.h[4] 1077 111: umlal v14.4s, v20.4h, v1.h[3] 1078 umlal2 v15.4s, v20.8h, v1.h[3] 1079 umlal v14.4s, v31.4h, v1.h[3] 1080 umlal2 v15.4s, v31.8h, v1.h[3] 1081 110: umlal2 v14.4s, v20.8h, v1.h[2] 1082 umlal v15.4s, v21.4h, v1.h[2] 1083 umlal2 v14.4s, v30.8h, v1.h[2] 1084 umlal v15.4s, v31.4h, v1.h[2] 1085 109: umlal v14.4s, v21.4h, v1.h[1] 1086 umlal2 v15.4s, v21.8h, v1.h[1] 1087 umlal v14.4s, v30.4h, v1.h[1] 1088 umlal2 v15.4s, v30.8h, v1.h[1] 1089 108: umlal2 v14.4s, v21.8h, v1.h[0] 1090 umlal v15.4s, v22.4h, v1.h[0] 1091 umlal2 v14.4s, v29.8h, v1.h[0] 1092 umlal v15.4s, v30.4h, v1.h[0] 1093 107: umlal v14.4s, v22.4h, v0.h[7] 1094 umlal2 v15.4s, v22.8h, v0.h[7] 1095 umlal v14.4s, v29.4h, v0.h[7] 1096 umlal2 v15.4s, v29.8h, v0.h[7] 1097 106: umlal2 v14.4s, v22.8h, v0.h[6] 1098 umlal v15.4s, v23.4h, v0.h[6] 1099 umlal2 v14.4s, v28.8h, v0.h[6] 1100 umlal v15.4s, v29.4h, v0.h[6] 1101 105: umlal v14.4s, v23.4h, v0.h[5] 1102 umlal2 v15.4s, v23.8h, v0.h[5] 1103 umlal v14.4s, v28.4h, v0.h[5] 1104 umlal2 v15.4s, v28.8h, v0.h[5] 1105 104: umlal2 v14.4s, v23.8h, v0.h[4] 1106 umlal v15.4s, v24.4h, v0.h[4] 1107 umlal2 v14.4s, v27.8h, v0.h[4] 1108 umlal v15.4s, v28.4h, v0.h[4] 1109 103: umlal v14.4s, v24.4h, v0.h[3] 1110 umlal2 v15.4s, v24.8h, v0.h[3] 1111 umlal v14.4s, v27.4h, v0.h[3] 1112 umlal2 v15.4s, v27.8h, v0.h[3] 1113 102: umlal2 v14.4s, v24.8h, v0.h[2] 1114 umlal v15.4s, v25.4h, v0.h[2] 1115 umlal2 v14.4s, v26.8h, v0.h[2] 1116 umlal v15.4s, v27.4h, v0.h[2] 1117 101: umlal v14.4s, v25.4h, v0.h[1] 1118 umlal2 v15.4s, v25.8h, v0.h[1] 1119 umlal v14.4s, v26.4h, v0.h[1] 1120 umlal2 v15.4s, v26.8h, v0.h[1] 1121 1122 uqrshrn v14.4h, v14.4s, #16 1123 uqrshrn2 v14.8h, v15.4s, #16 1124 uqrshrn v15.8b, v14.8h, #FRACTION_BITS 1125 1126 st1 {v17.16b}, [x9], #16 1127 bic x9, x9, #0x40 1128 mov v17.16b, v18.16b 1129 mov v18.16b, v19.16b 1130 mov v19.16b, v20.16b 1131 mov v20.16b, v21.16b 1132 mov v21.16b, v22.16b 1133 mov v22.16b, v23.16b 1134 mov v23.16b, v24.16b 1135 mov v24.16b, v25.16b 1136 mov v25.16b, v26.16b 1137 mov v26.16b, v27.16b 1138 mov v27.16b, v28.16b 1139 mov v28.16b, v29.16b 1140 mov v29.16b, v30.16b 1141 mov v30.16b, v31.16b 1142 mov v31.16b, v4.16b 1143 mov v4.16b, v5.16b 1144 mov v5.16b, v6.16b 1145 mov v6.16b, v7.16b 1146 mov v7.16b, v8.16b 1147 mov v8.16b, v9.16b 1148 mov v9.16b, v10.16b 1149 mov v10.16b, v11.16b 1150.endm/*}}}*/ 1151 1152/* Dedicated function wrapper for the fetch macro, for the cases where 1153 * performance isn't that important, to keep code size down. 1154 */ 1155PRIVATE(fetch_generic_asm) 1156 stp x10, x11, [sp, #-16]! 1157 fetch 1158 ldp x10, x11, [sp], #16 1159 ret 1160END(fetch_generic_asm) 1161 1162 1163/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory 1164 * beyond that limit, and filling the rest of the vector with the last legal 1165 * pixel. 1166 * Result is in v10 and v11. v8 and v9 are filled with the first legal pixel. 1167 * Note: This function can read beyond the right edge of input if the image is 1168 * narrower than 16 bytes. 1169 */ 1170PRIVATE(fetch_clampleft1) 1171 stp x29, x30, [sp, #-16]! 1172 bl fetch_generic_asm 1173 dup v8.8h, v10.h[0] 1174 dup v9.8h, v10.h[0] 1175 ands x12, x10, #15 1176 beq 1f 1177 sub x1, x1, x12 1178 sub x15, x15, x12 1179 sub x19, x19, x12 1180 sub x10, x10, x12 1181 sub x12, sp, x12, LSL #1 1182 sub sp, sp, #64 1183 sub x12, x12, #32 1184 st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] 1185 ld1 {v10.8h,v11.8h}, [x12] 1186 add sp, sp, #64 11871: ldp x29, x30, [sp], #16 1188 ret 1189END(fetch_clampleft1) 1190 1191PRIVATE(fetch_clampleft4) 1192 stp x29, x30, [sp, #-16]! 1193 bl fetch_generic_asm 1194 dup v8.2d, v10.d[0] 1195 dup v9.2d, v10.d[0] 1196 ands x12, x10, #15 1197 beq 1f 1198 sub x1, x1, x12 1199 sub x15, x15, x12 1200 sub x19, x19, x12 1201 sub x10, x10, x12 1202 sub x12, sp, x12, LSL #1 1203 sub sp, sp, #64 1204 sub x12, x12, #32 1205 st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] 1206 ld1 {v10.8h,v11.8h}, [x12] 1207 add sp, sp, #64 12081: ldp x29, x30, [sp], #16 1209 ret 1210END(fetch_clampleft4) 1211 1212/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding 1213 * reading memory beyond that limit, and filling the rest of the vector with 1214 * the last legal pixel. 1215 * Result is in v10 and v11. v12 and v13 are filled with the last legal pixel. 1216 * Note: This function can read beyond the left edge of input if the image is 1217 * narrower than 16 bytes. 1218 */ 1219PRIVATE(fetch_clampright1) 1220 stp x29, x30, [sp, #-16]! 1221 sub x12, xzr, x11 1222 ands x12, x12, #15 1223 beq 1f 1224 sub x1, x1, x12 1225 sub x15, x15, x12 1226 sub x19, x19, x12 1227 bl fetch_generic_asm 1228 dup v12.8h, v11.h[7] 1229 dup v13.8h, v11.h[7] 1230 sub x12, xzr, x11 1231 and x12, x12, #15 1232 sub sp, sp, #64 1233 add x12, sp, x12, LSL #1 1234 st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp] 1235 ld1 {v10.8h,v11.8h}, [x12] 1236 add sp, sp, #64 1237 ldp x29, x30, [sp], #16 1238 ret 12391: bl fetch_generic_asm 1240 dup v12.8h, v11.h[7] 1241 dup v13.8h, v11.h[7] 1242 ldp x29, x30, [sp], #16 1243 ret 1244END(fetch_clampright1) 1245 1246PRIVATE(fetch_clampright4) 1247 stp x29, x30, [sp, #-16]! 1248 sub x12, xzr, x11 1249 ands x12, x12, #15 1250 beq 1f 1251 sub x1, x1, x12 1252 sub x15, x15, x12 1253 sub x19, x19, x12 1254 bl fetch_generic_asm 1255 dup v12.2d, v11.d[1] 1256 dup v13.2d, v11.d[1] 1257 sub x12, xzr, x11 1258 and x12, x12, #15 1259 sub sp, sp, #64 1260 add x12, sp, x12, LSL #1 1261 st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp] 1262 ld1 {v10.8h,v11.8h}, [x12] 1263 add sp, sp, #64 1264 ldp x29, x30, [sp], #16 1265 ret 12661: bl fetch_generic_asm 1267 dup v12.2d, v11.d[1] 1268 dup v13.2d, v11.d[1] 1269 ldp x29, x30, [sp], #16 1270 ret 1271END(fetch_clampright4) 1272 1273/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th 1274 * value across to fill the rest of the register pair. Used for filling the 1275 * right hand edge of the window when reading too close to the right hand edge 1276 * of the image. 1277 * Also returns a dup-ed copy of the last element in v12 for the tail-fill 1278 * case (this happens incidentally in common path, but must be done 1279 * deliberately in the fast-out path). 1280 */ 1281PRIVATE(prefill_sweepright1) 1282 ands x12, x11, #15 1283 beq 1f 1284 sub x12, x12, #1 1285 sub sp, sp, #64 1286 st1 {v10.8h,v11.8h}, [sp] 1287 add x12, sp, x12, LSL #1 1288 ld1r {v12.8h}, [x12] 1289 ld1r {v13.8h}, [x12] 1290 st1 {v12.8h,v13.8h}, [x12] 1291 ld1 {v10.8h,v11.8h}, [sp] 1292 add sp, sp, #64 1293 ret 12941: dup v12.8h, v11.h[7] 1295 dup v13.8h, v11.h[7] 1296 ret 1297END(prefill_sweepright1) 1298 1299PRIVATE(prefill_sweepright4) 1300 ands x12, x11, #15 1301 beq 1f 1302 sub x12, x12, #4 1303 sub sp, sp, #64 1304 st1 {v10.8h,v11.8h}, [sp] 1305 add x12, sp, x12, LSL #1 1306 ld1r {v12.2d}, [x12] 1307 st1 {v13.8h}, [x12] 1308 ld1 {v10.8h,v11.8h}, [sp] 1309 add sp, sp, #64 1310 ret 13111: dup v12.2d, v11.d[1] 1312 dup v13.2d, v11.d[1] 1313 ret 1314END(prefill_sweepright4) 1315 1316/* The main loop keeps a sliding window of data that has already been convolved 1317 * in the vertical axis for the current line. This usually stays in the 1318 * register file, but spills to memory for large windows. The first thing that 1319 * needs to be done at start-up is to fill this window with image data, taking 1320 * into account the padding needed if the left or right edges of the image fall 1321 * within this window. 1322 */ 1323 1324/* Because the window is in the register file writes to it cannot be indexed 1325 * by another register. Consequently the fill loops are unrolled to address 1326 * the registers directly. This macro distinguishes between writes to the 1327 * register file and writes to the spill buffer (indicated by a destination 1328 * register named xx). 1329 */ 1330.macro prefill_out ra, rb, sra, srb 1331 .ifc \ra,xx 1332 .ifc \rb,xx 1333 st1 {\sra,\srb}, [x9], #32 1334 .else 1335 bic x9, x9, #0x40 1336 st1 {\sra}, [x9], #16 1337 mov \rb, \srb 1338 .endif 1339 .else 1340 .ifnc \ra,\sra 1341 mov \ra, \sra 1342 .endif 1343 .ifnc \rb,\srb 1344 mov \rb, \srb 1345 .endif 1346 .endif 1347.endm 1348 1349/* This macro provides the list of registers representing the window, and the 1350 * cases where the register file is too small and a spill buffer is used 1351 * instead. 1352 * Since several specialisations of each function are generated, this also 1353 * culls superfluous iterations, and sets the variable `i` for subsequent 1354 * macros indicating the current index into the window. 1355 */ 1356.macro prefill_list, macro, nextmacro, max_r, step, label 1357 .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label 1358 .if windowsize >= (\line * 16) 1359 .set i, windowsize - (\line * 16) 1360\label\macro\line: 1361 prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step 1362 .endif 1363 .endm 1364 ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label 1365 ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label 1366 ifneeded \macro \nextmacro, 11, 10, xx, v17.16b, \step, \label 1367 ifneeded \macro \nextmacro, 10, 9, v18.16b, v19.16b, \step, \label 1368 ifneeded \macro \nextmacro, 9, 8, v20.16b, v21.16b, \step, \label 1369 ifneeded \macro \nextmacro, 8, 7, v22.16b, v23.16b, \step, \label 1370 ifneeded \macro \nextmacro, 7, 6, v24.16b, v25.16b, \step, \label 1371 ifneeded \macro \nextmacro, 6, 5, v26.16b, v27.16b, \step, \label 1372 ifneeded \macro \nextmacro, 5, 4, v28.16b, v29.16b, \step, \label 1373 ifneeded \macro \nextmacro, 4, 3, v30.16b, v31.16b, \step, \label 1374 ifneeded \macro \nextmacro, 3, 2, v4.16b, v5.16b, \step, \label 1375 ifneeded \macro \nextmacro, 2, 1, v6.16b, v7.16b, \step, \label 1376 ifneeded \macro \nextmacro, 1, 0, v8.16b, v9.16b, \step, \label 1377\label\macro\()0: 1378 b \label\()_end 1379 .purgem ifneeded 1380.endm 1381 1382/* These macros represent the possible stages of filling the window. 1383 * Each macro is unrolled enough times that it can fill the entire window 1384 * itself, but normally it will have to hand control to subsequent macros 1385 * part-way through and this is done using labels named \next and \after, where 1386 * \next is the next macro starting at the same window position and \after is 1387 * the next macro starting after the current window position. 1388 */ 1389 1390/* leftfill: v8 and v9 contain the left padding value. While the window 1391 * extends outside of the image on the left-hand side, and at least 16 more 1392 * padding values are needed in the window, store v8 and v9 into the window. 1393 * Otherwise skip forward to storing image data. 1394 */ 1395.macro prefill_leftfill, next, after, ra, rb, step 1396 cmp x10, #i+16 1397 blo \next 1398 prefill_out \ra, \rb, v8.16b, v9.16b 1399.endm 1400 1401/* leftedge: The very first non-fill or partial-fill chunk from the image is 1402 * already loaded (as it was used to calculate the left padding value), so 1403 * store it here, and then drop into the regular load/store cycle in the next 1404 * macro. 1405 */ 1406.macro prefill_leftedge, next, after, ra, rb, step 14071: prefill_out \ra, \rb, v10.16b, v11.16b 1408 b \after 1409.endm 1410 1411/* dofetch: Copy chunks of the image into the window without any complications 1412 * from edge conditions. 1413 */ 1414.macro prefill_dofetch, next, after, ra, rb, step 1415 cmp x11, #i+16 1416 bls \next 1417 bl fetch_generic_asm 1418 prefill_out \ra, \rb, v10.16b, v11.16b 1419.endm 1420 1421/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond 1422 * the right-hand edge of the image. In that case sweep the last valid pixel 1423 * across the rest of the chunk, and in either case prepare padding data in v12 1424 * and v13 for the next macro. This is done in fetch_clampright. 1425 * This only happens once before going on to the next macro. 1426 * Sometimes leftedge also covers the rightedge case, in which case this has 1427 * to be skipped altogether. 1428 */ 1429.macro prefill_rightedge, next, after, ra, rb, step 1430 cmp x11, #i 1431 bls \next 1432 bl fetch_clampright\step 1433 prefill_out \ra, \rb, v10.16b, v11.16b 1434 b \after 1435.endm 1436 1437/* rightfill: The rest of the window is simply filled with right padding from 1438 * v12 and v13. 1439 */ 1440.macro prefill_rightfill, next, after, ra, rb, step 1441 prefill_out \ra, \rb, v12.16b, v13.16b 1442.endm 1443 1444/* Here all of the macros above are unrolled and laid out in the proper order. 1445 */ 1446.macro prefill_body, max_r, step, label 1447 prefill_list leftfill, leftedge, \max_r, \step, \label 1448 prefill_list leftedge, dofetch, \max_r, \step, \label 1449 prefill_list dofetch, rightedge, \max_r, \step, \label 1450 prefill_list rightedge, rightfill, \max_r, \step, \label 1451 prefill_list rightfill, oops, \max_r, \step, \label 1452\label\()_end: 1453.endm 1454 1455 1456/* Fill the convolution window with context data. The aim here is to load 1457 * exactly 2*r columns, and in the main loop to read as many columns as will be 1458 * written. This is complicated by the window being divided into chunks at 1459 * register boundaries, and the need to handle cases when the input starts very 1460 * close to the left or right (or both) edges of the image and the need to fill 1461 * the spaces that leaves with left and right edge padding values. 1462 * 1463 * Input: 1464 * x1 -- src 1465 * x2 -- pitch 1466 * x3 -- count 1467 * x4 -- available image data right of src pointer 1468 * x5 -- r 1469 * x6 -- rup 1470 * x7 -- rdn 1471 * x8 -- available image data left of src pointer 1472 * x9 -- buffer (if needed) 1473 * x13 = -pitch 1474 * x15 = top-row in 1475 * x19 = bottom-row in 1476 * Output: 1477 * x4 -= min(inlen, count + windowsize - centertap) 1478 * x1 += min(inlen, count + windowsize - centertap) 1479 * x15 += min(inlen, count + windowsize - centertap) 1480 * x19 += min(inlen, count + windowsize - centertap) 1481 * Modifies: 1482 * x10 -- fill start index in the window 1483 * x11 -- fill stop index in the window 1484 * x12 -- scratch 1485 */ 1486.macro prefill step=1, max_r=25, label=xx 1487.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15) 1488.set centertap, (windowsize - \max_r * \step) 1489 mov x10, #centertap 1490 subs x10, x10, x8 1491 csel x10, xzr, x10, lo 1492 1493 subs x11, x4, #windowsize - centertap 1494 csel x11, xzr, x11, hs 1495 add x11, x11, #windowsize 1496 1497 /* x10 indicates where in the window legal image data begins. 1498 * x11 indicates where in the window legal image date ends. 1499 * When starting near the centre of a large image these would be 1500 * zero and windowsize respectively, but when starting near the 1501 * edges this can change. 1502 * When starting on the leftmost pixel, x10 will be centertap. 1503 * When starting on the rightmost pixel, x11 will be centertap+1. 1504 */ 1505 1506 /* x4 indicates how much data there is between the current pointers 1507 * and the right edge of the image. The pointers currently point 1508 * to the data needed at centertap. The subsequent code will 1509 * consume (windowsize - x10) data, but only the data from 1510 * centertap to windowsize comes out of x4's budget. 1511 */ 15121: subs x4, x4, #windowsize - centertap 1513 csel x4, xzr, x4, lo 1514 1515 /* And the pointers need to rewind to the start of the window. 1516 */ 1517 sub x1, x1, #centertap 1518 sub x15, x15, #centertap 1519 sub x19, x19, #centertap 1520 1521 /* Unless x8 indicated that there wasn't that much data available. 1522 */ 1523 add x1, x1, x10 1524 add x15, x15, x10 1525 add x19, x19, x10 1526 1527 /* Get the first chunk, and add padding to align it to the window 1528 * if necessary. 1529 */ 1530 bl fetch_clampleft\step 1531 1532 /* Sometimes the start and the end of the window are in the same 1533 * chunk. In that case both ends need filler at the outset. 1534 */ 1535 sub x12, x11, #1 1536 eor x12, x10, x12 1537 cmp x12, #16 1538 bhs 1f 1539 bl prefill_sweepright\step 1540 1541 /* Iterate through all the points in the window and fill them in 1542 * with padding or image data as needed. 1543 */ 15441: prefill_body \max_r, \step, \label 1545.endm 1546 1547/* The main body of the convolve functions. Having already pre-filled the 1548 * convolution window with 2*r input values, the logic settles into a regular 1549 * pattern of reading and writing at a 1:1 rate until either input or output 1550 * expires. The input leads the output by r values, so when processing all the 1551 * way to the right-hand edge, or within r pixels of that edge, the input will 1552 * run out first. In the case of very narrow images, or sub-windows starting 1553 * near the right edge, the input may already have run out while the 1554 * convolution window was being filled and this loop will start with a 1555 * zero-length input. 1556 * 1557 * Once the input runs out, the rest of the output must be processed by padding 1558 * the remainder of the window with pad value from the last valid pixel from 1559 * the source. 1560 * 1561 * Input: 1562 * x0 = dst 1563 * x1 = src 1564 * x2 = pitch 1565 * x3 = count 1566 * x4 = inlen 1567 * x5 = r 1568 * x6 = rup 1569 * x7 = rdn 1570 * x9 = buffer 1571 * x13 = -pitch 1572 * x15 = top-row in 1573 * x19 = bottom-row in 1574 * Modifies 1575 * x8 = fetch code pointer 1576 */ 1577.macro conv_body core, step=1, max_r=25, labelc="", labelnc="" 1578 1579 /* If x4 >= x3 then there's no need for clipping. The main loop 1580 * needs to exit when either x3 or x4 runs out, so clamp x4 to be 1581 * no greater than x3 and use x4 for the loop. 1582 * However, if x4 comes out of the loop with less than 16 bytes 1583 * left, a partial read would be necessary to avoid reading beyond 1584 * the end of the image. To avoid this, clamp x4 to the next 1585 * multiple of 16, which is still sufficient to force it out of the 1586 * loop but doesn't imply a rewind. 1587 */ 1588 add x12, x3, #15 1589 bic x12, x12, #15 1590 cmp x4, x12 1591 csel x4, x12, x4, hi 1592 1593 /* First calculate the entry-point into the internal fetch logic. 1594 * This is done so the same function can service several kernel 1595 * sizes. 1596 */ 1597 adrp x8, \labelnc 1598 add x8, x8, #:lo12:\labelnc 1599 sub x8, x8, x5, LSL #5 1600 sub x8, x8, x5, LSL #3 1601 cmp x5, x6 1602 ccmp x5, x7, #0, eq 1603 beq 5f 1604 1605 /* if (r != rup || r != rdn) then the address-clamping table should 1606 * be used rather than the short-cut version. 1607 */ 1608 adrp x8, \labelc 1609 add x8, x8, #:lo12:\labelc 1610 sub x8, x8, x5, LSL #6 1611 add x8, x8, x5, LSL #3 1612 b 5f 1613 1614 /* Main loop: ... */ 1615 .align 4 16163: /* first perform a vertical convolution from memory to get the next 1617 * 16 taps of the horizontal window into the register file... 1618 */ 1619 fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8 1620 1621 /* ...then perform a horizontal convolution on that window to 1622 * produce eight output bytes, and slide the window along. 1623 * This has to be done twice to match the 16-way vertical pass. 1624 * It would be preferable to have twice the work done in \core, but 1625 * that would demand yet another variant on those macros and would 1626 * perturb the register allocation severely. 1627 */ 1628 \core 1629 st1 {v15.8b}, [x0], #8 1630 \core 1631 st1 {v15.8b}, [x0], #8 1632 1633 sub x3, x3, #16 16345: subs x4, x4, #16 1635 bhi 3b 1636 /* Here there's 16 or fewer bytes available before the edge of the 1637 * source image. x4 holds that count minus 16 (because it was 1638 * decremented before the first iteration ran). The last read may 1639 * not be a whole chunk, and beyond that a fill value must be used. 1640 * 1641 * Of course, none of that matters if there's no more output to 1642 * produce... 1643 */ 1644 cbz x3, 5f 1645 1646 /* Oh well. */ 1647 adds x4, x4, #16 1648 bne 1f 1649 .if \step==1 1650 dup v10.8h, v9.h[7] 1651 dup v11.8h, v9.h[7] 1652 .else 1653 dup v10.2d, v9.d[1] 1654 dup v11.2d, v9.d[1] 1655 .endif 1656 b 3f 1657 1658 /* To avoid reading past end of input, rewind pointers by (16-x4) 1659 * to ensure that they're exactly 16 bytes from the edge. 1660 */ 16611: mov x11, x4 1662 bl fetch_clampright\step 1663 /* Now to put this padding to use, perform any remaining 1664 * iterations. This is done at half the rate of the main loop, 1665 * because there's no longer pressure from a 16-lane window filler. 1666 */ 16673: \core 1668 .if \step==1 1669 dup v11.8h, v11.h[7] 1670 .else 1671 dup v11.2d, v11.d[1] 1672 .endif 1673 subs x3, x3, #8 1674 blo 4f 1675 st1 {v15.8b}, [x0], #8 1676 bne 3b 1677 b 5f 1678 1679 /* If the final iteration contained 0 < l < 8 values, then perform 1680 * a piecewise store of the final vector. 1681 */ 16824: tbz x3, #2, 1f 1683 st1 {v15.s}[0], [x0], #4 1684 ext v15.8b, v15.8b, v15.8b, #4 16851: tbz x3, #1, 1f 1686 st1 {v15.h}[0], [x0], #2 1687 ext v15.8b, v15.8b, v15.8b, #2 16881: tbz x3, #0, 5f 1689 st1 {v15.b}[0], [x0], #1 1690 ext v15.8b, v15.8b, v15.8b, #1 16915: mov x0, #0 1692.endm 1693 1694 1695.irp r, TUNED_LIST1, 25 1696PRIVATE(convolve1_\r) 1697 stp x29,x30, [sp, #-16]! 1698 1699 prefill step=1, max_r=\r, label=.Lcnv1_\r 1700 1701 conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r 1702 1703 ldp x29,x30, [sp], #16 1704 ret 1705END(convolve1_\r) 1706.endr 1707 1708.irp r, TUNED_LIST4, 25 1709PRIVATE(convolve4_\r) 1710 sub x9, sp, #0x40 1711 stp x29,x30, [sp, #-(16 + 0x40 + 0x80)]! 1712 bic x9, x9, #0x7f 1713 1714 /* x9 now points to a 0x40 byte buffer on the stack whose address 1715 * has the low 7 bits clear. This allows easy address calculation 1716 * in the wrap-around cases. 1717 */ 1718 1719 prefill step=4, max_r=\r, label=.Lcnv4_\r 1720 1721 conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r 1722 1723 ldp x29,x30, [sp], #(16 + 0x40 + 0x80) 1724 ret 1725END(convolve4_\r) 1726.endr 1727 1728/* void rsdIntrinsicBlurU1_K( 1729 * void *out, // x0 1730 * void *in, // x1 1731 * size_t w, // x2 1732 * size_t h, // x3 1733 * size_t p, // x4 1734 * size_t x, // x5 1735 * size_t y, // x6 1736 * size_t count, // x7 1737 * size_t r, // [sp] 1738 * uint16_t *tab); // [sp,#8] 1739 */ 1740ENTRY(rsdIntrinsicBlurU1_K) 1741 stp x19,x30, [sp, #-16]! 1742 sub x8, sp, #32 1743 sub sp, sp, #64 1744 st1 {v8.1d - v11.1d}, [sp] 1745 st1 {v12.1d - v15.1d}, [x8] 1746 mov x8, x5 // x 1747 ldr w5, [sp,#80] // r 1748 sub x9, x2, x8 // w - x 1749 sub x10, x3, x6 // h - y 1750 mov x2, x4 // pitch 1751 mov x3, x7 // count 1752 sub x7, x10, #1 // h - y - 1 1753 mov x4, x9 // inlen = (w - x) 1754 1755 ldr x12, [sp, #88] // tab 1756 1757 add x1, x1, x8 // src += x 1758 1759 cmp x6, x5 1760 csel x6, x5, x6, hs // rup = min(r, y) 1761 cmp x7, x5 1762 csel x7, x5, x7, hs // rdn = min(r, h - y - 1) 1763 1764 sub x13, xzr, x2 // -pitch 1765 msub x15, x2, x6, x1 1766 madd x19, x2, x7, x1 1767 1768 ld1 {v0.8h,v1.8h}, [x12], #32 1769 ld1 {v2.8h,v3.8h}, [x12], #32 1770 1771 adr x30, 1f 1772 .irp r, TUNED_LIST1 1773 cmp x5, #\r 1774 bls convolve1_\r 1775 .endr 1776 b convolve1_25 1777 17781: ld1 {v8.1d - v11.1d}, [sp], #32 1779 ld1 {v12.1d - v15.1d}, [sp], #32 1780 ldp x19,x30, [sp], #16 1781 ret 1782END(rsdIntrinsicBlurU1_K) 1783 1784/* void rsdIntrinsicBlurU4_K( 1785 * void *out, // x0 1786 * void *in, // x1 1787 * size_t w, // x2 1788 * size_t h, // x3 1789 * size_t p, // x4 1790 * size_t x, // x5 1791 * size_t y, // x6 1792 * size_t count, // x7 1793 * size_t r, // [sp] 1794 * uint16_t *tab); // [sp,#8] 1795 */ 1796ENTRY(rsdIntrinsicBlurU4_K) 1797 stp x19,x30, [sp, #-16]! 1798 sub x8, sp, #32 1799 sub sp, sp, #64 1800 st1 {v8.1d - v11.1d}, [sp] 1801 st1 {v12.1d - v15.1d}, [x8] 1802 lsl x8, x5, #2 // x 1803 lsl x2, x2, #2 1804 ldr w5, [sp,#80] // r 1805 sub x9, x2, x8 // w - x 1806 sub x10, x3, x6 // h - y 1807 mov x2, x4 // pitch 1808 lsl x3, x7, #2 // count 1809 sub x7, x10, #1 // h - y - 1 1810 mov x4, x9 // inlen = (w - x) 1811 1812 ldr x12, [sp, #88] 1813 1814 add x1, x1, x8 // in += x 1815 1816 cmp x6, x5 1817 csel x6, x5, x6, hs // rup = min(r, y) 1818 cmp x7, x5 1819 csel x7, x5, x7, hs // rdn = min(r, h - y - 1) 1820 1821 1822 sub x13, xzr, x2 1823 msub x15, x2, x6, x1 1824 madd x19, x2, x7, x1 1825 1826 ld1 {v0.8h,v1.8h}, [x12], #32 1827 ld1 {v2.8h,v3.8h}, [x12], #32 1828 1829 adr x30, 1f 1830 .irp r, TUNED_LIST4 1831 cmp x5, #\r 1832 bls convolve4_\r 1833 .endr 1834 b convolve4_25 1835 18361: ld1 {v8.1d - v11.1d}, [sp], #32 1837 ld1 {v12.1d - v15.1d}, [sp], #32 1838 ldp x19,x30, [sp], #16 1839 ret 1840END(rsdIntrinsicBlurU4_K) 1841