@ This file was created from a .asm file
@  using the ads2gas.pl script.
    .equ DO1STROUNDING, 0
    .syntax unified
@
@  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@

    .global vpx_v_predictor_4x4_neon
    .type vpx_v_predictor_4x4_neon, function
    .global vpx_v_predictor_8x8_neon
    .type vpx_v_predictor_8x8_neon, function
    .global vpx_v_predictor_16x16_neon
    .type vpx_v_predictor_16x16_neon, function
    .global vpx_v_predictor_32x32_neon
    .type vpx_v_predictor_32x32_neon, function
    .global vpx_h_predictor_4x4_neon
    .type vpx_h_predictor_4x4_neon, function
    .global vpx_h_predictor_8x8_neon
    .type vpx_h_predictor_8x8_neon, function
    .global vpx_h_predictor_16x16_neon
    .type vpx_h_predictor_16x16_neon, function
    .global vpx_h_predictor_32x32_neon
    .type vpx_h_predictor_32x32_neon, function
    .global vpx_tm_predictor_4x4_neon
    .type vpx_tm_predictor_4x4_neon, function
    .global vpx_tm_predictor_8x8_neon
    .type vpx_tm_predictor_8x8_neon, function
    .global vpx_tm_predictor_16x16_neon
    .type vpx_tm_predictor_16x16_neon, function
    .global vpx_tm_predictor_32x32_neon
    .type vpx_tm_predictor_32x32_neon, function
    .arm
    .eabi_attribute 24, 1 @Tag_ABI_align_needed
    .eabi_attribute 25, 1 @Tag_ABI_align_preserved

.text
.p2align 2

@ Register roles shared by every routine in this file:
@   r0  uint8_t *dst          (advanced by y_stride after each row)
@   r1  ptrdiff_t y_stride
@   r2  const uint8_t *above
@   r3  const uint8_t *left
@ The repeated per-row instructions below are generated with .rept/.irp;
@ the assembler expands them into the same straight-line sequences that
@ were previously written out by hand.

@void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                              const uint8_t *above,
@                              const uint8_t *left)
@ Copies the 4 bytes at above[0..3] into each of 4 destination rows.

_vpx_v_predictor_4x4_neon:
vpx_v_predictor_4x4_neon: @ PROC
    vld1.32     {d0[0]}, [r2]           @ d0[31:0] = above[0..3]
    .rept 4                             @ one 4-byte row per store
    vst1.32     {d0[0]}, [r0], r1
    .endr
    bx          lr
    .size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon    @ ENDP  @ |vpx_v_predictor_4x4_neon|

@void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                              const uint8_t *above,
@                              const uint8_t *left)
@ Copies the 8 bytes at above[0..7] into each of 8 destination rows.

_vpx_v_predictor_8x8_neon:
vpx_v_predictor_8x8_neon: @ PROC
    vld1.8      {d0}, [r2]              @ d0 = above[0..7]
    .rept 8                             @ one 8-byte row per store
    vst1.8      {d0}, [r0], r1
    .endr
    bx          lr
    .size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon    @ ENDP  @ |vpx_v_predictor_8x8_neon|

@void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ Copies the 16 bytes at above[0..15] into each of 16 destination rows.

_vpx_v_predictor_16x16_neon:
vpx_v_predictor_16x16_neon: @ PROC
    vld1.8      {q0}, [r2]              @ q0 = above[0..15]
    .rept 16                            @ one 16-byte row per store
    vst1.8      {q0}, [r0], r1
    .endr
    bx          lr
    .size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon    @ ENDP  @ |vpx_v_predictor_16x16_neon|

@void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ Copies the 32 bytes at above[0..31] into each of 32 destination rows.
@ r2 is reused as the pass counter once the above row has been loaded.

_vpx_v_predictor_32x32_neon:
vpx_v_predictor_32x32_neon: @ PROC
    vld1.8      {q0, q1}, [r2]          @ q0:q1 = above[0..31]
    mov         r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lv32_pass:
    .rept 16                            @ one 32-byte row per store
    vst1.8      {q0, q1}, [r0], r1
    .endr
    subs        r2, r2, #1
    bgt         .Lv32_pass
    bx          lr
    .size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon    @ ENDP  @ |vpx_v_predictor_32x32_neon|

@void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
@                              const uint8_t *above,
@                              const uint8_t *left)
@ Row i of the 4x4 block is filled with the byte left[i].

_vpx_h_predictor_4x4_neon:
vpx_h_predictor_4x4_neon: @ PROC
    vld1.32     {d1[0]}, [r3]           @ d1[31:0] = left[0..3]
    .irp lane, 0, 1, 2, 3
    vdup.8      d0, d1[\lane]           @ broadcast one left pixel
    vst1.32     {d0[0]}, [r0], r1       @ write 4 copies as one row
    .endr
    bx          lr
    .size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon    @ ENDP  @ |vpx_h_predictor_4x4_neon|

@void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
@                              const uint8_t *above,
@                              const uint8_t *left)
@ Row i of the 8x8 block is filled with the byte left[i].

_vpx_h_predictor_8x8_neon:
vpx_h_predictor_8x8_neon: @ PROC
    vld1.64     {d1}, [r3]              @ d1 = left[0..7]
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8      d0, d1[\lane]           @ broadcast one left pixel
    vst1.64     {d0}, [r0], r1          @ write 8 copies as one row
    .endr
    bx          lr
    .size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon    @ ENDP  @ |vpx_h_predictor_8x8_neon|

@void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ Row i of the 16x16 block is filled with the byte left[i].

_vpx_h_predictor_16x16_neon:
vpx_h_predictor_16x16_neon: @ PROC
    vld1.8      {q1}, [r3]              @ d2 = left[0..7], d3 = left[8..15]
    @ rows 0..7 take their pixel from d2
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8      q0, d2[\lane]
    vst1.8      {q0}, [r0], r1
    .endr
    @ rows 8..15 take their pixel from d3
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8      q0, d3[\lane]
    vst1.8      {q0}, [r0], r1
    .endr
    bx          lr
    .size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon    @ ENDP  @ |vpx_h_predictor_16x16_neon|

@void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ Row i of the 32x32 block is filled with the byte left[i].
@ Each row is written as two 16-byte stores: the first advances dst by 16
@ through writeback, the second advances it by (y_stride - 16), so the pair
@ moves dst forward by exactly one row.
@ r2 is reused as the pass counter (above is not read here).

_vpx_h_predictor_32x32_neon:
vpx_h_predictor_32x32_neon: @ PROC
    sub         r1, r1, #16             @ r1 = y_stride - 16 (see above)
    mov         r2, #2                  @ 2 passes x 16 rows = 32 rows
.Lh32_pass:
    vld1.8      {q1}, [r3]!             @ next 16 left pixels -> d2, d3
    @ 8 rows driven by d2
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8      q0, d2[\lane]
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0], r1
    .endr
    @ 8 rows driven by d3
    .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vdup.8      q0, d3[\lane]
    vst1.8      {q0}, [r0]!
    vst1.8      {q0}, [r0], r1
    .endr
    subs        r2, r2, #1
    bgt         .Lh32_pass
    bx          lr
    .size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon    @ ENDP  @ |vpx_h_predictor_32x32_neon|

@void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ For each row i and column c the stored byte is
@   saturate_u8(left[i] + above[c] - above[-1]).
@ The difference (above[c] - above[-1]) is computed once in 16-bit lanes
@ (q3); each row then adds its widened left pixel and narrows with
@ unsigned saturation.  r12 is used as a scratch pointer.

_vpx_tm_predictor_4x4_neon:
vpx_tm_predictor_4x4_neon: @ PROC
    @ d0 = above[-1] replicated to every byte lane
    sub         r12, r2, #1
    vld1.u8     {d0[]}, [r12]

    @ d2[31:0] = above[0..3]
    vld1.32     {d2[0]}, [r2]

    @ q3 = above - top_left, widened to 16 bits
    vsubl.u8    q3, d2, d0

    @ Rows 0 and 1: broadcast-load one left pixel each, widen, add, narrow
    vld1.u8     {d2[]}, [r3]!
    vld1.u8     {d4[]}, [r3]!
    vmovl.u8    q1, d2
    vmovl.u8    q2, d4
    vadd.s16    q1, q1, q3
    vadd.s16    q2, q2, q3
    vqmovun.s16 d0, q1
    vqmovun.s16 d1, q2
    vst1.32     {d0[0]}, [r0], r1
    vst1.32     {d1[0]}, [r0], r1

    @ Rows 2 and 3 (the final left load needs no pointer writeback)
    vld1.u8     {d2[]}, [r3]!
    vld1.u8     {d4[]}, [r3]
    vmovl.u8    q1, d2
    vmovl.u8    q2, d4
    vadd.s16    q1, q1, q3
    vadd.s16    q2, q2, q3
    vqmovun.s16 d0, q1
    vqmovun.s16 d1, q2
    vst1.32     {d0[0]}, [r0], r1
    vst1.32     {d1[0]}, [r0], r1
    bx          lr
    .size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon    @ ENDP  @ |vpx_tm_predictor_4x4_neon|

@void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
@                                const uint8_t *above,
@                                const uint8_t *left)
@ For each row i and column c the stored byte is
@   saturate_u8(left[i] + above[c] - above[-1]).
@ All 8 left pixels are loaded once and widened into q10 (d20 = rows 0..3,
@ d21 = rows 4..7); the block is then produced four rows at a time.

_vpx_tm_predictor_8x8_neon:
vpx_tm_predictor_8x8_neon: @ PROC
    @ d0 = above[-1] replicated to every byte lane
    sub         r12, r2, #1
    vld1.8      {d0[]}, [r12]

    @ d30 = left[0..7]
    vld1.8      {d30}, [r3]

    @ d2 = above[0..7]
    vld1.64     {d2}, [r2]

    @ q10 = left widened to 16 bits
    vmovl.u8    q10, d30

    @ q3 = above - top_left, widened to 16 bits
    vsubl.u8    q3, d2, d0

    @ Four rows per expansion: first from d20, then from d21
    .irp lreg, d20, d21
    vdup.16     q0, \lreg[0]
    vdup.16     q1, \lreg[1]
    vadd.s16    q0, q3, q0
    vadd.s16    q1, q3, q1

    vdup.16     q8, \lreg[2]
    vdup.16     q9, \lreg[3]
    vadd.s16    q8, q3, q8
    vadd.s16    q9, q3, q9

    vqmovun.s16 d0, q0
    vqmovun.s16 d1, q1
    vqmovun.s16 d2, q8
    vqmovun.s16 d3, q9

    vst1.64     {d0}, [r0], r1
    vst1.64     {d1}, [r0], r1
    vst1.64     {d2}, [r0], r1
    vst1.64     {d3}, [r0], r1
    .endr

    bx          lr
    .size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon    @ ENDP  @ |vpx_tm_predictor_8x8_neon|

@void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@ r0  uint8_t *dst
@ r1  ptrdiff_t y_stride
@ r2  const uint8_t *above   (reused as the pass counter after the load)
@ r3  const uint8_t *left
@
@ For each row i and column c the stored byte is
@   saturate_u8(left[i] + above[c] - above[-1]).
@ q2/q3 hold (above - top_left) for columns 0..7 / 8..15 as 16-bit lanes.
@ The 8 left pixels needed by a pass are loaded at the top of that pass, so
@ exactly left[0..15] is read.  (Loading the next chunk at the bottom of the
@ pass would read 8 bytes beyond left[15] on the final pass.)
@ Scratch: r12, q0-q3, q8-q11.

_vpx_tm_predictor_16x16_neon:
vpx_tm_predictor_16x16_neon: @ PROC
    @ Load ytop_left = above[-1];
    sub         r12, r2, #1
    vld1.8      {d0[]}, [r12]

    @ Load above 16 pixels
    vld1.8      {q1}, [r2]

    @ Compute above - ytop_left
    vsubl.u8    q2, d2, d0
    vsubl.u8    q3, d3, d0

    @ Compute left + (above - ytop_left) row by row.
    @ Process 8 rows in each pass and loop 2 times to process 16 rows.
    mov         r2, #2

loop_16x16_neon:
    @ Load and widen the 8 left pixels for this pass (q10 = d20:d21)
    vld1.8      {d18}, [r3]!
    vmovl.u8    q10, d18

    @ Process two rows.
    vdup.16     q0, d20[0]
    vdup.16     q8, d20[1]
    vadd.s16    q1, q0, q2
    vadd.s16    q0, q0, q3
    vadd.s16    q11, q8, q2
    vadd.s16    q8, q8, q3
    vqmovun.s16 d2, q1
    vqmovun.s16 d3, q0
    vqmovun.s16 d22, q11
    vqmovun.s16 d23, q8
    vdup.16     q0, d20[2]              @ preload next 2 rows data
    vdup.16     q8, d20[3]
    vst1.64     {d2,d3}, [r0], r1
    vst1.64     {d22,d23}, [r0], r1

    @ Process two rows.
    vadd.s16    q1, q0, q2
    vadd.s16    q0, q0, q3
    vadd.s16    q11, q8, q2
    vadd.s16    q8, q8, q3
    vqmovun.s16 d2, q1
    vqmovun.s16 d3, q0
    vqmovun.s16 d22, q11
    vqmovun.s16 d23, q8
    vdup.16     q0, d21[0]              @ preload next 2 rows data
    vdup.16     q8, d21[1]
    vst1.64     {d2,d3}, [r0], r1
    vst1.64     {d22,d23}, [r0], r1

    @ Process two rows.
    vadd.s16    q1, q0, q2
    vadd.s16    q0, q0, q3
    vadd.s16    q11, q8, q2
    vadd.s16    q8, q8, q3
    vqmovun.s16 d2, q1
    vqmovun.s16 d3, q0
    vqmovun.s16 d22, q11
    vqmovun.s16 d23, q8
    vdup.16     q0, d21[2]              @ preload next 2 rows data
    vdup.16     q8, d21[3]
    vst1.64     {d2,d3}, [r0], r1
    vst1.64     {d22,d23}, [r0], r1

    @ Process the last two rows of this pass.
    vadd.s16    q1, q0, q2
    vadd.s16    q0, q0, q3
    vadd.s16    q11, q8, q2
    vadd.s16    q8, q8, q3
    vqmovun.s16 d2, q1
    vqmovun.s16 d3, q0
    vqmovun.s16 d22, q11
    vqmovun.s16 d23, q8
    vst1.64     {d2,d3}, [r0], r1
    vst1.64     {d22,d23}, [r0], r1

    subs        r2, r2, #1
    bgt         loop_16x16_neon

    bx          lr
    .size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon    @ ENDP  @ |vpx_tm_predictor_16x16_neon|

@void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
@                                  const uint8_t *above,
@                                  const uint8_t *left)
@ r0  uint8_t *dst
@ r1  ptrdiff_t y_stride
@ r2  const uint8_t *above   (reused as the pass counter after the loads)
@ r3  const uint8_t *left
@
@ For each row i and column c the stored byte is
@   saturate_u8(left[i] + above[c] - above[-1]).
@ q8..q11 hold (above - top_left) for columns 0..7, 8..15, 16..23, 24..31
@ as 16-bit lanes.  The 8 left pixels needed by a pass are loaded at the
@ top of that pass, so exactly left[0..31] is read.  (Loading the next
@ chunk at the bottom of the pass would read 8 bytes beyond left[31] on the
@ final pass.)
@ Scratch: r12, q0-q3, q8-q15.

_vpx_tm_predictor_32x32_neon:
vpx_tm_predictor_32x32_neon: @ PROC
    @ Load ytop_left = above[-1];
    sub         r12, r2, #1
    vld1.8      {d0[]}, [r12]

    @ Load above 32 pixels
    vld1.8      {q1}, [r2]!
    vld1.8      {q2}, [r2]

    @ Compute above - ytop_left
    vsubl.u8    q8, d2, d0
    vsubl.u8    q9, d3, d0
    vsubl.u8    q10, d4, d0
    vsubl.u8    q11, d5, d0

    @ Compute left + (above - ytop_left) row by row.
    @ Process 8 rows in each pass and loop 4 times to process 32 rows.
    mov         r2, #4

loop_32x32_neon:
    @ Load and widen the 8 left pixels for this pass (q3 = d6:d7).
    @ d0 is free here: q0 is rewritten by the vdup that follows.
    vld1.8      {d0}, [r3]!
    vmovl.u8    q3, d0

    @ Process two rows.
    vdup.16     q0, d6[0]
    vdup.16     q2, d6[1]
    vadd.s16    q12, q0, q8
    vadd.s16    q13, q0, q9
    vadd.s16    q14, q0, q10
    vadd.s16    q15, q0, q11
    vqmovun.s16 d0, q12
    vqmovun.s16 d1, q13
    vadd.s16    q12, q2, q8
    vadd.s16    q13, q2, q9
    vqmovun.s16 d2, q14
    vqmovun.s16 d3, q15
    vadd.s16    q14, q2, q10
    vadd.s16    q15, q2, q11
    vst1.64     {d0-d3}, [r0], r1
    vqmovun.s16 d24, q12
    vqmovun.s16 d25, q13
    vqmovun.s16 d26, q14
    vqmovun.s16 d27, q15
    vdup.16     q1, d6[2]               @ preload next 2 rows data
    vdup.16     q2, d6[3]
    vst1.64     {d24-d27}, [r0], r1

    @ Process two rows.
    vadd.s16    q12, q1, q8
    vadd.s16    q13, q1, q9
    vadd.s16    q14, q1, q10
    vadd.s16    q15, q1, q11
    vqmovun.s16 d0, q12
    vqmovun.s16 d1, q13
    vadd.s16    q12, q2, q8
    vadd.s16    q13, q2, q9
    vqmovun.s16 d2, q14
    vqmovun.s16 d3, q15
    vadd.s16    q14, q2, q10
    vadd.s16    q15, q2, q11
    vst1.64     {d0-d3}, [r0], r1
    vqmovun.s16 d24, q12
    vqmovun.s16 d25, q13
    vqmovun.s16 d26, q14
    vqmovun.s16 d27, q15
    vdup.16     q0, d7[0]               @ preload next 2 rows data
    vdup.16     q2, d7[1]
    vst1.64     {d24-d27}, [r0], r1

    @ Process two rows.
    vadd.s16    q12, q0, q8
    vadd.s16    q13, q0, q9
    vadd.s16    q14, q0, q10
    vadd.s16    q15, q0, q11
    vqmovun.s16 d0, q12
    vqmovun.s16 d1, q13
    vadd.s16    q12, q2, q8
    vadd.s16    q13, q2, q9
    vqmovun.s16 d2, q14
    vqmovun.s16 d3, q15
    vadd.s16    q14, q2, q10
    vadd.s16    q15, q2, q11
    vst1.64     {d0-d3}, [r0], r1
    vqmovun.s16 d24, q12
    vqmovun.s16 d25, q13
    vqmovun.s16 d26, q14
    vqmovun.s16 d27, q15
    vdup.16     q0, d7[2]               @ preload next 2 rows data
    vdup.16     q2, d7[3]
    vst1.64     {d24-d27}, [r0], r1

    @ Process the last two rows of this pass.
    vadd.s16    q12, q0, q8
    vadd.s16    q13, q0, q9
    vadd.s16    q14, q0, q10
    vadd.s16    q15, q0, q11
    vqmovun.s16 d0, q12
    vqmovun.s16 d1, q13
    vadd.s16    q12, q2, q8
    vadd.s16    q13, q2, q9
    vqmovun.s16 d2, q14
    vqmovun.s16 d3, q15
    vadd.s16    q14, q2, q10
    vadd.s16    q15, q2, q11
    vst1.64     {d0-d3}, [r0], r1
    vqmovun.s16 d24, q12
    vqmovun.s16 d25, q13
    vqmovun.s16 d26, q14
    vqmovun.s16 d27, q15
    vst1.64     {d24-d27}, [r0], r1

    subs        r2, r2, #1
    bgt         loop_32x32_neon

    bx          lr
    .size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon    @ ENDP  @ |vpx_tm_predictor_32x32_neon|

    .section .note.GNU-stack,"",%progbits