//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

//-----------------------------------------------------------------------------
// File Name          : impeg2_inter_pred.s
//
// Description        : Motion compensation related interpolation functions.
//                      (The original header names "Neon + CortexA-8"; the
//                      instructions in this file are A64 Advanced SIMD.)
//
// Syntax             : GNU assembler, AArch64. "//" starts a comment.
//
// Revision History   :
//      Date            Author              Detail Description
//      ------------    ----------------    ----------------------------------
//      18 jun 2010     S Hamsalekha        Created
//
// Conventions used by every function in this file:
//   * push_v_regs / pop_v_regs are macros defined in impeg2_neon_macros.s,
//     which is not shown here. They are expanded as the first and last
//     thing each function does; presumably they save/restore the SIMD
//     registers the callee must preserve (v8-v15) - confirm in that file.
//     The per-function stack usage is whatever those macros allocate (the
//     original headers stated 64 bytes).
//   * Only x0-x7, x12, x14 and SIMD registers are written; no function
//     makes a call.
//   * NOTE(review): width/stride arguments are consumed as full 64-bit
//     post-index offsets (xN). If the C prototypes declare them as 32-bit
//     integers, AAPCS64 leaves bits 63:32 of the register unspecified and
//     an explicit extension would be required - confirm against the
//     prototypes before relying on this.
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Include Files
//-----------------------------------------------------------------------------
.text
.include "impeg2_neon_macros.s"

//-----------------------------------------------------------------------------
// Exported functions
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Function Name      : impeg2_copy_mb_av8()
//
// Detail Description : Copies one macroblock worth of data from src to dst:
//                      16 rows x 16 bytes from the first plane, then
//                      8 rows x 8 bytes from each of the two following
//                      planes, using half the given widths as row pitch
//                      for those two planes.
//
// Inputs             : x0 - pointer to src descriptor: three 64-bit plane
//                           pointers at byte offsets 0 (y), 8 (u), 16 (v)
//                      x1 - pointer to dst descriptor, same layout
//                      x2 - source width (row pitch of the y plane, bytes)
//                      x3 - destination width (row pitch of the y plane)
//
// Registers Used     : x2-x5 (x2/x3 are halved in place), v0, v1
//
// Outputs            : dst planes are overwritten
//
// Return Data        : None
//-----------------------------------------------------------------------------

.global impeg2_copy_mb_av8

impeg2_copy_mb_av8:
    // NOTE(review): only v0/v1 are touched below, so the save/restore pair
    // looks unnecessary here; kept because the macro body is not visible.
    push_v_regs

    // ---- y plane: 16 rows of 16 bytes ----
    ldr     x4, [x0]                        // x4 = src->y
    ldr     x5, [x1]                        // x5 = dst->y
    .rept   16
    ld1     {v0.8b, v1.8b}, [x4], x2        // load one row, src += src pitch
    st1     {v0.8b, v1.8b}, [x5], x3        // store one row, dst += dst pitch
    .endr

    // ---- chroma planes use half the luma pitch ----
    lsr     x2, x2, #1                      // src pitch /= 2
    lsr     x3, x3, #1                      // dst pitch /= 2

    // ---- u plane: 8 rows of 8 bytes ----
    ldr     x4, [x0, #8]                    // x4 = src->u
    ldr     x5, [x1, #8]                    // x5 = dst->u
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    // ---- v plane: 8 rows of 8 bytes ----
    ldr     x4, [x0, #16]                   // x4 = src->v
    ldr     x5, [x1, #16]                   // x5 = dst->v
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    pop_v_regs
    ret


//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
//
// Detail Description : Vertical half-pel prediction of an 8x8 block.
//                      out[r][c] = (ref[r][c] + ref[r+1][c] + 1) >> 1
//                      for r = 0..7, c = 0..7. Nine reference rows of
//                      8 bytes are read.
//
// Inputs             : x0 - out     : Current Block Pointer
//                      x1 - ref     : Reference Block Pointer
//                      x2 - ref_wid : Reference Block Width (row pitch)
//                      x3 - out_wid : Current Block Width (row pitch)
//
// Registers Used     : x0-x3 (all modified), x14, v0-v9
//
// Outputs            : The Motion Compensated Block
//
// Return Data        : None
//
// Programming Note   : Rows are numbered 1..9 in the comments below.
//                      Two pointers walk the odd and even rows separately,
//                      each stepping two rows at a time, so loads/stores can
//                      be interleaved with the arithmetic.
//-----------------------------------------------------------------------------

.global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:
    push_v_regs

    add     x14, x1, x2                     // x14 -> ref row 2 (even rows); x1 -> row 1 (odd rows)
    lsl     x2, x2, #1                      // both pointers advance two rows per load

    // Load 9 reference rows; averaging starts as soon as operands are ready.
    // All operands are 64-bit loads, so the 8-byte form of urhadd is used.
    ld1     {v0.8b}, [x1], x2               // v0 = ref row 1
    ld1     {v2.8b}, [x14], x2              // v2 = ref row 2
    ld1     {v4.8b}, [x1], x2               // v4 = ref row 3
    ld1     {v6.8b}, [x14], x2              // v6 = ref row 4
    ld1     {v1.8b}, [x1], x2               // v1 = ref row 5
    ld1     {v3.8b}, [x14], x2              // v3 = ref row 6
    urhadd  v9.8b, v1.8b, v6.8b             // v9 = out row 4 = avg(ref 5, ref 4)
    ld1     {v5.8b}, [x1], x2               // v5 = ref row 7
    urhadd  v0.8b, v0.8b, v2.8b             // v0 = out row 1 = avg(ref 1, ref 2)
    urhadd  v1.8b, v1.8b, v3.8b             // v1 = out row 5 = avg(ref 5, ref 6)
    ld1     {v7.8b}, [x14], x2              // v7 = ref row 8
    urhadd  v2.8b, v2.8b, v4.8b             // v2 = out row 2 = avg(ref 2, ref 3)
    urhadd  v3.8b, v3.8b, v5.8b             // v3 = out row 6 = avg(ref 6, ref 7)
    ld1     {v8.8b}, [x1], x2               // v8 = ref row 9
    urhadd  v4.8b, v4.8b, v6.8b             // v4 = out row 3 = avg(ref 3, ref 4)
    urhadd  v5.8b, v5.8b, v7.8b             // v5 = out row 7 = avg(ref 7, ref 8)

    add     x14, x0, x3                     // x14 -> out row 2 (even rows); x0 -> row 1 (odd rows)
    lsl     x3, x3, #1                      // both pointers advance two rows per store

    // Store the eight output rows.
    st1     {v2.8b}, [x14], x3              // out row 2
    urhadd  v7.8b, v7.8b, v8.8b             // v7 = out row 8 = avg(ref 8, ref 9)
    st1     {v0.8b}, [x0], x3               // out row 1
    st1     {v9.8b}, [x14], x3              // out row 4
    st1     {v4.8b}, [x0], x3               // out row 3
    st1     {v3.8b}, [x14], x3              // out row 6
    st1     {v1.8b}, [x0], x3               // out row 5
    st1     {v7.8b}, [x14], x3              // out row 8
    st1     {v5.8b}, [x0], x3               // out row 7

    pop_v_regs
    ret


//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
//
// Detail Description : Horizontal half-pel prediction of an 8x8 block.
//                      out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
//                      for r = 0..7, c = 0..7.
//
// Inputs             : x0 - out     : Current Block Pointer
//                      x1 - ref     : Reference Block Pointer
//                      x2 - ref_wid : Reference Block Width (row pitch)
//                      x3 - out_wid : Current Block Width (row pitch)
//
// Registers Used     : x0, x1 (modified), x12, x14,
//                      v0-v10, v12-v14, v16-v18, v20-v22
//
// Outputs            : The Motion Compensated Block
//
// Return Data        : None
//
// Programming Note   : Each reference row is fetched with a 16-byte load
//                      although only bytes 0..8 contribute to the result.
//                      Rows 1-4 and rows 5-8 are processed through two
//                      independent pointer pairs (x1/x0 and x14/x12).
//-----------------------------------------------------------------------------

.global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:
    push_v_regs

    add     x14, x1, x2, lsl #2             // x14 -> ref row 5
    add     x12, x0, x3, lsl #2             // x12 -> out row 5

    ld1     {v0.8b, v1.8b}, [x1], x2        // row 1: v0 = px 0..7, v1 = px 8..15
    ld1     {v2.8b, v3.8b}, [x14], x2       // row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // row 6

    // ext #1 yields pixels 1..8 of the row (the block shifted left by one).
    ext     v8.8b, v0.8b, v1.8b, #1         // row 1 shifted
    ext     v12.8b, v2.8b, v3.8b, #1        // row 5 shifted
    ext     v16.8b, v4.8b, v5.8b, #1        // row 2 shifted
    ext     v20.8b, v6.8b, v7.8b, #1        // row 6 shifted

    ld1     {v9.8b, v10.8b}, [x1], x2       // row 3
    ld1     {v13.8b, v14.8b}, [x14], x2     // row 7
    ld1     {v17.8b, v18.8b}, [x1], x2      // row 4
    ld1     {v21.8b, v22.8b}, [x14], x2     // row 8

    // v1/v3/v5/v7 are free again (their px 8..15 halves are consumed above),
    // so they are reused for the shifted copies of rows 3/7/4/8.
    ext     v1.8b, v9.8b, v10.8b, #1        // row 3 shifted
    ext     v3.8b, v13.8b, v14.8b, #1       // row 7 shifted
    ext     v5.8b, v17.8b, v18.8b, #1       // row 4 shifted
    ext     v7.8b, v21.8b, v22.8b, #1       // row 8 shifted

    // Rounding average of each row with its shifted copy. Every operand was
    // produced by a 64-bit load or ext, so only 8 lanes carry data.
    urhadd  v0.8b, v0.8b, v8.8b             // out row 1
    urhadd  v1.8b, v1.8b, v9.8b             // out row 3
    urhadd  v2.8b, v2.8b, v12.8b            // out row 5
    urhadd  v3.8b, v3.8b, v13.8b            // out row 7
    urhadd  v4.8b, v4.8b, v16.8b            // out row 2
    urhadd  v5.8b, v5.8b, v17.8b            // out row 4
    urhadd  v6.8b, v6.8b, v20.8b            // out row 6
    urhadd  v7.8b, v7.8b, v21.8b            // out row 8

    st1     {v0.8b}, [x0], x3               // store row 1
    st1     {v2.8b}, [x12], x3              // store row 5
    st1     {v4.8b}, [x0], x3               // store row 2
    st1     {v6.8b}, [x12], x3              // store row 6
    st1     {v1.8b}, [x0], x3               // store row 3
    st1     {v3.8b}, [x12], x3              // store row 7
    st1     {v5.8b}, [x0], x3               // store row 4
    st1     {v7.8b}, [x12], x3              // store row 8

    pop_v_regs
    ret


//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
//
// Detail Description : Horizontal + vertical half-pel prediction of an 8x8
//                      block.
//                      out[r][c] = (ref[r][c]   + ref[r][c+1] +
//                                   ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
//                      for r = 0..7, c = 0..7. Nine reference rows are read.
//
// Inputs             : x0 - out     : Current Block Pointer
//                      x1 - ref     : Reference Block Pointer
//                      x2 - ref_wid : Reference Block Width (row pitch)
//                      x3 - out_wid : Current Block Width (row pitch)
//
// Registers Used     : x0, x1 (modified), x14,
//                      v0-v18, v20, v22, v24, v26, v28, v30
//
// Outputs            : The Motion Compensated Block
//
// Return Data        : None
//
// Programming Note   : Each reference row is fetched with a 16-byte load
//                      although only bytes 0..8 contribute to the result.
//                      Horizontal pair sums are widened to 16 bits so the
//                      four-tap sum cannot overflow before the rounding
//                      narrow (rshrn #2).
//-----------------------------------------------------------------------------

.global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:
    push_v_regs

    add     x14, x1, x2, lsl #2             // x14 -> ref row 5

    ld1     {v0.8b, v1.8b}, [x1], x2        // row 1: v0 = px 0..7, v1 = px 8..15
    ld1     {v2.8b, v3.8b}, [x14], x2       // row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // row 6

    // The odd register of each pair is overwritten with px 1..8 of its row.
    ext     v1.8b, v0.8b, v1.8b, #1         // row 1 shifted
    ext     v3.8b, v2.8b, v3.8b, #1         // row 5 shifted
    ext     v5.8b, v4.8b, v5.8b, #1         // row 2 shifted
    ext     v7.8b, v6.8b, v7.8b, #1         // row 6 shifted

    ld1     {v8.8b, v9.8b}, [x1], x2        // row 3
    ld1     {v10.8b, v11.8b}, [x14], x2     // row 7
    ld1     {v12.8b, v13.8b}, [x1], x2      // row 4
    ld1     {v14.8b, v15.8b}, [x14], x2     // row 8

    ext     v9.8b, v8.8b, v9.8b, #1         // row 3 shifted

    ld1     {v16.8b, v17.8b}, [x14], x2     // row 9

    ext     v11.8b, v10.8b, v11.8b, #1      // row 7 shifted
    ext     v13.8b, v12.8b, v13.8b, #1      // row 4 shifted
    ext     v15.8b, v14.8b, v15.8b, #1      // row 8 shifted
    ext     v17.8b, v16.8b, v17.8b, #1      // row 9 shifted

    // Interpolation in x direction: widened sum px[c] + px[c+1] per row.
    uaddl   v0.8h, v0.8b, v1.8b             // row 1
    uaddl   v2.8h, v2.8b, v3.8b             // row 5
    uaddl   v4.8h, v4.8b, v5.8b             // row 2
    uaddl   v6.8h, v6.8b, v7.8b             // row 6
    uaddl   v8.8h, v8.8b, v9.8b             // row 3
    uaddl   v10.8h, v10.8b, v11.8b          // row 7
    uaddl   v12.8h, v12.8b, v13.8b          // row 4
    uaddl   v14.8h, v14.8b, v15.8b          // row 8
    uaddl   v16.8h, v16.8b, v17.8b          // row 9

    // Interpolation in y direction: add adjacent row sums, then (x + 2) >> 2.
    add     x14, x0, x3, lsl #2             // x14 -> out row 5

    add     v18.8h, v0.8h, v4.8h            // rows 1 + 2
    add     v26.8h, v2.8h, v6.8h            // rows 5 + 6
    add     v20.8h, v4.8h, v8.8h            // rows 2 + 3
    add     v28.8h, v6.8h, v10.8h           // rows 6 + 7

    rshrn   v18.8b, v18.8h, #2              // out row 1
    rshrn   v26.8b, v26.8h, #2              // out row 5
    rshrn   v20.8b, v20.8h, #2              // out row 2
    rshrn   v28.8b, v28.8h, #2              // out row 6

    // Stores of the first four results are interleaved with the remaining sums.
    add     v22.8h, v8.8h, v12.8h           // rows 3 + 4
    st1     {v18.8b}, [x0], x3              // store row 1
    add     v30.8h, v10.8h, v14.8h          // rows 7 + 8
    st1     {v26.8b}, [x14], x3             // store row 5
    add     v24.8h, v12.8h, v2.8h           // rows 4 + 5
    st1     {v20.8b}, [x0], x3              // store row 2
    add     v14.8h, v14.8h, v16.8h          // rows 8 + 9
    st1     {v28.8b}, [x14], x3             // store row 6

    rshrn   v22.8b, v22.8h, #2              // out row 3
    rshrn   v30.8b, v30.8h, #2              // out row 7
    rshrn   v24.8b, v24.8h, #2              // out row 4
    rshrn   v14.8b, v14.8h, #2              // out row 8

    st1     {v22.8b}, [x0], x3              // store row 3
    st1     {v30.8b}, [x14], x3             // store row 7
    st1     {v24.8b}, [x0], x3              // store row 4
    st1     {v14.8b}, [x14], x3             // store row 8

    pop_v_regs
    ret


//-----------------------------------------------------------------------------
// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
//
// Detail Description : Full-pel prediction: copies an 8x8 block of bytes
//                      from ref to out without any interpolation.
//
// Inputs             : x0 - out     : Current Block Pointer
//                      x1 - ref     : Reference Block Pointer
//                      x2 - ref_wid : Reference Block Width (row pitch)
//                      x3 - out_wid : Current Block Width (row pitch)
//
// Registers Used     : x0, x1 (modified), x12, x14, v0-v3
//
// Outputs            : The Motion Compensated Block
//
// Return Data        : None
//
// Programming Note   : Rows 1-4 and rows 5-8 are copied through two
//                      independent pointer pairs (x1/x0 and x14/x12).
//-----------------------------------------------------------------------------

.global impeg2_mc_fullx_fully_8x8_av8

impeg2_mc_fullx_fully_8x8_av8:
    // NOTE(review): only v0-v3 are touched below, so the save/restore pair
    // looks unnecessary here; kept because the macro body is not visible.
    push_v_regs

    add     x14, x1, x2, lsl #2             // x14 -> ref row 5
    add     x12, x0, x3, lsl #2             // x12 -> out row 5

    ld1     {v0.8b}, [x1], x2               // load row 1
    ld1     {v1.8b}, [x14], x2              // load row 5
    ld1     {v2.8b}, [x1], x2               // load row 2
    ld1     {v3.8b}, [x14], x2              // load row 6

    st1     {v0.8b}, [x0], x3               // store row 1
    st1     {v1.8b}, [x12], x3              // store row 5
    st1     {v2.8b}, [x0], x3               // store row 2
    st1     {v3.8b}, [x12], x3              // store row 6

    ld1     {v0.8b}, [x1], x2               // load row 3
    ld1     {v1.8b}, [x14], x2              // load row 7
    ld1     {v2.8b}, [x1], x2               // load row 4
    ld1     {v3.8b}, [x14], x2              // load row 8

    st1     {v0.8b}, [x0], x3               // store row 3
    st1     {v1.8b}, [x12], x3              // store row 7
    st1     {v2.8b}, [x0], x3               // store row 4
    st1     {v3.8b}, [x12], x3              // store row 8

    pop_v_regs
    ret


//-----------------------------------------------------------------------------
// Function Name      : impeg2_interpolate_av8()
//
// Detail Description : Writes the rounding average of two source buffers,
//                      dst = (src1 + src2 + 1) >> 1 per byte, for one
//                      macroblock: 16x16 bytes of the first plane and 8x8
//                      bytes of each of the two following planes.
//
// Inputs             : x0 - pointer to src1 descriptor: three 64-bit plane
//                           pointers at byte offsets 0 (y), 8 (u), 16 (v)
//                      x1 - pointer to src2 descriptor, same layout
//                      x2 - pointer to dst descriptor, same layout
//                      x3 - dst stride (row pitch of the y plane; the u and
//                           v planes use x3 >> 1)
//
// Registers Used     : x3 (halved in place), x4, x5, x7, x12, v0-v15
//
// Outputs            : The Motion Compensated Block
//
// Return Data        : None
//
// Programming Note   : Both source buffers are read as packed data: y rows
//                      are 16 bytes apart and u/v rows 8 bytes apart,
//                      independent of x3. Only dst uses the stride.
//-----------------------------------------------------------------------------

.global impeg2_interpolate_av8

impeg2_interpolate_av8:
    push_v_regs

    ldr     x4, [x0, #0]                    // x4 = src1->y
    ldr     x5, [x1, #0]                    // x5 = src2->y
    ldr     x7, [x2, #0]                    // x7 = dst->y
    mov     x12, #4                         // 4 iterations x 4 rows = 16 y rows

interp_lumablocks_stride:
    ld1     {v0.16b}, [x4], #16             // src1 rows n .. n+3
    ld1     {v2.16b}, [x4], #16
    ld1     {v4.16b}, [x4], #16
    ld1     {v6.16b}, [x4], #16

    ld1     {v8.16b}, [x5], #16             // src2 rows n .. n+3
    ld1     {v10.16b}, [x5], #16
    ld1     {v12.16b}, [x5], #16
    ld1     {v14.16b}, [x5], #16

    urhadd  v0.16b, v0.16b, v8.16b          // rounding average, row n
    urhadd  v2.16b, v2.16b, v10.16b         // row n+1
    urhadd  v4.16b, v4.16b, v12.16b         // row n+2
    urhadd  v6.16b, v6.16b, v14.16b         // row n+3

    st1     {v0.16b}, [x7], x3              // dst rows n .. n+3
    st1     {v2.16b}, [x7], x3
    st1     {v4.16b}, [x7], x3
    st1     {v6.16b}, [x7], x3

    subs    x12, x12, #1
    b.ne    interp_lumablocks_stride

    lsr     x3, x3, #1                      // chroma dst pitch = stride >> 1

    ldr     x4, [x0, #8]                    // x4 = src1->u
    ldr     x5, [x1, #8]                    // x5 = src2->u
    ldr     x7, [x2, #8]                    // x7 = dst->u
    mov     x12, #2                         // pass 1 = u plane, pass 2 = v plane

    // Chroma: one pass handles a whole 8x8 plane. Each two-register load
    // fetches two packed 8-byte rows.
interp_chromablocks_stride:
    ld1     {v0.8b, v1.8b}, [x4], #16       // src1 rows 1, 2
    ld1     {v2.8b, v3.8b}, [x4], #16       // src1 rows 3, 4
    ld1     {v4.8b, v5.8b}, [x4], #16       // src1 rows 5, 6
    ld1     {v6.8b, v7.8b}, [x4], #16       // src1 rows 7, 8

    ld1     {v8.8b, v9.8b}, [x5], #16       // src2 rows 1, 2
    ld1     {v10.8b, v11.8b}, [x5], #16     // src2 rows 3, 4
    ld1     {v12.8b, v13.8b}, [x5], #16     // src2 rows 5, 6
    ld1     {v14.8b, v15.8b}, [x5], #16     // src2 rows 7, 8

    // Every operand holds a single 8-byte row, so the 8-lane form is used.
    urhadd  v0.8b, v0.8b, v8.8b             // row 1
    urhadd  v1.8b, v1.8b, v9.8b             // row 2
    urhadd  v2.8b, v2.8b, v10.8b            // row 3
    urhadd  v3.8b, v3.8b, v11.8b            // row 4
    urhadd  v4.8b, v4.8b, v12.8b            // row 5
    urhadd  v5.8b, v5.8b, v13.8b            // row 6
    urhadd  v6.8b, v6.8b, v14.8b            // row 7
    urhadd  v7.8b, v7.8b, v15.8b            // row 8

    st1     {v0.8b}, [x7], x3               // dst rows 1 .. 8
    st1     {v1.8b}, [x7], x3
    st1     {v2.8b}, [x7], x3
    st1     {v3.8b}, [x7], x3
    st1     {v4.8b}, [x7], x3
    st1     {v5.8b}, [x7], x3
    st1     {v6.8b}, [x7], x3
    st1     {v7.8b}, [x7], x3

    // Switch to the v plane for the second pass. These loads also run
    // after the final pass, where their results are simply unused.
    ldr     x4, [x0, #16]                   // x4 = src1->v
    ldr     x5, [x1, #16]                   // x5 = src2->v
    ldr     x7, [x2, #16]                   // x7 = dst->v

    subs    x12, x12, #1
    b.ne    interp_chromablocks_stride

    pop_v_regs
    ret