;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mb_copy.asm
;*
;*  Abstract
;*      mb_copy and mb_copy1
;*
;*  History
;*      15/09/2009 Created
;*      12/28/2009 Modified for higher throughput
;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2; added UpdateMbMv_sse2,
;*                 WelsCopy16x8NotAligned_sse2, WelsCopy16x8_mmx,
;*                 WelsCopy8x16_mmx, etc.
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

%ifdef __NASM_VER__
    %use smartalign
%endif

;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************

;***********************************************************************
; Code
;***********************************************************************

SECTION .text

;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
;                          int32_t  iStrideD,
;                          uint8_t* Src,
;                          int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]           ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]           ; r5 = 3 * iStrideS

    ; load source rows 0-7
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    ; store destination rows 0-7
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    ; load source rows 8-15
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]

    ; store destination rows 8-15
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret
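
; For reference, a minimal C sketch of what WelsCopy16x16_sse2 computes
; (illustrative only, not part of the codec's C sources):
;
;   #include <string.h>
;   void WelsCopy16x16_ref (uint8_t* Dst, int32_t iStrideD,
;                           uint8_t* Src, int32_t iStrideS) {
;     for (int i = 0; i < 16; ++i)
;       memcpy (Dst + i * iStrideD, Src + i * iStrideS, 16);
;   }
;
; The SSE2 version unrolls this into two passes of eight 16-byte row
; copies, precomputing 3*stride in r4/r5 so that four rows are addressable
; between base-pointer updates.
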
;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
;                                    int32_t  iStrideD,
;                                    uint8_t* Src,
;                                    int32_t  iStrideS )
;***********************************************************************
; Dst is assumed to be 16-byte aligned, but Src may not be, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]           ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]           ; r5 = 3 * iStrideS

    ; unaligned loads of source rows 0-7
    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    ; aligned stores of destination rows 0-7
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    ; unaligned loads of source rows 8-15
    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    ; aligned stores of destination rows 8-15
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

; 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2( uint8_t* Dst,
;                                   int32_t  iStrideD,
;                                   uint8_t* Src,
;                                   int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    lea r4, [r1+2*r1]           ; r4 = 3 * iStrideD
    lea r5, [r3+2*r3]           ; r5 = 3 * iStrideS

    ; unaligned loads of source rows 0-7
    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    ; aligned stores of destination rows 0-7
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

;***********************************************************************
; void WelsCopy8x16_mmx( uint8_t* Dst,
;                        int32_t  iStrideD,
;                        uint8_t* Src,
;                        int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; rows 0-7
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]
    lea r2, [r2+2*r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7
    lea r0, [r0+2*r1]

    ; rows 8-15
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    ret
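
; Caller-side selection between the aligned and unaligned 16x16 copies is
; done in C. A hedged sketch (illustrative; the actual dispatch in the
; codec may use different conditions):
;
;   if ((((uintptr_t) Src | (uintptr_t) iStrideS) & 15) == 0)
;     WelsCopy16x16_sse2 (Dst, iStrideD, Src, iStrideS);   /* movdqa loads */
;   else
;     WelsCopy16x16NotAligned_sse2 (Dst, iStrideD, Src, iStrideS);
;
; movdqa faults on a misaligned address, so only the source loads differ;
; stores stay movdqa because Dst is 16-byte aligned in both paths.
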
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
;                       int32_t  iStrideD,
;                       uint8_t* Src,
;                       int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
    push r4
    %assign push_num 1
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    lea r4, [r3+2*r3]           ; r4 = 3 * iStrideS

    ; prefetch the next pair of source rows
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    ; prefetch the next pair of source rows
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    ; prefetch the next pair of source rows
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    pop r4
    ret

; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
;***********************************************************************
WELS_EXTERN UpdateMbMv_sse2
    %assign push_num 0
    LOAD_2_PARA

    movd xmm0, r1d              ; sMv, passed by value in r1d
    pshufd xmm1, xmm0, $00      ; broadcast it to all four dwords
    movdqa [r0     ], xmm1      ; fill 16 consecutive SMVUnitXY entries
    movdqa [r0+0x10], xmm1
    movdqa [r0+0x20], xmm1
    movdqa [r0+0x30], xmm1
    ret

;*******************************************************************************
; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
;                            uint8_t *pSrcA, int iSrcAStride,
;                            uint8_t *pSrcB, int iSrcBStride,
;                            int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq4_mmx
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.height_loop:
    movd mm0, [r4]
    pavgb mm0, [r2]             ; rounding average; only the low dword is kept
    movd [r0], mm0

    dec r6
    lea r0, [r0+r1]
    lea r2, [r2+r3]
    lea r4, [r4+r5]
    jne .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret
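
; pavgb is a rounding byte average: (a + b + 1) >> 1. A minimal C sketch
; of the PixelAvgWidthEqN routines (illustrative reference only):
;
;   void PixelAvgWidthEqN_ref (uint8_t* pDst,  int iDstStride,
;                              uint8_t* pSrcA, int iSrcAStride,
;                              uint8_t* pSrcB, int iSrcBStride,
;                              int iHeight, int N /* 4, 8 or 16 */) {
;     for (int y = 0; y < iHeight; ++y)
;       for (int x = 0; x < N; ++x)
;         pDst[y * iDstStride + x] = (uint8_t)
;             ((pSrcA[y * iSrcAStride + x] +
;               pSrcB[y * iSrcBStride + x] + 1) >> 1);
;   }
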
;*******************************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
;                            uint8_t *pSrcA, int iSrcAStride,
;                            uint8_t *pSrcB, int iSrcBStride,
;                            int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq8_mmx
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.height_loop:
    ; two rows per iteration
    movq mm0, [r2]
    pavgb mm0, [r4]
    movq [r0], mm0
    movq mm0, [r2+r3]
    pavgb mm0, [r4+r5]
    movq [r0+r1], mm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    sub r6, 2
    jnz .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

;*******************************************************************************
; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
;                              uint8_t *pSrcA, int iSrcAStride,
;                              uint8_t *pSrcB, int iSrcBStride,
;                              int iHeight );
;*******************************************************************************
WELS_EXTERN PixelAvgWidthEq16_sse2
    %assign push_num 0
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
    ; four rows per iteration
    movdqu xmm0, [r2]
    movdqu xmm1, [r4]
    pavgb xmm0, xmm1
    movdqu [r0], xmm0

    movdqu xmm0, [r2+r3]
    movdqu xmm1, [r4+r5]
    pavgb xmm0, xmm1
    movdqu [r0+r1], xmm0

    movdqu xmm0, [r2+2*r3]
    movdqu xmm1, [r4+2*r5]
    pavgb xmm0, xmm1
    movdqu [r0+2*r1], xmm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    movdqu xmm0, [r2+r3]
    movdqu xmm1, [r4+r5]
    pavgb xmm0, xmm1
    movdqu [r0+r1], xmm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    sub r6, 4
    jne .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

; load_instr=%1 store_instr=%2 p_dst=%3 i_dststride=%4 p_src=%5 i_srcstride=%6
; cnt=%7 r_tmp=%8,%9 mm_tmp=%10,%11
; Copies cnt rows of one register width, four rows per loop iteration.
%macro CopyStrided4N 11
    lea %8, [3 * %6]            ; %8 = 3 * i_srcstride
    lea %9, [3 * %4]            ; %9 = 3 * i_dststride
ALIGN 32
%%loop:
    %1 %10, [%5]
    %1 %11, [%5 + %6]
    %2 [%3], %10
    %2 [%3 + %4], %11
    %1 %10, [%5 + 2 * %6]
    %1 %11, [%5 + %8]
    %2 [%3 + 2 * %4], %10
    %2 [%3 + %9], %11
    lea %5, [%5 + 4 * %6]
    lea %3, [%3 + 4 * %4]
    sub %7, 4
    jg %%loop
%endmacro

;*******************************************************************************
; void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
;                          uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq8_mmx
    %assign push_num 0
%ifdef X86_32
    push r5
    push r6
    %assign push_num 2
%endif
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    CopyStrided4N movq, movq, r2, r3, r0, r1, r4, r5, r6, mm0, mm1

    WELSEMMS
    LOAD_5_PARA_POP
%ifdef X86_32
    pop r6
    pop r5
%endif
    ret
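
; Note the argument order above: McCopyWidthEq8_mmx takes
; (pSrc, iSrcStride, pDst, iDstStride, iHeight), so r0/r1 arrive in the
; macro's p_src/i_srcstride slots and r2/r3 in p_dst/i_dststride. A hedged
; C sketch of the resulting copy (reference only):
;
;   #include <string.h>
;   void McCopyWidthEq8_ref (const uint8_t* pSrc, int iSrcStride,
;                            uint8_t* pDst, int iDstStride, int iHeight) {
;     for (int y = 0; y < iHeight; ++y)
;       memcpy (pDst + y * iDstStride, pSrc + y * iSrcStride, 8);
;   }
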
;*******************************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
; read unaligned memory
%macro SSE_READ_UNA 2
    movq %1, [%2]
    movhps %1, [%2+8]
%endmacro

; write unaligned memory
%macro SSE_WRITE_UNA 2
    movq [%1], %2
    movhps [%1+8], %2
%endmacro

WELS_EXTERN McCopyWidthEq16_sse2
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
    ; two rows per iteration
    SSE_READ_UNA xmm0, r0
    SSE_READ_UNA xmm1, r0+r1
    SSE_WRITE_UNA r2, xmm0
    SSE_WRITE_UNA r2+r3, xmm1

    sub r4, 2
    lea r0, [r0+r1*2]
    lea r2, [r2+r3*2]
    jnz .height_loop

    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McCopyWidthEq16_sse3( uint8_t *pSrc, int iSrcStride,
;                            uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq16_sse3
    %assign push_num 0
%ifdef X86_32
    push r5
    push r6
    %assign push_num 2
%endif
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    CopyStrided4N lddqu, MOVDQ, r2, r3, r0, r1, r4, r5, r6, xmm0, xmm1

    LOAD_5_PARA_POP
%ifdef X86_32
    pop r6
    pop r5
%endif
    ret
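
; Informational note (a general x86 tuning rationale, not documented in
; this file): SSE_READ_UNA / SSE_WRITE_UNA split each unaligned 16-byte
; access into a movq + movhps pair, which pre-SSE3 cores typically handled
; faster than a single movdqu, while the _sse3 variant relies on lddqu for
; its unaligned loads. On later CPUs a plain movdqu is an equivalent
; substitute:
;
;   movdqu xmm0, [r0]           ; same effect as SSE_READ_UNA xmm0, r0
;   movdqu [r2], xmm0           ; same effect as SSE_WRITE_UNA r2, xmm0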