;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  expand_picture.asm
;*
;*  Abstract
;*      mmxext/sse for expand_frame
;*
;*  History
;*      09/25/2009 Created
;*
;*
;*************************************************************************/

%include "asm_inc.asm"



;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
;
; Syntax: NASM (Intel operand order).
; r0..r6 / r?d, WELS_EXTERN, LOAD_4_PARA, LOAD_4_PARA_POP, PUSH_XMM, POP_XMM,
; SIGN_EXTENSION, SSE2_Copy16Times and WELSEMMS are not defined in this file;
; presumably they come from asm_inc.asm and hide the per-ABI differences
; (TODO confirm against asm_inc.asm).

;***********************************************************************
; Code
;***********************************************************************



SECTION .text


;;;;;;;expanding result;;;;;;;

;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;aaaa|attttttttttttttttb|bbbb
;----------------------------
;aaaa|attttttttttttttttb|bbbb
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;llll|l                r|rrrr
;cccc|ceeeeeeeeeeeeeeeed|dddd
;----------------------------
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd

;-----------------------------------------------------------------------
; Row-store helpers.
;
; Every mov_line_* macro stores the same register into four consecutive
; rows starting at dst and stepping by stride (stride may be negative).
;   * the plain variants leave dst four rows further on, i.e. on the first
;     row that has NOT been written yet;
;   * the "_end" variants advance by only three rows in total, so dst is
;     left ON the last row written. Callers rely on this to turn around
;     and walk back the other way.
; All of them modify dst; flags are untouched (mov/lea only).
;-----------------------------------------------------------------------

%macro mov_line_8x4_mmx 3       ; dst, stride, mm?
    movq    [%1], %3                    ; row 0
    movq    [%1+%2], %3                 ; row 1
    lea     %1, [%1+2*%2]
    movq    [%1], %3                    ; row 2
    movq    [%1+%2], %3                 ; row 3
    lea     %1, [%1+2*%2]               ; dst -> next unwritten row
%endmacro

%macro mov_line_end8x4_mmx 3    ; dst, stride, mm?
    movq    [%1], %3                    ; row 0
    movq    [%1+%2], %3                 ; row 1
    lea     %1, [%1+2*%2]
    movq    [%1], %3                    ; row 2
    movq    [%1+%2], %3                 ; row 3
    lea     %1, [%1+%2]                 ; dst stays on the last written row
%endmacro

%macro mov_line_16x4_sse2 4     ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3                    ; row 0
    movdq%4 [%1+%2], %3                 ; row 1
    lea     %1, [%1+2*%2]
    movdq%4 [%1], %3                    ; row 2
    movdq%4 [%1+%2], %3                 ; row 3
    lea     %1, [%1+2*%2]               ; dst -> next unwritten row
%endmacro

%macro mov_line_end16x4_sse2 4  ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3                    ; row 0
    movdq%4 [%1+%2], %3                 ; row 1
    lea     %1, [%1+2*%2]
    movdq%4 [%1], %3                    ; row 2
    movdq%4 [%1+%2], %3                 ; row 3
    lea     %1, [%1+%2]                 ; dst stays on the last written row
%endmacro

%macro mov_line_32x4_sse2 3     ; dst, stride, xmm?  (aligned stores only)
    movdqa  [%1], %3                    ; row 0, bytes  0..15
    movdqa  [%1+16], %3                 ; row 0, bytes 16..31
    movdqa  [%1+%2], %3                 ; row 1
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+2*%2]
    movdqa  [%1], %3                    ; row 2
    movdqa  [%1+16], %3
    movdqa  [%1+%2], %3                 ; row 3
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+2*%2]               ; dst -> next unwritten row
%endmacro

%macro mov_line_end32x4_sse2 3  ; dst, stride, xmm?  (aligned stores only)
    movdqa  [%1], %3                    ; row 0, bytes  0..15
    movdqa  [%1+16], %3                 ; row 0, bytes 16..31
    movdqa  [%1+%2], %3                 ; row 1
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+2*%2]
    movdqa  [%1], %3                    ; row 2
    movdqa  [%1+16], %3
    movdqa  [%1+%2], %3                 ; row 3
    movdqa  [%1+%2+16], %3
    lea     %1, [%1+%2]                 ; dst stays on the last written row
%endmacro

;-----------------------------------------------------------------------
; exp_top_bottom_sse2 iPaddingSize            [luma: 32 / chroma: 16]
;
; Replicates the first picture row into the iPaddingSize rows above it and
; the last picture row into the iPaddingSize rows below it, one 16-byte
; column at a time.
;
; In:   r0 = pSrc                      (first picture row, source for top)
;       r5 = pSrc - stride             (top border row nearest the picture)
;       r3 = pSrc + (h-1)*stride       (last picture row, source for bottom)
;       r4 = r3 + iPaddingSize*stride  (bottom border row farthest away)
;       r1 = -stride
;       r2 = width in bytes
; Clobbers: r0, r2, r3, r4, r5, xmm0, xmm1, flags; chroma also r6, and
;       mm0/mm1 when the 8-byte tail runs. r1 is negated once per 16-byte
;       column, so its sign on exit depends on the column count.
;
; The row helpers leave r5/r4 on the last row they wrote, so instead of
; rewinding the pointers the loop flips the sign of r1 and the next column
; is filled in the opposite vertical direction.
;
; Loads and stores are movdqa: every 16-byte column of the source rows and
; of the border rows must be 16-byte aligned.
; NOTE(review): the loops are do-while shaped; width/16 == 0 is not guarded
; here, so callers must pass width >= 16.
; Defines local labels, so it can be expanded once per function.
;-----------------------------------------------------------------------
%macro exp_top_bottom_sse2 1    ; iPaddingSize [luma(32)/chroma(16)]
%if %1 == 32                    ; for luma
    sar     r2, 04h                     ; r2 = number of 16-byte columns
.top_bottom_loops:
    ; top: 8 groups x 4 rows = 32 rows
    movdqa  xmm0, [r0]                  ; 16 bytes of the first picture row
    %rep 7
    mov_line_16x4_sse2      r5, r1, xmm0, a
    %endrep
    mov_line_end16x4_sse2   r5, r1, xmm0, a

    ; bottom: 32 rows
    movdqa  xmm1, [r3]                  ; 16 bytes of the last picture row
    %rep 7
    mov_line_16x4_sse2      r4, r1, xmm1, a
    %endrep
    mov_line_end16x4_sse2   r4, r1, xmm1, a

    lea     r0, [r0+16]                 ; top pSrc
    lea     r5, [r5+16]                 ; top dst
    lea     r3, [r3+16]                 ; bottom pSrc
    lea     r4, [r4+16]                 ; bottom dst
    neg     r1                          ; next column walks the other way

    dec     r2
    jnz near .top_bottom_loops
%elif %1 == 16                  ; for chroma
    mov     r6, r2                      ; keep the byte width for the tail test
    sar     r2, 04h                     ; r2 = number of 16-byte columns
.top_bottom_loops:
    ; top: 4 groups x 4 rows = 16 rows
    movdqa  xmm0, [r0]                  ; 16 bytes of the first picture row
    %rep 3
    mov_line_16x4_sse2      r5, r1, xmm0, a
    %endrep
    mov_line_end16x4_sse2   r5, r1, xmm0, a

    ; bottom: 16 rows
    movdqa  xmm1, [r3]                  ; 16 bytes of the last picture row
    %rep 3
    mov_line_16x4_sse2      r4, r1, xmm1, a
    %endrep
    mov_line_end16x4_sse2   r4, r1, xmm1, a

    lea     r0, [r0+16]                 ; top pSrc
    lea     r5, [r5+16]                 ; top dst
    lea     r3, [r3+16]                 ; bottom pSrc
    lea     r4, [r4+16]                 ; bottom dst
    neg     r1                          ; next column walks the other way

    dec     r2
    jnz near .top_bottom_loops

    ; Tail: if width is not a multiple of 16, one further 8-byte column is
    ; filled with MMX (presumably width % 16 == 8 in that case - confirm
    ; with the caller).
    and     r6, 0fh                     ; sets ZF when nothing is left over
    jz near .to_be_continued

    ; top
    movq    mm0, [r0]                   ; remaining 8 bytes of the first row
    %rep 3
    mov_line_8x4_mmx        r5, r1, mm0
    %endrep
    mov_line_end8x4_mmx     r5, r1, mm0
    ; bottom
    movq    mm1, [r3]                   ; remaining 8 bytes of the last row
    %rep 3
    mov_line_8x4_mmx        r4, r1, mm1
    %endrep
    mov_line_end8x4_mmx     r4, r1, mm1
    WELSEMMS                            ; presumably emms: leave MMX state

.to_be_continued:
%endif
%endmacro

;-----------------------------------------------------------------------
; exp_left_right_sse2 iPaddingSize, u/a       [luma: 32 / chroma: 16]
;
; For every picture row, broadcasts the left-most pixel into the
; iPaddingSize bytes to its left and the right-most pixel into the
; iPaddingSize bytes to its right.
;
; In:   r0 = pSrc                 (left-most pixel of the first row)
;       r5 = pSrc - iPaddingSize  (left border dst)
;       r3 = pSrc + (w-1)         (right-most pixel of the first row)
;       r4 = pSrc + w             (right border dst)
;       r1 = stride
;       r6 = height (row count, must be > 0)
; Clobbers: r0, r2, r3, r4, r5, r6, xmm0, xmm1, flags.
;
; Luma always uses aligned stores. For chroma only the right border store
; honours the u/a argument; the left border store is always aligned.
; Defines a local label, so it can be expanded once per function.
;-----------------------------------------------------------------------
%macro exp_left_right_sse2 2    ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32                    ; for luma
.left_right_loops:
    ; left
    movzx   r2d, byte [r0]              ; left-most pixel of this row
    SSE2_Copy16Times xmm0, r2d
    movdqa  [r5], xmm0
    movdqa  [r5+16], xmm0

    ; right
    movzx   r2d, byte [r3]              ; right-most pixel of this row
    SSE2_Copy16Times xmm1, r2d
    movdqa  [r4], xmm1
    movdqa  [r4+16], xmm1

    lea     r0, [r0+r1]                 ; left pSrc
    lea     r5, [r5+r1]                 ; left dst
    lea     r3, [r3+r1]                 ; right pSrc
    lea     r4, [r4+r1]                 ; right dst

    dec     r6
    jnz near .left_right_loops
%elif %1 == 16                  ; for chroma
.left_right_loops:
    ; left
    movzx   r2d, byte [r0]              ; left-most pixel of this row
    SSE2_Copy16Times xmm0, r2d
    movdqa  [r5], xmm0

    ; right
    movzx   r2d, byte [r3]              ; right-most pixel of this row
    SSE2_Copy16Times xmm1, r2d
    movdq%2 [r4], xmm1                  ; pSrc+w may not be 16-byte aligned for chroma

    lea     r0, [r0+r1]                 ; left pSrc
    lea     r5, [r5+r1]                 ; left dst
    lea     r3, [r3+r1]                 ; right pSrc
    lea     r4, [r4+r1]                 ; right dst

    dec     r6
    jnz near .left_right_loops
%endif
%endmacro

;-----------------------------------------------------------------------
; exp_cross_sse2 iPaddingSize, u/a            [luma: 32 / chroma: 16]
;
; Fills the four iPaddingSize x iPaddingSize corner blocks with a constant.
;
; In:   xmm3/xmm4/xmm5/xmm6 = fill value for top-left / top-right /
;                             bottom-left / bottom-right
;       r3/r4/r5/r6         = bottom-most row of the TL / TR / BL / BR block
;       r1                  = -stride (blocks are written bottom row first)
; Clobbers: r3, r4, r5, r6.
;
; Luma uses aligned stores throughout. For chroma the left-hand blocks are
; always aligned; the right-hand blocks use the u/a argument.
;-----------------------------------------------------------------------
%macro exp_cross_sse2 2         ; iPaddingSize [luma(32)/chroma(16)], u/a
%if %1 == 32                    ; luma: 8 groups x 4 rows, 32 bytes wide
    ; TL
    %rep 7
    mov_line_32x4_sse2      r3, r1, xmm3
    %endrep
    mov_line_end32x4_sse2   r3, r1, xmm3

    ; TR
    %rep 7
    mov_line_32x4_sse2      r4, r1, xmm4
    %endrep
    mov_line_end32x4_sse2   r4, r1, xmm4

    ; BL
    %rep 7
    mov_line_32x4_sse2      r5, r1, xmm5
    %endrep
    mov_line_end32x4_sse2   r5, r1, xmm5

    ; BR
    %rep 7
    mov_line_32x4_sse2      r6, r1, xmm6
    %endrep
    mov_line_end32x4_sse2   r6, r1, xmm6
%elif %1 == 16                  ; chroma: 4 groups x 4 rows, 16 bytes wide
    ; TL
    %rep 3
    mov_line_16x4_sse2      r3, r1, xmm3, a
    %endrep
    mov_line_end16x4_sse2   r3, r1, xmm3, a

    ; TR
    %rep 3
    mov_line_16x4_sse2      r4, r1, xmm4, %2
    %endrep
    mov_line_end16x4_sse2   r4, r1, xmm4, %2

    ; BL
    %rep 3
    mov_line_16x4_sse2      r5, r1, xmm5, a
    %endrep
    mov_line_end16x4_sse2   r5, r1, xmm5, a

    ; BR
    %rep 3
    mov_line_16x4_sse2      r6, r1, xmm6, %2
    %endrep
    mov_line_end16x4_sse2   r6, r1, xmm6, %2
%endif
%endmacro

;-----------------------------------------------------------------------
; expand_picture_body_sse2 iPaddingSize, u/a  [luma: 32 / chroma: 16]
;
; Shared body of the three ExpandPicture*_sse2 entry points; the functions
; differ only in these two arguments. It pads, in order: top and bottom
; borders, left and right borders, then the four corners.
;
; In:   r0 = pDst (top-left pixel of the picture), r1 = stride,
;       r2 = width, r3 = height, all already widened to pointer size.
; Uses: r0..r6, xmm0, xmm1, xmm3..xmm6 (and mm0/mm1 in the chroma tail);
;       pushes at most four machine words, balanced on exit.
; The caller is responsible for saving r4..r6 and the xmm registers.
;-----------------------------------------------------------------------
%macro expand_picture_body_sse2 2
    ; corner fill value, top-left: xmm3 = pSrc[0] x16
    movzx   r6d, byte [r0]
    SSE2_Copy16Times xmm3, r6d

    neg     r1
    lea     r5, [r0+r1]                 ; r5 = pSrc - stride: top border dst
    neg     r1

    push    r3                          ; keep height for the left/right pass

    dec     r3                          ; h-1
    imul    r3, r1                      ; (h-1)*stride
    lea     r3, [r0+r3]                 ; r3 = last picture row: bottom src

    mov     r6, r1                      ; r6 = stride
%if %1 == 32
    sal     r6, 05h                     ; r6 = 32*stride
%else
    sal     r6, 04h                     ; r6 = 16*stride
%endif
    lea     r4, [r3+r6]                 ; r4 = farthest bottom border row

    ; corner fill values, bottom-left: xmm5, bottom-right: xmm6
    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm5, r6d

    movzx   r6d, byte [r3+r2-1]
    SSE2_Copy16Times xmm6, r6d

    neg     r1                          ; r1 = -stride

    push    r0                          ; exp_top_bottom_sse2 clobbers r0..r2
    push    r1
    push    r2

    exp_top_bottom_sse2 %1

    pop     r2
    pop     r1                          ; r1 = -stride again
    pop     r0

    ; left and right borders
    lea     r5, [r0-%1]                 ; left border dst
    lea     r3, [r0+r2-1]               ; right border src
    lea     r4, [r3+1]                  ; right border dst

    ; corner fill value, top-right: xmm4
    movzx   r6d, byte [r3]
    SSE2_Copy16Times xmm4, r6d

    neg     r1                          ; r1 = stride

    pop     r6                          ; r6 = height (pushed from r3 above)

    push    r0                          ; exp_left_right_sse2 clobbers r0, r2, r6
    push    r1
    push    r2
    push    r6

    exp_left_right_sse2 %1, %2

    pop     r6
    pop     r2
    pop     r1
    pop     r0

    ; corners [top-left, top-right, bottom-left, bottom-right];
    ; xmm3..xmm6 were initialised above
    neg     r1                          ; r1 = -stride
    lea     r3, [r0-%1]
    lea     r3, [r3+r1]                 ; last row of the top-left block

    lea     r4, [r0+r2]                 ; pSrc + width
    lea     r4, [r4+r1]                 ; last row of the top-right block

    neg     r1                          ; r1 = stride
    add     r6, %1                      ; height + iPaddingSize
    imul    r6, r1

    lea     r5, [r3+r6]                 ; last row of the bottom-left block
    lea     r6, [r4+r6]                 ; last row of the bottom-right block

    neg     r1                          ; r1 = -stride

    exp_cross_sse2 %1, %2
%endmacro

;***********************************************************************----------------
; void ExpandPictureLuma_sse2(  uint8_t *pDst,
;                               const int32_t iStride,
;                               const int32_t iWidth,
;                               const int32_t iHeight   );
;
; Pads the plane at pDst with a 32-pixel border on all four sides.
; All stores are aligned (movdqa).
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    expand_picture_body_sse2 32, a

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0


    ret

;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2(   uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight   );
;
; Pads the plane at pDst with a 16-pixel border on all four sides.
; All stores are aligned (movdqa).
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    expand_picture_body_sse2 16, a

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0


    ret

;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight   );
;
; Pads the plane at pDst with a 16-pixel border on all four sides.
; The right border and the right-hand corners use unaligned stores (movdqu).
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2

    push    r4
    push    r5
    push    r6

    %assign push_num 3
    LOAD_4_PARA
    PUSH_XMM 7

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r3, r3d

    expand_picture_body_sse2 16, u

    POP_XMM
    LOAD_4_PARA_POP

    pop     r6
    pop     r5
    pop     r4

    %assign push_num 0


    ret