;*!
;* \copy
;*     Copyright (c)  2004-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      MMX / SSE2 / SSSE3 motion compensation for chroma
;*
;*  History
;*      10/13/2004  Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;***********************************************************************
; Local Data (Read Only)
;***********************************************************************

SECTION .rodata align=16

;***********************************************************************
; Rounding constants: the value 32 (0x20) in every 16-bit lane.
; The routines below add it to each weighted sum right before the
; logical right shift by 6.
; Both tables are only referenced when X86_32_PICASM is NOT defined;
; the PIC build synthesises the same value in a register instead.
;***********************************************************************

ALIGN 16
h264_d0x20_sse2:
    dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
    dw 32,32,32,32


;=============================================================================
; Code
;=============================================================================

SECTION .text

;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *pSrc,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            const uint8_t *pABCD,
;                            int32_t iHeight );
;
; Weighted 2x2 filter for a block that is 4 pixels wide. For every output
; row y and column x in 0..3:
;
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
;
; where cur/next are source rows y and y+1 and A,B,C,D are the four bytes
; read from pABCD (in that order).
;
; Register use after LOAD_6_PARA (macro from asm_inc.asm; presumably it maps
; the six C arguments onto r0..r5 in declaration order - confirm there):
;   r0 = pSrc, r1 = iSrcStride, r2 = pDst, r3 = iDstStride,
;   r4 = pABCD (later reused as the "next source row" pointer), r5 = iHeight
;
; Notes:
;   * Each source row is read with two 4-byte loads at offsets 0 and 1, and
;     iHeight + 1 rows are touched in total.
;   * The loop body runs before the counter is tested and ends only when
;     the counter reaches zero, so iHeight must be >= 1.
;   * All arithmetic is 16-bit (pmullw/paddw) and the final shift is
;     logical; this presumes the four weights sum to 64 so the sum cannot
;     wrap - TODO confirm against the caller's weight table.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign  push_num 0
    LOAD_6_PARA
    ; Stride and height arrive as int32_t; SIGN_EXTENSION (asm_inc.asm)
    ; presumably widens them to full register width on 64-bit targets.
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- Broadcast the four weights into word lanes ----------------------
    movd        mm3, [r4]           ; mm3 = A B C D (bytes)
    WELS_Zero   mm7                 ; mm7 = 0, used for byte->word unpacking
    punpcklbw   mm3, mm3            ; A A B B C C D D
    movq        mm4, mm3
    punpcklwd   mm3, mm3            ; A A A A B B B B
    punpckhwd   mm4, mm4            ; C C C C D D D D

    movq        mm5, mm3
    punpcklbw   mm3, mm7            ; mm3 = A in each of the 4 words
    punpckhbw   mm5, mm7            ; mm5 = B in each of the 4 words

    movq        mm6, mm4
    punpcklbw   mm4, mm7            ; mm4 = C in each of the 4 words
    punpckhbw   mm6, mm7            ; mm6 = D in each of the 4 words

    ; ---- Prime the loop with the first source row ------------------------
    lea         r4, [r0 + r1]       ; r4 -> second source row
    movd        mm0, [r0]           ; cur[0..3]
    movd        mm1, [r0+1]         ; cur[1..4]
    punpcklbw   mm0, mm7            ; widen to words
    punpcklbw   mm1, mm7
.xloop:
    ; mm0 = cur[x] (words), mm1 = cur[x+1] (words) on entry.
    pmullw      mm0, mm3            ; A * cur[x]
    pmullw      mm1, mm5            ; B * cur[x+1]
    paddw       mm0, mm1

    movd        mm1, [r4]           ; next[0..3]
    punpcklbw   mm1, mm7
    movq        mm2, mm1            ; keep unscaled next[x] for the next pass
    pmullw      mm1, mm4            ; C * next[x]
    paddw       mm0, mm1

    movd        mm1, [r4+1]         ; next[1..4]
    punpcklbw   mm1, mm7
    movq        mm7, mm1            ; park unscaled next[x+1] in mm7 (zero is
                                    ; rebuilt below before it is needed again)
    pmullw      mm1, mm6            ; D * next[x+1]
    paddw       mm0, mm1
    movq        mm1, mm7            ; mm1 = next[x+1], i.e. cur[x+1] next pass

%ifdef X86_32_PICASM
    ; PIC build: build 32 per word without touching memory.
    pcmpeqw     mm7, mm7            ; 0xFFFF in every word
    psrlw       mm7, 15             ; 1
    psllw       mm7, 5              ; 32
    paddw       mm0, mm7
%else
    paddw       mm0, [h264_d0x20_mmx]
%endif
    psrlw       mm0, 6

    WELS_Zero   mm7                 ; restore the zero register
    packuswb    mm0, mm7            ; words -> unsigned saturated bytes
    movd        [r2], mm0           ; store 4 output pixels

    movq        mm0, mm2            ; next[x] becomes cur[x]

    lea         r2, [r2 + r3]       ; advance destination by one row
    lea         r4, [r4 + r1]       ; advance "next row" source pointer

    dec         r5
    jnz near    .xloop
    WELSEMMS                        ; macro from asm_inc.asm (presumably emms,
                                    ; leaving MMX state before returning)
    LOAD_6_PARA_POP
    ret


;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             const uint8_t *pABCD,
;                             int32_t iHeight );
;
; Same filter as McChromaWidthEq4_mmx, but 8 pixels wide and using XMM
; registers:
;
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
;
; Register use mirrors the MMX routine (r0 = pSrc, r1 = iSrcStride,
; r2 = pDst, r3 = iDstStride, r4 = pABCD then next-row pointer,
; r5 = iHeight).
;
; Notes:
;   * Each source row is read with two 8-byte loads at offsets 0 and 1, and
;     iHeight + 1 rows are touched in total.
;   * iHeight must be >= 1 (test-at-bottom loop that ends at exactly zero).
;   * PUSH_XMM 8 / POP_XMM are asm_inc.asm macros; presumably they
;     save/restore the callee-saved XMM registers where the ABI requires
;     it - confirm there.
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- Broadcast the four weights into word lanes ----------------------
    movd        xmm3, [r4]          ; A B C D (bytes)
    WELS_Zero   xmm7                ; xmm7 = 0, used for byte->word unpacking
    punpcklbw   xmm3, xmm3          ; A A B B C C D D
    punpcklwd   xmm3, xmm3          ; Ax4 Bx4 Cx4 Dx4

    movdqa      xmm4, xmm3
    punpckldq   xmm3, xmm3          ; Ax8 Bx8
    punpckhdq   xmm4, xmm4          ; Cx8 Dx8
    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm4

    punpcklbw   xmm3, xmm7          ; xmm3 = A in each of the 8 words
    punpckhbw   xmm5, xmm7          ; xmm5 = B in each of the 8 words
    punpcklbw   xmm4, xmm7          ; xmm4 = C in each of the 8 words
    punpckhbw   xmm6, xmm7          ; xmm6 = D in each of the 8 words

    ; ---- Prime the loop with the first source row ------------------------
    lea         r4, [r0 + r1]       ; r4 -> second source row
    movq        xmm0, [r0]          ; cur[0..7]
    movq        xmm1, [r0+1]        ; cur[1..8]
    punpcklbw   xmm0, xmm7          ; widen to words
    punpcklbw   xmm1, xmm7
.xloop:
    ; xmm0 = cur[x] (words), xmm1 = cur[x+1] (words) on entry.
    pmullw      xmm0, xmm3          ; A * cur[x]
    pmullw      xmm1, xmm5          ; B * cur[x+1]
    paddw       xmm0, xmm1

    movq        xmm1, [r4]          ; next[0..7]
    punpcklbw   xmm1, xmm7
    movdqa      xmm2, xmm1          ; keep unscaled next[x] for the next pass
    pmullw      xmm1, xmm4          ; C * next[x]
    paddw       xmm0, xmm1

    movq        xmm1, [r4+1]        ; next[1..8]
    punpcklbw   xmm1, xmm7
    movdqa      xmm7, xmm1          ; park unscaled next[x+1] in xmm7 (zero is
                                    ; rebuilt below before it is needed again)
    pmullw      xmm1, xmm6          ; D * next[x+1]
    paddw       xmm0, xmm1
    movdqa      xmm1, xmm7          ; xmm1 = next[x+1], cur[x+1] next pass

%ifdef X86_32_PICASM
    ; PIC build: build 32 per word without touching memory.
    pcmpeqw     xmm7, xmm7          ; 0xFFFF in every word
    psrlw       xmm7, 15            ; 1
    psllw       xmm7, 5             ; 32
    paddw       xmm0, xmm7
%else
    paddw       xmm0, [h264_d0x20_sse2] ; table is declared ALIGN 16
%endif
    psrlw       xmm0, 6

    WELS_Zero   xmm7                ; restore the zero register
    packuswb    xmm0, xmm7          ; words -> unsigned saturated bytes
    movq        [r2], xmm0          ; store 8 output pixels

    movdqa      xmm0, xmm2          ; next[x] becomes cur[x]

    lea         r2, [r2 + r3]       ; advance destination by one row
    lea         r4, [r4 + r1]       ; advance "next row" source pointer

    dec         r5
    jnz near    .xloop

    POP_XMM
    LOAD_6_PARA_POP

    ret




;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              const uint8_t *pABCD,
;                              int32_t iHeight );
;
; 8-pixel-wide version of the same weighted 2x2 filter, built on
; pmaddubsw. Each source row is turned into the byte pairs
; (p[x], p[x+1]) for x = 0..7, so one pmaddubsw against the repeated
; byte pair (A,B) - or (C,D) - yields A*p[x] + B*p[x+1] per word.
;
;   pDst[x] = ( A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32 ) >> 6
;
; Notes:
;   * The loop writes TWO output rows per iteration and ends only when the
;     counter reaches exactly zero after "sub r5, 2"; iHeight must
;     therefore be a positive even number.
;   * Every source row is fetched with an unaligned 16-byte load (movdqu)
;     although only the low 9 bytes contribute to the result; iHeight + 1
;     rows are touched in total.
;   * pmaddubsw multiplies unsigned bytes of its destination operand by
;     signed bytes of its source operand; pixels are kept in the
;     destination and weights in the source, so each weight byte is
;     interpreted as signed.
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r5, r5d

    ; ---- Build the (A,B) and (C,D) byte-pair vectors ---------------------
    movd        xmm5, [r4]          ; A B C D (bytes)
    punpcklwd   xmm5, xmm5          ; AB AB CD CD
    punpckldq   xmm5, xmm5          ; ABAB ABAB CDCD CDCD
    movdqa      xmm6, xmm5
    punpcklqdq  xmm5, xmm5          ; xmm5 = (A,B) repeated 8 times
    punpckhqdq  xmm6, xmm6          ; xmm6 = (C,D) repeated 8 times

    ; Bias the destination pointer back by two rows; the loop advances it
    ; by two rows at its top, so the first pass lands on pDst itself.
    sub         r2, r3
    sub         r2, r3

    ; ---- xmm7 = rounding constant 32 in every word -----------------------
%ifdef X86_32_PICASM
    pcmpeqw     xmm7, xmm7          ; 0xFFFF in every word
    psrlw       xmm7, 15            ; 1
    psllw       xmm7, 5             ; 32
%else
    movdqa      xmm7, [h264_d0x20_sse2]
%endif

    ; ---- Prime the loop: pair up the first source row --------------------
    movdqu      xmm0, [r0]
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1             ; row shifted down by one byte
    punpcklbw   xmm0, xmm1          ; xmm0 = (p0,p1)(p1,p2)...(p7,p8)

.hloop_chroma:
    lea         r2, [r2+2*r3]       ; destination for this pair of rows

    ; Row n+1, paired the same way; a copy is kept in xmm4 because it is
    ; the "current" row of the second output row.
    movdqu      xmm2, [r0+r1]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3
    movdqa      xmm4, xmm2

    ; First output row: rows n and n+1.
    pmaddubsw   xmm0, xmm5          ; A*cur[x]  + B*cur[x+1]
    pmaddubsw   xmm2, xmm6          ; C*next[x] + D*next[x+1]
    paddw       xmm0, xmm2
    paddw       xmm0, xmm7          ; + 32
    psrlw       xmm0, 6
    packuswb    xmm0, xmm0
    movq        [r2], xmm0          ; store 8 output pixels

    ; Row n+2, paired; a copy goes to xmm0 as the "current" row of the
    ; next iteration.
    lea         r0, [r0+2*r1]
    movdqu      xmm2, [r0]
    movdqa      xmm3, xmm2
    psrldq      xmm3, 1
    punpcklbw   xmm2, xmm3
    movdqa      xmm0, xmm2

    ; Second output row: rows n+1 and n+2.
    pmaddubsw   xmm4, xmm5
    pmaddubsw   xmm2, xmm6
    paddw       xmm4, xmm2
    paddw       xmm4, xmm7          ; + 32
    psrlw       xmm4, 6
    packuswb    xmm4, xmm4
    movq        [r2+r3], xmm4       ; store 8 output pixels, one row below

    sub         r5, 2
    jnz         .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret