;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Shared prologue for both functions in this file.  Binds the symbolic
; names `input`, `output` and `pitch` to the registers that hold the
; three arguments for the ABI being assembled:
;
;   32-bit        : arguments are loaded from the stack via arg(n) into
;                   rsi / rdi / rax; rbp, rsi and rdi are pushed first.
;   64-bit, x64   : rcx / rdx / r8
;   64-bit, other : rdi / rsi / rdx
;
; On the 64-bit paths rsp is never modified; STACK_FRAME_DESTROY relies
; on that to find the xmm6 spill slot on x64.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
    lea         rcx, [rsi + rax*2]      ; rcx = input + 2*pitch; not
                                        ; referenced by either function below
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    ; xmm6 is callee-saved under the Microsoft x64 convention and is
    ; overwritten by vp8_short_fdct8x4_sse2.  Spill it into the 32-byte
    ; home space the caller reserves directly above the return address,
    ; so rsp does not have to move.  movdqu: no alignment assumption.
    movdqu      XMMWORD PTR[rsp + 8], xmm6
    ; pitch is a C int; the upper half of its register is not guaranteed
    ; to be extended, but it is used below as a 64-bit index.
    movsxd      r8, r8d
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    ; pitch is a C int; the upper half of its register is not guaranteed
    ; to be extended, but it is used below as a 64-bit index.
    movsxd      rdx, edx
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Shared epilogue: clears the symbolic register names, undoes whatever
; STACK_FRAME_CREATE did for the current ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    ; reload the callee-saved xmm6 spilled by STACK_FRAME_CREATE
    movdqu      xmm6, XMMWORD PTR[rsp + 8]
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; In:    input  - four rows of four 16-bit values; consecutive rows are
;                 `pitch` bytes apart (pitch is added to the pointer
;                 unscaled).  Rows are read with movq (8 bytes each).
;        output - receives 16 16-bit values (32 bytes); written with
;                 movdqa, so it must be 16-byte aligned.
; Uses:  xmm0-xmm5, and `input` is advanced by 2*pitch.
;
; Two passes over the block.  The first pass shifts the sums and
; differences left by 3 and rounds the odd outputs with 14500 / 7500
; before >>12; the second pass rounds the even outputs with +7 >>4 and
; the odd outputs with 12000 / 51000 before >>16, then adds (d1 != 0)
; to op[4].
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff where the word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
                                                     ;and keep bit 0 of lower

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16            ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16            ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; In:    input  - four rows of eight 16-bit values; consecutive rows are
;                 `pitch` bytes apart.  Rows are read with movdqa, so
;                 every row address must be 16-byte aligned.
;        output - receives 32 16-bit values (64 bytes); written with
;                 movdqa, so it must be 16-byte aligned.  Bytes 0-31
;                 hold the results for input columns 0-3, bytes 32-63
;                 those for columns 4-7.
; Uses:  xmm0-xmm6, and `input` is advanced by 2*pitch.  xmm6 is
;        preserved for the caller on x64 by the frame macros.
;
; Same two-pass arithmetic and rounding constants as the 4x4 routine,
; applied to two side-by-side 4x4 blocks at once.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2)
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,       3
        psllw       xmm4,       3

        psllw       xmm0,       3
        psllw       xmm1,       3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +  7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +  7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; row -> register after the transpose above
        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1        ; keep a copy of row 2 for c1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; 1 where d1 != 0,
                                                                    ; 0 elsewhere

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 + 51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 + 51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; split each 8-wide result row into its two 4-wide halves:
        ; low halves go to the first 32 output bytes, high halves to the next 32
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

SECTION_RODATA
; pmaddwd multiplier pairs: each 32-bit lane computes lo*first + hi*second
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
align 16
_mult_add:
    times 8 dw 1
; 4x4: only the low four words (the d1 lanes) may contribute the +1
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4: all eight words are d1 lanes
align 16
_cmp_mask8x4:
    times 8 dw 1
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; rounding constants (dword lanes unless noted)
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7        ; word-lane variant used by the 8x4 routine
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000