;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Common prologue for both transforms.  Binds the symbolic names
; input / output / pitch to the registers that hold the three C
; arguments under the ABI selected at build time:
;
;   32-bit  : input = rsi, output = rdi, pitch = rax (loaded from the stack)
;   Win64   : input = rcx, output = rdx, pitch = r8
;   other   : input = rdi, output = rsi, pitch = rdx
;
; pitch is declared as a C int but is used as a full-width index in
; address arithmetic, so it is sign-extended to pointer width on every
; path; the 64-bit ABIs leave the upper half of the argument register
; undefined.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    ; NOTE(review): presumably spills xmm6-xmm7, which the 8x4 routine
    ; touches (xmm6) -- confirm against x86_abi_support.asm.
    SAVE_XMM 7, u
    movsxd      r8, r8d                         ; pitch = (intptr_t)(int)pitch
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      rdx, edx                        ; pitch = (intptr_t)(int)pitch
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Common epilogue: drops the input / output / pitch aliases, undoes
; whatever STACK_FRAME_CREATE pushed for the active ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; In:    input  - four rows of four 16-bit samples; rows are read with
;                 8-byte movq loads, so no alignment is required
;        output - receives 16 16-bit coefficients via two movdqa
;                 stores, so it must be 16-byte aligned
;        pitch  - distance in bytes between consecutive input rows
; Uses:  xmm0-xmm5; input is advanced by 2*pitch
;
; Lane comments list 16-bit words from most significant to least
; significant; "rc" means row r, column c.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; gather columns {0,1} of every row in xmm0 and columns {3,2}
    ; (swapped) in xmm1 so one paddw/psubw yields a1,b1 / d1,c1
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; second pass
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;rounding term for >>4
    pshufd      xmm2, xmm2, 04eh                ;swap 64-bit halves
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff where the word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper (c1) half,
                                                     ;low half = (d1 != 0)

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16            ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16            ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Same arithmetic as the 4x4 routine, applied to eight columns at once.
;
; In:    input  - four rows of eight 16-bit samples; rows are read with
;                 movdqa, so input must be 16-byte aligned and pitch
;                 must be a multiple of 16
;        output - receives 32 16-bit coefficients via four movdqa
;                 stores, so it must be 16-byte aligned; the results
;                 for columns 0-3 go to bytes 0-31 and those for
;                 columns 4-7 to bytes 32-63
;        pitch  - distance in bytes between consecutive input rows
; Uses:  xmm0-xmm6; input is advanced by 2*pitch
;
; Lane comments in this routine list 16-bit words from least
; significant to most significant; "rc" means row r, column c.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; operand index held by each register for the first stage:
        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,       3
        psllw       xmm4,       3

        psllw       xmm0,       3
        psllw       xmm1,       3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +  7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +  7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; operand index held by each register for the second stage:
        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
                                                                    ; each word = (d1 != 0)

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 + 51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 + 51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; split the eight-lane results into the two 4-column halves
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1        ; columns 0-3: op[0], op[4]
        punpckhqdq  xmm4,       xmm1        ; columns 4-7: op[0], op[4]

        punpcklqdq  xmm2,       xmm3        ; columns 0-3: op[8], op[12]
        punpckhqdq  xmm5,       xmm3        ; columns 4-7: op[8], op[12]

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

SECTION_RODATA
; pmaddwd multiplier pairs: the low word of each dword pairs with d1,
; the high word with c1.
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; pmaddwd by (1, 1) sums each adjacent word pair into a dword.
align 16
_mult_add:
    times 8 dw 1
; 4x4 second pass: only the low four words (the d1 lanes) may carry
; the (d1 != 0) bit.
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4 second pass: all eight words are d1 lanes.
align 16
_cmp_mask8x4:
    times 8 dw 1
; pmaddwd by (1, -1) subtracts the high word of each pair from the low.
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; rounding terms added before the arithmetic right shifts
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000