;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

section .text
    global sym(vp8_short_fdct4x4_mmx)
    global sym(vp8_short_fdct8x4_wmt)


; The cosine constants are scaled by 1 << DCTCONSTANTSBITS (= 65536).
; x_c1 and x_c2 do not fit in a signed 16-bit word, so pmulhw (a signed
; multiply) sees them as (c - 65536) and returns ((x * c) >> 16) - x.
; The butterfly code therefore adds x back after each multiply by c1 or c2.
; x_c3 is below 32768 and needs no such correction.
%define         DCTCONSTANTSBITS         (16)
%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)


; 4x4 word transpose.
; in : mm0 = a0 a1 a2 a3, mm1 = b0 b1 b2 b3,
;      mm2 = c0 c1 c2 c3, mm3 = d0 d1 d2 d3
; out: mm0 = a0 b0 c0 d0, mm1 = a1 b1 c1 d1,
;      mm2 = a2 b2 c2 d2, mm3 = a3 b3 c3 d3
; Clobbers mm4 and mm5.
%macro FDCT4_TRANSPOSE_MMX 0
    movq        mm4,        mm0             ; a0 a1 a2 a3
    movq        mm5,        mm2             ; c0 c1 c2 c3

    punpcklwd   mm0,        mm1             ; a0 b0 a1 b1
    punpckhwd   mm4,        mm1             ; a2 b2 a3 b3

    punpcklwd   mm2,        mm3             ; c0 d0 c1 d1
    punpckhwd   mm5,        mm3             ; c2 d2 c3 d3

    movq        mm1,        mm0             ; a0 b0 a1 b1
    punpckldq   mm0,        mm2             ; a0 b0 c0 d0

    punpckhdq   mm1,        mm2             ; a1 b1 c1 d1

    movq        mm2,        mm4             ; a2 b2 a3 b3
    punpckldq   mm2,        mm5             ; a2 b2 c2 d2

    punpckhdq   mm4,        mm5             ; a3 b3 c3 d3
    movq        mm3,        mm4
%endmacro


; One 1-D 4-point forward DCT pass on four lanes in parallel.
; in : mm0..mm3 = elements 0..3 of each 4-point vector,
;      rdx      = address of dct_const_mmx
; out: mm0..mm3 = coefficients 0..3
; Clobbers mm4, mm5, mm6 and mm7.
%macro FDCT4_BUTTERFLY_MMX 0
    movq        mm5,        mm0
    movq        mm4,        mm1

    paddw       mm0,        mm3             ; a = 0 + 3
    paddw       mm1,        mm2             ; b = 1 + 2

    psubw       mm4,        mm2             ; c = 1 - 2
    psubw       mm5,        mm3             ; d = 0 - 3

    ; coefficients 0 and 2
    movq        mm6,        [rdx + 16]      ; c2
    movq        mm2,        mm0             ; a

    paddw       mm0,        mm1             ; a + b
    psubw       mm2,        mm1             ; a - b

    movq        mm1,        mm0             ; a + b
    pmulhw      mm0,        mm6             ; ((a + b) * c2 >> 16) - (a + b)

    paddw       mm0,        mm1             ; coefficient 0 = (a + b) * c2
    pmulhw      mm6,        mm2             ; ((a - b) * c2 >> 16) - (a - b)

    paddw       mm2,        mm6             ; coefficient 2 = (a - b) * c2

    ; coefficients 1 and 3
    movq        mm6,        [rdx +  8]      ; c1
    movq        mm7,        [rdx + 24]      ; c3

    movq        mm1,        mm4             ; c
    movq        mm3,        mm5             ; d

    pmulhw      mm1,        mm7             ; c * c3
    pmulhw      mm3,        mm6             ; (d * c1 >> 16) - d

    paddw       mm3,        mm5             ; d * c1
    paddw       mm1,        mm3             ; coefficient 1 = c * c3 + d * c1

    movq        mm3,        mm4             ; c
    pmulhw      mm5,        mm7             ; d * c3

    pmulhw      mm4,        mm6             ; (c * c1 >> 16) - c
    paddw       mm3,        mm4             ; c * c1

    psubw       mm5,        mm3             ; coefficient 3 = d * c3 - c * c1
    movq        mm3,        mm5
%endmacro


; Two independent 4x4 word transposes, one per 64-bit half.
; in : xmm0 = a0..a7, xmm2 = b0..b7, xmm4 = c0..c7, xmm3 = d0..d7
; out: xmm0 = a0 b0 c0 d0 a4 b4 c4 d4
;      xmm1 = a1 b1 c1 d1 a5 b5 c5 d5
;      xmm2 = a2 b2 c2 d2 a6 b6 c6 d6
;      xmm3 = a3 b3 c3 d3 a7 b7 c7 d7
; Clobbers xmm4 and xmm5.
%macro FDCT4_TRANSPOSE_XMM 0
    movdqa      xmm1,       xmm0            ; a0 a1 a2 a3 a4 a5 a6 a7
    movdqa      xmm5,       xmm4            ; c0 c1 c2 c3 c4 c5 c6 c7

    punpcklwd   xmm0,       xmm2            ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckhwd   xmm1,       xmm2            ; a4 b4 a5 b5 a6 b6 a7 b7

    punpcklwd   xmm4,       xmm3            ; c0 d0 c1 d1 c2 d2 c3 d3
    punpckhwd   xmm5,       xmm3            ; c4 d4 c5 d5 c6 d6 c7 d7

    movdqa      xmm2,       xmm0            ; a0 b0 a1 b1 a2 b2 a3 b3
    punpckldq   xmm0,       xmm4            ; a0 b0 c0 d0 a1 b1 c1 d1

    punpckhdq   xmm2,       xmm4            ; a2 b2 c2 d2 a3 b3 c3 d3

    movdqa      xmm4,       xmm1            ; a4 b4 a5 b5 a6 b6 a7 b7
    punpckldq   xmm4,       xmm5            ; a4 b4 c4 d4 a5 b5 c5 d5

    punpckhdq   xmm1,       xmm5            ; a6 b6 c6 d6 a7 b7 c7 d7
    movdqa      xmm3,       xmm2            ; a2 b2 c2 d2 a3 b3 c3 d3

    punpckhqdq  xmm3,       xmm1            ; a3 b3 c3 d3 a7 b7 c7 d7
    punpcklqdq  xmm2,       xmm1            ; a2 b2 c2 d2 a6 b6 c6 d6

    movdqa      xmm1,       xmm0            ; a0 b0 c0 d0 a1 b1 c1 d1
    punpcklqdq  xmm0,       xmm4            ; a0 b0 c0 d0 a4 b4 c4 d4

    punpckhqdq  xmm1,       xmm4            ; a1 b1 c1 d1 a5 b5 c5 d5
%endmacro


; One 1-D 4-point forward DCT pass on eight lanes in parallel.
; in : xmm0..xmm3 = elements 0..3 of each 4-point vector,
;      rdx        = address of dct_const_xmm (16-byte aligned)
; out: xmm0..xmm3 = coefficients 0..3
; Clobbers xmm4 and xmm5 only.  The constants are read as memory operands
; instead of being held in xmm6/xmm7, because xmm6 and above are
; callee-saved under the Windows x64 calling convention and this file does
; not save them.
%macro FDCT4_BUTTERFLY_XMM 0
    movdqa      xmm5,       xmm0
    movdqa      xmm4,       xmm1

    paddw       xmm0,       xmm3            ; a = 0 + 3
    paddw       xmm1,       xmm2            ; b = 1 + 2

    psubw       xmm4,       xmm2            ; c = 1 - 2
    psubw       xmm5,       xmm3            ; d = 0 - 3

    ; coefficients 0 and 2
    movdqa      xmm2,       xmm0            ; a

    paddw       xmm0,       xmm1            ; a + b
    psubw       xmm2,       xmm1            ; a - b

    movdqa      xmm1,       xmm0            ; a + b
    movdqa      xmm3,       xmm2            ; a - b

    pmulhw      xmm0,       [rdx + 32]      ; ((a + b) * c2 >> 16) - (a + b)
    pmulhw      xmm3,       [rdx + 32]      ; ((a - b) * c2 >> 16) - (a - b)

    paddw       xmm0,       xmm1            ; coefficient 0 = (a + b) * c2
    paddw       xmm2,       xmm3            ; coefficient 2 = (a - b) * c2

    ; coefficients 1 and 3
    movdqa      xmm1,       xmm4            ; c
    movdqa      xmm3,       xmm5            ; d

    pmulhw      xmm1,       [rdx + 48]      ; c * c3
    pmulhw      xmm3,       [rdx + 16]      ; (d * c1 >> 16) - d

    paddw       xmm3,       xmm5            ; d * c1
    paddw       xmm1,       xmm3            ; coefficient 1 = c * c3 + d * c1

    movdqa      xmm3,       xmm4            ; c
    pmulhw      xmm5,       [rdx + 48]      ; d * c3

    pmulhw      xmm4,       [rdx + 16]      ; (c * c1 >> 16) - c
    paddw       xmm3,       xmm4            ; c * c1

    psubw       xmm5,       xmm3            ; coefficient 3 = d * c3 - c * c1
    movdqa      xmm3,       xmm5
%endmacro


;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Forward 4x4 DCT of one block of 16-bit samples.
;   input  - first row of the block; each following row is `pitch` bytes on
;   output - receives 16 coefficients as four consecutive rows of four words
;   pitch  - distance between input rows, in bytes
; The samples are scaled up by 8 before the transform and the result is
; scaled back with (x + 4) >> 3.  MMX state is left active on return (no
; emms is issued here).
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;input
    mov         rdi,        arg(1)          ;output

    lea         rdx,        [GLOBAL(dct_const_mmx)]
    movsxd      rax,        dword ptr arg(2) ;pitch

    lea         rcx,        [rsi + rax*2]

    ; read the four input rows
    movq        mm0,        [rsi]
    movq        mm1,        [rsi + rax]

    movq        mm2,        [rcx]
    movq        mm3,        [rcx + rax]

    ; scale up by 8 (shift left by 3) for extra precision
    psllw       mm0,        3
    psllw       mm1,        3

    psllw       mm2,        3
    psllw       mm3,        3

    ; first pass: gather element k of every row into mmk, then transform
    ; the four rows in parallel
    FDCT4_TRANSPOSE_MMX
    FDCT4_BUTTERFLY_MMX

    ; second pass: transpose back so mmk holds row k, then transform the
    ; four columns in parallel
    FDCT4_TRANSPOSE_MMX
    FDCT4_BUTTERFLY_MMX

    ; undo the input scaling with rounding: (x + 4) >> 3
    pcmpeqw     mm4,        mm4             ; all ones
    psrlw       mm4,        15              ; 1 in every word
    psllw       mm4,        2               ; 4 in every word

    paddw       mm0,        mm4
    paddw       mm1,        mm4
    paddw       mm2,        mm4
    paddw       mm3,        mm4

    psraw       mm0,        3
    psraw       mm1,        3
    psraw       mm2,        3
    psraw       mm3,        3

    movq        [rdi   ],   mm0
    movq        [rdi+ 8],   mm1
    movq        [rdi+16],   mm2
    movq        [rdi+24],   mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
;
; Forward DCT of two horizontally adjacent 4x4 blocks (an 8x4 region).
;   input  - first row of the region; each following row is `pitch` bytes
;            on.  Rows are read with movdqa, so every row address must be
;            16-byte aligned.
;   output - receives 32 coefficients: the left block's four rows at byte
;            offsets 0..31, then the right block's four rows at 32..63
;   pitch  - distance between input rows, in bytes
; Uses xmm0-xmm5 only.
sym(vp8_short_fdct8x4_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;input
    mov         rdi,        arg(1)          ;output

    lea         rdx,        [GLOBAL(dct_const_xmm)]
    movsxd      rax,        dword ptr arg(2) ;pitch

    lea         rcx,        [rsi + rax*2]

    ; read the four input rows (row 1 goes to xmm2, row 2 to xmm4, which is
    ; the register layout the transpose expects)
    movdqa      xmm0,       [rsi]
    movdqa      xmm2,       [rsi + rax]

    movdqa      xmm4,       [rcx]
    movdqa      xmm3,       [rcx + rax]

    ; scale up by 8 (shift left by 3) for extra precision
    psllw       xmm0,       3
    psllw       xmm2,       3

    psllw       xmm4,       3
    psllw       xmm3,       3

    ; first pass: gather element k of every row (per block) into xmmk, then
    ; transform the rows of both blocks in parallel
    FDCT4_TRANSPOSE_XMM
    FDCT4_BUTTERFLY_XMM

    ; second pass: move coefficients 1 and 2 into the registers the
    ; transpose expects, transpose back so xmmk holds row k of both blocks,
    ; then transform the columns in parallel
    movdqa      xmm4,       xmm2
    movdqa      xmm2,       xmm1

    FDCT4_TRANSPOSE_XMM
    FDCT4_BUTTERFLY_XMM

    ; undo the input scaling with rounding: (x + 4) >> 3
    pcmpeqw     xmm4,       xmm4            ; all ones
    psrlw       xmm4,       15              ; 1 in every word
    psllw       xmm4,       2               ; 4 in every word

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    psraw       xmm0,       3
    psraw       xmm1,       3
    psraw       xmm2,       3
    psraw       xmm3,       3

    ; low halves: rows 0..3 of the left block
    movq        QWORD PTR[rdi   ],  xmm0
    movq        QWORD PTR[rdi+ 8],  xmm1
    movq        QWORD PTR[rdi+16],  xmm2
    movq        QWORD PTR[rdi+24],  xmm3

    psrldq      xmm0,       8
    psrldq      xmm1,       8
    psrldq      xmm2,       8
    psrldq      xmm3,       8

    ; high halves: rows 0..3 of the right block
    movq        QWORD PTR[rdi+32],  xmm0
    movq        QWORD PTR[rdi+40],  xmm1
    movq        QWORD PTR[rdi+48],  xmm2
    movq        QWORD PTR[rdi+56],  xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; NOTE: dct1st_stage_rounding_mmx, dct2nd_stage_rounding_mmx and dct_matrix
; are not referenced by the two routines above; they are kept unchanged.

;static const unsigned int dct1st_stage_rounding_mmx[2] =
align 16
dct1st_stage_rounding_mmx:
    times 2 dd 8192


;static const unsigned int dct2nd_stage_rounding_mmx[2] =
align 16
dct2nd_stage_rounding_mmx:
    times 2 dd 32768


;static const short dct_matrix[4][4]=
align 16
dct_matrix:
    times 4 dw 23170

    dw  30274
    dw  12540
    dw -12540
    dw -30274

    dw 23170
    times 2 dw -23170
    dw 23170

    dw  12540
    dw -30274
    dw  30274
    dw -12540


; One row per constant, replicated across every word lane:
; row 0 = 0, row 1 = c1, row 2 = c2, row 3 = c3.

;static const unsigned short dct_const_mmx[4 * 4]=
align 16
dct_const_mmx:
    times 4 dw 0
    times 4 dw x_c1
    times 4 dw x_c2
    times 4 dw x_c3


;static const unsigned short dct_const_xmm[8 * 4]=
align 16
dct_const_xmm:
    times 8 dw 0
    times 8 dw x_c1
    times 8 dw x_c2
    times 8 dw x_c3