; from a new GOGO-no-coda (1999/09)
; Copyright (C) 1999 shigeo
; special thanks to Keiichi SAKAI, URURI
; hacked and back-ported to LAME
;	 by Takehiro TOMINAGA Nov 2000
;
; Notes for readers
; -----------------
; * Two procedures are defined here, fht_3DN and fht_E3DN.  Both take
;   (float *fz, int nn) on the stack and transform fz in place; fht_E3DN
;   follows the same loop structure but uses pfpnacc and pswapd.
; * r0..r5, pushd/popd, pmov, pmovd, pupldq, puphdq, loopalign, proc/endproc,
;   get_pc.bp, PIC_BASE and PIC_EBP_REL are not defined in this file;
;   presumably they come from nasm.h -- confirm there.  The lane comments
;   below assume pmov/pmovd are 64-bit/32-bit MMX moves and pupldq/puphdq
;   are punpckldq/punpckhdq.
; * Lane notation: a comment of the form "X | Y" lists the HIGH 32-bit lane
;   of the MMX register first and the LOW lane second.
; * Byte offsets used for indexing (r4 = kx in bytes):
;	[p+r4*2] = p[k1]   [p+r4*4] = p[k2]   [p+r2*2] = p[k3]   r4*8 = k4
;   where r2 = 3*r4.
; * MMX/3DNow! instructions do not modify EFLAGS, which is why several
;   cmp instructions are scheduled a few SIMD instructions ahead of the
;   jb that consumes them.

%include "nasm.h"

	globaldef fht_3DN
; NOTE(review): no matching globaldef for fht_E3DN is visible in this file;
; whether `proc` exports the symbol by itself depends on nasm.h -- confirm.

	segment_data
	align 16
; costab layout (every value is stored twice so a whole row can be used as
; a two-lane multiplier):
;   +0   sign-bit mask for the low lane (low dword 0x80000000, high dword 0)
;   +8   1.414213562 (used as SQRT2 in the .do2 loops)
;   +16  onward: rows of {a, a} followed by {b, b}; one a/b row pair is
;        consumed per .do1 pass.  The values correspond to cos/sin of
;        pi/8, pi/32, pi/128 and pi/512.
costab	dd	0x80000000, 0
	dd 1.414213562,1.414213562
	dd	9.238795283293805e-01, 9.238795283293805e-01
	dd	3.826834424611044e-01, 3.826834424611044e-01
	dd	9.951847264044178e-01, 9.951847264044178e-01
	dd	9.801714304836734e-02, 9.801714304836734e-02
	dd	9.996988186794428e-01, 9.996988186794428e-01
	dd	2.454122920569705e-02, 2.454122920569705e-02
	dd	9.999811752815535e-01, 9.999811752815535e-01
	dd	6.135884819898878e-03, 6.135884819898878e-03
; {0.0, 1.0}: subtracted in .for so that only the high lane has 1.0 removed.
D_1_0_0_0	dd	0.0 , 1.0

	segment_code

PIC_OFFSETTABLE


;void fht_3DN(float *fz, int nn);
;
; In-place transform of the float array fz using 3DNow! instructions.
; The end pointer is computed as fz + nn*8 bytes, i.e. 2*nn floats are
; processed.
;
; Stack frame after the prologue (assuming pushd pushes four dwords):
;	[esp+ 0]  8 bytes  spilled copy of the "s2 | c2" pair
;	[esp+ 8]  8 bytes  spilled copy of the "-c2 | s2" pair
;	[esp+16]  4 bytes  fn, the end pointer of fz
;	[esp+40]  fz       (20 locals + 16 saved regs + 4 return address)
;	[esp+44]  nn
;
; Register roles inside the loops:
;	r0 = fi   r1 = gi   r2 = 3*kx (bytes)   r3 = tri (costab row)
;	r4 = kx (bytes)     r5 = i (bytes)
;	mm6/mm7 = current rotation pair / sign mask (see comments in .for)

proc	fht_3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			; locals, see frame layout above

	call	get_pc.bp
	add	ebp, PIC_BASE()		; ebp = base for PIC_EBP_REL() addressing

	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]	;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4		; keep fn in the frame; r4 is reused below
	mov	r4, 8			;kx = k1/2

	pmov	mm7, [r3]		; mm7 = 0 | 0x80000000 (low-lane sign mask)

	loopalign 16
.do1
	; One pass per stage; kx is multiplied by 4 at the bottom of the loop.
	lea	r3, [r3+16]		;tri += 2;  (16 bytes: each value is stored twice)
	pmov	mm6, [PIC_EBP_REL(costab+8)]	; mm6 = SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4			;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
	;f
	; Note: mm2/mm3 hold fi[k3]/fi[k2] (not k2/k3), so the difference lane
	; comes out negated; the two high-lane stores below are exchanged to
	; compensate.
	pmov	mm0, [r0]		;fi[0]
	pmov	mm1, [r0+r4*2]		;fi[k1]
	pmov	mm2, [r0+r2*2]		;fi[k3]
	pmov	mm3, [r0+r4*4]		;fi[k2]

	pupldq	mm0, mm0		;fi0 | fi0
	pupldq	mm1, mm1		;fi1 | fi1
	pupldq	mm2, mm2		;fi3 | fi3
	pupldq	mm3, mm3		;fi2 | fi2

	pxor	mm1, mm7		;fi1 | -fi1
	pxor	mm3, mm7		;fi2 | -fi2

	pfsub	mm0, mm1		;fi0-fi1 | fi0+fi1 = f1 | f0
	pfsub	mm2, mm3		;fi3-fi2 | fi3+fi2 = -f3 | f2   (f3 = fi2-fi3)

	pmov	mm4, mm0
	pfadd	mm0, mm2		;f1-f3 | f0+f2 = new fi[k3] | new fi[0]
	pfsub	mm4, mm2		;f1+f3 | f0-f2 = new fi[k1] | new fi[k2]

	pmovd	[r0], mm0		;fi[0]
	puphdq	mm0, mm0
	pmovd	[r0+r4*4], mm4		;fi[k2]
	puphdq	mm4, mm4

	pmovd	[r0+r4*2], mm4		;fi[k1]
	pmovd	[r0+r2*2], mm0		;fi[k3]
	lea	r0, [r0+r4*8]		; fi += k4

	;g
	pmov	mm0, [r1]		;gi0
	pmov	mm1, [r1+r4*2]		;gi1
	pmov	mm2, [r1+r4*4]		;gi2
	pmov	mm3, [r1+r2*2]		;gi3

	pupldq	mm1, mm1		;gi1 | gi1
	pupldq	mm0, mm0		;gi0 | gi0
	pupldq	mm2, mm3		;gi3 | gi2

	pxor	mm1, mm7		;gi1 | -gi1

	pfsub	mm0, mm1		;gi0-gi1|gi0+gi1 = g1 | g0
	pfmul	mm2, mm6		;gi3*SQRT2|gi2*SQRT2 = g3 | g2

	pmov	mm4, mm0
	pfadd	mm0, mm2		;g1+g3|g0+g2 = gi1 | gi0
	pfsub	mm4, mm2		;g1-g3|g0-g2 = gi3 | gi2

	pmovd	[r1], mm0		;gi[0]
	puphdq	mm0, mm0
	pmovd	[r1+r4*4], mm4		;gi[k2]
	puphdq	mm4, mm4

	cmp	r0, [esp + 16]		; fi < fn ?  (flags survive the stores below)
	pmovd	[r1+r4*2], mm0		;gi[k1]
	pmovd	[r1+r2*2], mm4		;gi[k3]

	jb	near .do2

	; r5 is still 4 here, so this 8-byte load starts 4 bytes into the row:
	; low lane = second copy of the row's first value, high lane = first
	; copy of the next value.
	pmov	mm6, [r3+r5]		; this is not aligned address!!

	loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = 0 | 0x80000000
;
; NOTE(review): the two lanes of mm6 are exchanged by the puphdq below and
; exchanged back at the bottom of this loop, yet both the comment above and
; the one after the swap label mm6 "c1 | s1".  The lane-by-lane comments
; here follow the original naming at each point; confirm which name belongs
; to which lane before relying on them.
;
	pmov	mm1, mm6
	mov	r0, [esp+40]		; fz
	puphdq	mm1, mm1		; c1 | c1
	lea	r1, [r0+r4*2]		; r1 = fz + k1
	pfadd	mm1, mm1		; c1+c1 | c1+c1
	pfmul	mm1, mm6		; 2*c1*c1 | 2*c1*s1
	pfsub	mm1, [PIC_EBP_REL(D_1_0_0_0)]	; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

	pmov	mm0, mm1
	pxor	mm7, mm6		; mm7 = mm6 with its low lane negated

	; Swap the lanes of mm0 and mm6 using mm2/mm3 as scratch.
	pupldq	mm2, mm0		; s2 | **
	pupldq	mm3, mm6		; (old low lane of mm6) | **
	puphdq	mm0, mm2		; s2 | -c2
	puphdq	mm6, mm3		; mm6 with its two lanes exchanged

	pxor	mm0, [PIC_EBP_REL(costab)]	; s2 | c2  (low-lane sign flipped)

; mm0 =  s2| c2
; mm1 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	; mm0/mm1 are clobbered inside .do3, so keep copies in the frame.
	pmov	[esp], mm0
	pmov	[esp+8], mm1

	sub	r1, r5			;r1 = gi
	add	r0, r5			;r0 = fi

	loopalign 16
.do3:
	pmov	mm2, [r0+r4*2]		; fi[k1]
	pmov	mm4, [r1+r4*2]		; gi[k1]
	pmov	mm3, [r0+r2*2]		; fi[k3]
	pmov	mm5, [r1+r2*2]		; gi[k3]

	pupldq	mm2, mm2		; fi1 | fi1
	pupldq	mm4, mm4		; gi1 | gi1
	pupldq	mm3, mm3		; fi3 | fi3
	pupldq	mm5, mm5		; gi3 | gi3

	pfmul	mm2, mm0		; s2 * fi1 | c2 * fi1
	pfmul	mm4, mm1		;-c2 * gi1 | s2 * gi1
	pfmul	mm3, mm0		; s2 * fi3 | c2 * fi3
	pfmul	mm5, mm1		;-c2 * gi3 | s2 * gi3

	pfadd	mm2, mm4		;b | a
	pfadd	mm3, mm5		;d | c

	pmov	mm0, [r0]		; fi[0]
	pmov	mm4, [r1]		; gi[0]
	pmov	mm1, [r0+r4*4]		; fi[k2]
	pmov	mm5, [r1+r4*4]		; gi[k2]

	pupldq	mm0, mm4		;gi0 | fi0
	pupldq	mm1, mm5		;gi2 | fi2

	pmov	mm4, mm2
	pmov	mm5, mm3

	pfadd	mm2, mm0		;g0 | f0
	pfadd	mm3, mm1		;g2 | f2

	pfsub	mm0, mm4		;g1 | f1
	pfsub	mm1, mm5		;g3 | f3

	pmov	mm4, mm3
	pmov	mm5, mm1

	pupldq	mm4, mm4		;f2 | f2
	puphdq	mm5, mm5		;g3 | g3
	puphdq	mm3, mm3		;g2 | g2
	pupldq	mm1, mm1		;f3 | f3

	pfmul	mm4, mm6		;f2 * c1 | f2 * s1
	pfmul	mm5, mm7		;g3 * s1 | g3 *-c1
	pfmul	mm3, mm6		;g2 * c1 | g2 * s1
	pfmul	mm1, mm7		;f3 * s1 | f3 *-c1

	pfadd	mm4, mm5		;a | b
	pfsub	mm3, mm1		;d | c

	pmov	mm5, mm2
	pmov	mm1, mm0

	pupldq	mm2, mm2		;f0 | f0
	pupldq	mm0, mm0		;f1 | f1

	puphdq	mm1, mm2		;f0 | g1
	puphdq	mm5, mm0		;f1 | g0

	pmov	mm2, mm4
	pmov	mm0, mm3

	pfadd	mm4, mm1		;fi0 | gi1
	pfadd	mm3, mm5		;fi1 | gi0
	pfsub	mm1, mm2		;fi2 | gi3
	pfsub	mm5, mm0		;fi3 | gi2

	; Low lanes go to gi[], then the high lanes are brought down for fi[].
	pmovd	[r1+r4*2], mm4		;gi[k1]
	puphdq	mm4, mm4
	pmovd	[r1], mm3		;gi[0]
	puphdq	mm3, mm3
	pmovd	[r1+r2*2], mm1		;gi[k3]
	puphdq	mm1, mm1
	pmovd	[r1+r4*4], mm5		;gi[k2]
	puphdq	mm5, mm5

	pmovd	[r0], mm4		;fi[0]
	pmovd	[r0+r4*2], mm3		;fi[k1]
	pmovd	[r0+r4*4], mm1		;fi[k2]
	pmovd	[r0+r2*2], mm5		;fi[k3]

	lea	r0, [r0+r4*8]		; fi += k4
	lea	r1, [r1+r4*8]		; gi += k4
	cmp	r0, [esp + 16]		; fi < fn ?
	pmov	mm0, [esp]		; reload s2 | c2
	pmov	mm1, [esp+8]		; reload -c2 | s2

	jb	near .do3

	add	r5, 4			; i++
; mm6 = c1| s1
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
	pfmul	mm6, [r3]		; c1*a | s1*a
	pfmul	mm7, [r3+8]		; s1*b |-c1*b
	cmp	r5, r4			; i < kx ?  (flags survive the SIMD ops below)

	pfsub	mm6, mm7		; c1*a-s1*b | s1*a+c1*b
	pupldq	mm7,mm6			; mm7 used as scratch ...
	puphdq	mm6,mm7			; ... to exchange the two lanes of mm6
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the low-lane sign mask
	jb	near .for

	mov	r0, [esp+40]		;fi
	cmp	r4, [esp+40+4]		; kx (bytes) < nn ?
	lea	r4, [r4*4]		;kx *= 4  (lea leaves the flags from cmp intact)

	jb	near .do1
.exitttt				; not referenced by any jump in this file
	femms				; leave MMX/3DNow! state before returning
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc


;void fht_E3DN(float *fz, int nn);
;
; Same interface, stack frame and loop structure as fht_3DN above, but the
; butterflies use pfpnacc and the lane swaps use pswapd (instructions from
; AMD's extended 3DNow! set), so it must only be called on CPUs that
; support them.
;
; Stack frame after the prologue (assuming pushd pushes four dwords):
;	[esp+ 0]  8 bytes  spilled copy of the "s2 | c2" pair
;	[esp+ 8]  8 bytes  spilled copy of the "-c2 | s2" pair
;	[esp+16]  4 bytes  fn, the end pointer of fz
;	[esp+40]  fz
;	[esp+44]  nn
;
; pfpnacc d, d leaves (lo+hi | lo-hi) in d.

proc	fht_E3DN

	pushd	ebp, ebx, esi, edi

	sub	esp, 20			; locals, see frame layout above

	call	get_pc.bp
	add	ebp, PIC_BASE()		; ebp = base for PIC_EBP_REL() addressing

	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [PIC_EBP_REL(costab)]	;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4		; keep fn in the frame; r4 is reused below
	mov	r4, 8			;kx = k1/2

	pmov	mm7, [r3]		; mm7 = 0 | 0x80000000 (low-lane sign mask)

	loopalign 16
.do1
	; One pass per stage; kx is multiplied by 4 at the bottom of the loop.
	lea	r3, [r3+16]		;tri += 2;  (16 bytes: each value is stored twice)
	pmov	mm6, [PIC_EBP_REL(costab+8)]	; mm6 = SQRT2 | SQRT2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4			;i = 1*fsize

	loopalign 16
.do2:
	lea	r1, [r0+r4]		;gi = fi + kx
;f
	pmov	mm0, [r0]		; X | fi0
	pmov	mm1, [r0+r4*4]		; X | fi2
	pupldq	mm0, [r0+r4*2]		;fi1 | fi0
	pupldq	mm1, [r0+r2*2]		;fi3 | fi2
	pfpnacc	mm0, mm0		;fi0+fi1 | fi0-fi1 = f0|f1
	pfpnacc	mm1, mm1		;fi2+fi3 | fi2-fi3 = f2|f3

	pmov	mm2, mm0
	pfadd	mm0, mm1		;f0+f2|f1+f3 = fi0 | fi1
	pfsub	mm2, mm1		;f0-f2|f1-f3 = fi2 | fi3

	pmovd	[r0+r4*2], mm0		;fi[k1]
	pmovd	[r0+r2*2], mm2		;fi[k3]

	puphdq	mm0, mm0
	puphdq	mm2, mm2
	pmovd	[r0], mm0		;fi[0]
	pmovd	[r0+r4*4], mm2		;fi[k2]

	lea	r0, [r0+r4*8]		; fi += k4
;g
	pmov	mm3, [r1]		; gi0
	pmov	mm4, [r1+r2*2]		; gi3
	pupldq	mm3, [r1+r4*2]		;gi1|gi0
	pupldq	mm4, [r1+r4*4]		;gi2|gi3

	pfpnacc	mm3, mm3		;gi0+gi1 |gi0-gi1 = f0|f1
	pfmul	mm4, mm6		;gi2*SQRT2|gi3*SQRT2 = f2|f3

	pmov	mm5, mm3
	pfadd	mm3, mm4		;f0+f2|f1+f3
	pfsub	mm5, mm4		;f0-f2|f1-f3

	cmp	r0, [esp + 16]		; fi < fn ?  (flags survive the stores below)
	pmovd	[r1+r4*2], mm3		;gi[k1]
	pmovd	[r1+r2*2], mm5		;gi[k3]
	puphdq	mm3, mm3
	puphdq	mm5, mm5
	pmovd	[r1], mm3		;gi[0]
	pmovd	[r1+r4*4], mm5		;gi[k2]

	jb	near .do2

	; r5 is still 4 here, so this 8-byte load starts 4 bytes into the row:
	; low lane = second copy of the row's first value, high lane = first
	; copy of the next value.
	pmov	mm6, [r3+r5]		; this is not aligned address!!

	loopalign 16
.for:
;
; mm6 = c1 | s1
; mm7 = 0 | 0x80000000
;
; NOTE(review): pswapd below exchanges the lanes of mm6 and the pswapd at
; the bottom of this loop exchanges them back, yet both this comment and
; the one after the swap label mm6 "c1 | s1".  Confirm which name belongs
; to which lane before relying on the names.
;
	pmov	mm5, mm6
	mov	r0, [esp+40]		; fz
	puphdq	mm5, mm5		; c1 | c1
	lea	r1, [r0+r4*2]		; r1 = fz + k1
	pfadd	mm5, mm5		; c1+c1 | c1+c1
	pfmul	mm5, mm6		; 2*c1*c1 | 2*c1*s1
	pfsub	mm5, [PIC_EBP_REL(D_1_0_0_0)]	; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2

	pswapd	mm4, mm5		; s2 |-c2
	pxor	mm4, mm7		; s2 | c2  (low-lane sign flipped)
	pxor	mm7, mm6		; mm7 = mm6 with its low lane negated
	pswapd	mm6, mm6		; exchange the two lanes of mm6

; mm4 =  s2| c2
; mm5 = -c2| s2
; mm6 =  c1| s1
; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)

	; mm4/mm5 are clobbered inside .do3, so keep copies in the frame.
	pmov	[esp], mm4
	pmov	[esp+8], mm5

	sub	r1, r5			;r1 = gi
	add	r0, r5			;r0 = fi

	loopalign 16
.do3:
	pmov	mm0, [r0+r2*2]		; fi[k3]
	pmov	mm2, [r1+r2*2]		; gi[k3]
	pmov	mm1, [r0+r4*2]		; fi[k1]
	pmov	mm3, [r1+r4*2]		; gi[k1]

	pupldq	mm0, mm0		; fi3 | fi3
	pupldq	mm2, mm2		; gi3 | gi3
	pupldq	mm1, mm1		; fi1 | fi1
	pupldq	mm3, mm3		; gi1 | gi1

	pfmul	mm0, mm4		; s2 * fi3 | c2 * fi3
	pfmul	mm2, mm5		;-c2 * gi3 | s2 * gi3
	pfmul	mm1, mm4		; s2 * fi1 | c2 * fi1
	pfmul	mm3, mm5		;-c2 * gi1 | s2 * gi1

	pfadd	mm0, mm2		;d | c
	pfadd	mm1, mm3		;b | a

	pmov	mm2, [r0+r4*4]		;fi2
	pupldq	mm3, [r1+r4*4]		;gi2 | -
	pmov	mm4, [r0]		;fi0
	pupldq	mm5, [r1]		;gi0 | -

	pupldq	mm2, mm0		;c | fi2
	puphdq	mm3, mm0		;d | gi2
	pupldq	mm4, mm1		;a | fi0
	puphdq	mm5, mm1		;b | gi0

	pfpnacc	mm2, mm2		;f2 | f3
	pfpnacc	mm3, mm3		;g2 | g3
	pfpnacc	mm4, mm4		;f0 | f1
	pfpnacc	mm5, mm5		;g0 | g1

	pmov	mm0, mm2
	pmov	mm1, mm3
	pupldq	mm2, mm2		;f3 | f3
	pupldq	mm3, mm3		;g3 | g3
	puphdq	mm0, mm0		;f2 | f2
	puphdq	mm1, mm1		;g2 | g2

	pswapd	mm4, mm4		;f1 | f0
	pswapd	mm5, mm5		;g1 | g0

	pfmul	mm0, mm7		;f2 * s1 | f2 *-c1
	pfmul	mm3, mm6		;g3 * c1 | g3 * s1
	pfmul	mm1, mm6		;g2 * c1 | g2 * s1
	pfmul	mm2, mm7		;f3 * s1 | f3 *-c1

	pfsub	mm0, mm3		; b |-a
	pfsub	mm1, mm2		; d | c

	pmov	mm2, mm5
	pmov	mm3, mm4
	pupldq	mm4, mm0		;-a | f0
	pupldq	mm5, mm1		; c | g0
	puphdq	mm2, mm0		; b | g1
	puphdq	mm3, mm1		; d | f1

	pfpnacc	mm4, mm4		;fi2 | fi0
	pfpnacc	mm5, mm5		;gi0 | gi2
	pfpnacc	mm2, mm2		;gi1 | gi3
	pfpnacc	mm3, mm3		;fi1 | fi3

	; Low lanes are stored first, then the high lanes are brought down.
	pmovd	[r0], mm4		;fi[0]
	pmovd	[r1+r4*4], mm5		;gi[k2]
	pmovd	[r1+r2*2], mm2		;gi[k3]
	pmovd	[r0+r2*2], mm3		;fi[k3]

	puphdq	mm4, mm4
	puphdq	mm5, mm5
	puphdq	mm2, mm2
	puphdq	mm3, mm3
	pmovd	[r0+r4*4], mm4		;fi[k2]
	pmovd	[r1], mm5		;gi[0]
	pmovd	[r1+r4*2], mm2		;gi[k1]
	pmovd	[r0+r4*2], mm3		;fi[k1]

	lea	r0, [r0+r4*8]		; fi += k4
	lea	r1, [r1+r4*8]		; gi += k4
	cmp	r0, [esp + 16]		; fi < fn ?
	pmov	mm4, [esp]		; reload s2 | c2
	pmov	mm5, [esp+8]		; reload -c2 | s2

	jb	near .do3

	add	r5, 4			; i++
; mm6 = c1| s1
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
	pfmul	mm6, [r3]		; c1*a | s1*a
	pfmul	mm7, [r3+8]		; s1*b |-c1*b
	cmp	r5, r4			; i < kx ?  (flags survive the SIMD ops below)

	pfsub	mm6, mm7		; c1*a-s1*b | s1*a+c1*b
	pswapd	mm6, mm6		; s1*a+c1*b | c1*a-s1*b  (undoes the swap done in .for)
	pmov	mm7, [PIC_EBP_REL(costab)]	; restore the low-lane sign mask
	jb	near .for

	mov	r0, [esp+40]		;fi
	cmp	r4, [esp+40+4]		; kx (bytes) < nn ?
	lea	r4, [r4*4]		;kx *= 4  (lea leaves the flags from cmp intact)

	jb	near .do1
.exitttt				; not referenced by any jump in this file
	femms				; leave MMX/3DNow! state before returning
	add	esp,20
	popd	ebp, ebx, esi, edi
endproc