1/* 2 * MIPS DSPr2 optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. 5 * All Rights Reserved. 6 * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> 7 * Darko Laus <darko.laus@imgtec.com> 8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 9 * 10 * This software is provided 'as-is', without any express or implied 11 * warranty. In no event will the authors be held liable for any damages 12 * arising from the use of this software. 13 * 14 * Permission is granted to anyone to use this software for any purpose, 15 * including commercial applications, and to alter it and redistribute it 16 * freely, subject to the following restrictions: 17 * 18 * 1. The origin of this software must not be misrepresented; you must not 19 * claim that you wrote the original software. If you use this software 20 * in a product, an acknowledgment in the product documentation would be 21 * appreciated but is not required. 22 * 2. Altered source versions must be plainly marked as such, and must not be 23 * misrepresented as being the original software. 24 * 3. This notice may not be removed or altered from any source distribution. 
 */

#include "jsimd_dspr2_asm.h"


/*****************************************************************************/
LEAF_DSPR2(jsimd_c_null_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows
 * 20(sp) = cinfo->num_components
 *
 * Null conversion for compression
 *
 * NOTE: the stack arguments are read at 24(sp)/28(sp) below because
 * SAVE_REGS_ON_STACK pushes 8 bytes (s0, s1) on top of the caller's frame.
 */
    SAVE_REGS_ON_STACK 8, s0, s1

    lw    t9, 24(sp)              // t9 = num_rows
    lw    s0, 28(sp)              // s0 = cinfo->num_components
    andi  t0, a0, 3               // t0 = cinfo->image_width & 3 (residual)
    beqz  t0, 4f                  // no residual: width-multiple-of-4 path
    nop

/* Path for image_width not a multiple of 4: per-row loop */
0:
    addiu t9, t9, -1              // --num_rows
    bltz  t9, 7f                  // all rows done
    li    t1, 0                   // t1 = ci, component index (delay slot)

/* Per-component loop */
1:
    sll   t3, t1, 2
    lwx   t5, t3(a2)              // t5 = outptr = output_buf[ci]
    lw    t2, 0(a1)               // t2 = inptr = *input_buf
    sll   t4, a3, 2
    lwx   t5, t4(t5)              // t5 = outptr = output_buf[ci][output_row]
    addu  t2, t2, t1              // inptr += ci (first sample of component)
    addu  s1, t5, a0              // s1 = outptr + image_width (row end)
    addu  t6, t5, t0              // t6 = outptr + residual (head-loop end)

/* Head loop: copy the 1..3 residual samples one at a time */
2:
    lbu   t3, 0(t2)
    addiu t5, t5, 1
    sb    t3, -1(t5)
    bne   t6, t5, 2b
    addu  t2, t2, s0              // inptr += num_components (delay slot)

/* Main loop: copy four samples per iteration */
3:
    lbu   t3, 0(t2)
    addu  t4, t2, s0
    addu  t7, t4, s0
    addu  t8, t7, s0
    addu  t2, t8, s0              // advance inptr by 4*num_components
    lbu   t4, 0(t4)
    lbu   t7, 0(t7)
    lbu   t8, 0(t8)
    addiu t5, t5, 4
    sb    t3, -4(t5)
    sb    t4, -3(t5)
    sb    t7, -2(t5)
    bne   s1, t5, 3b
    sb    t8, -1(t5)              // store 4th sample (delay slot)
    addiu t1, t1, 1               // ++ci
    bne   t1, s0, 1b
    nop
    addiu a1, a1, 4               // ++input_buf
    bgez  t9, 0b
    addiu a3, a3, 1               // ++output_row (delay slot)
    b     7f
    nop

/* Path for image_width a multiple of 4: per-row loop */
4:
    addiu t9, t9, -1              // --num_rows
    bltz  t9, 7f
    li    t1, 0                   // t1 = ci (delay slot)

/* Per-component loop */
5:
    sll   t3, t1, 2
    lwx   t5, t3(a2)              // t5 = outptr = output_buf[ci]
    lw    t2, 0(a1)               // t2 = inptr = *input_buf
    sll   t4, a3, 2
    lwx   t5, t4(t5)              // t5 = outptr = output_buf[ci][output_row]
    addu  t2, t2, t1              // inptr += ci
    addu  s1, t5, a0              // s1 = outptr + image_width (row end)
    addu  t6, t5, t0              // t6 = outptr (t0 == 0 on this path)

/* Main loop: copy four samples per iteration */
6:
    lbu   t3, 0(t2)
    addu  t4, t2, s0
    addu  t7, t4, s0
    addu  t8, t7, s0
    addu  t2, t8, s0              // advance inptr by 4*num_components
    lbu   t4, 0(t4)
    lbu   t7, 0(t7)
    lbu   t8, 0(t8)
    addiu t5, t5, 4
    sb    t3, -4(t5)
    sb    t4, -3(t5)
    sb    t7, -2(t5)
    bne   s1, t5, 6b
    sb    t8, -1(t5)              // store 4th sample (delay slot)
    addiu t1, t1, 1               // ++ci
    bne   t1, s0, 5b
    nop
    addiu a1, a1, 4               // ++input_buf
    bgez  t9, 4b
    addiu a3, a3, 1               // ++output_row (delay slot)
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j     ra
    nop

END(jsimd_c_null_convert_dspr2)


/*****************************************************************************/
/*
 * jsimd_extrgb_ycc_convert_dspr2
 * jsimd_extbgr_ycc_convert_dspr2
 * jsimd_extrgbx_ycc_convert_dspr2
 * jsimd_extbgrx_ycc_convert_dspr2
 * jsimd_extxbgr_ycc_convert_dspr2
 * jsimd_extxrgb_ycc_convert_dspr2
 *
 * Colorspace conversion RGB -> YCbCr
 */

.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
                                            r_offs, g_offs, b_offs

/* Load one pixel's R, G, B samples and advance the input pointer */
.macro DO_RGB_TO_YCC r, g, b, inptr
    lbu   \r, \r_offs(\inptr)
    lbu   \g, \g_offs(\inptr)
    lbu   \b, \b_offs(\inptr)
    addiu \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows   (read at 48(sp): SAVE_REGS_ON_STACK pushes 32 bytes)
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw    t7, 48(sp)              // t7 = num_rows
    li    s0, 0x4c8b              // FIX(0.29900)
    li    s1, 0x9646              // FIX(0.58700)
    li    s2, 0x1d2f              // FIX(0.11400)
    li    s3, 0xffffd4cd          // -FIX(0.16874)
    li    s4, 0xffffab33          // -FIX(0.33126)
    li    s5, 0x8000              // FIX(0.50000)
    li    s6, 0xffff94d1          // -FIX(0.41869)
    li    s7, 0xffffeb2f          // -FIX(0.08131)
    li    t8, 0x807fff            // CBCR_OFFSET + ONE_HALF-1

/* Per-row loop */
0:
    addiu t7, -1                  // --num_rows
    lw    t6, 0(a1)               // t6 = input_buf[0]
    lw    t0, 0(a2)
    lw    t1, 4(a2)
    lw    t2, 8(a2)
    sll   t3, a3, 2
    lwx   t0, t3(t0)              // t0 = output_buf[0][output_row] (Y)
    lwx   t1, t3(t1)              // t1 = output_buf[1][output_row] (Cb)
    lwx   t2, t3(t2)              // t2 = output_buf[2][output_row] (Cr)

    addu  t9, t2, a0              // t9 = end address
    addiu a3, 1

/* Per-pixel loop: three fixed-point dot products in ac0 (Y), ac1 (Cb),
 * ac2 (Cr); negative coefficients work via two's-complement wraparound. */
1:
    DO_RGB_TO_YCC t3, t4, t5, t6  // t3 = r, t4 = g, t5 = b

    mtlo  s5, $ac0                // ac0 = ONE_HALF (Y rounding)
    mtlo  t8, $ac1                // ac1 = Cb offset + rounding
    mtlo  t8, $ac2                // ac2 = Cr offset + rounding
    maddu $ac0, s2, t5            // ac0 += FIX(0.11400) * b
    maddu $ac1, s5, t5            // ac1 += FIX(0.50000) * b
    maddu $ac2, s5, t3            // ac2 += FIX(0.50000) * r
    maddu $ac0, s0, t3            // ac0 += FIX(0.29900) * r
    maddu $ac1, s3, t3            // ac1 -= FIX(0.16874) * r
    maddu $ac2, s6, t4            // ac2 -= FIX(0.41869) * g
    maddu $ac0, s1, t4            // ac0 += FIX(0.58700) * g
    maddu $ac1, s4, t4            // ac1 -= FIX(0.33126) * g
    maddu $ac2, s7, t5            // ac2 -= FIX(0.08131) * b
    extr.w t3, $ac0, 16           // t3 = Y
    extr.w t4, $ac1, 16           // t4 = Cb
    extr.w t5, $ac2, 16           // t5 = Cr
    sb    t3, 0(t0)
    sb    t4, 0(t1)
    sb    t5, 0(t2)
    addiu t0, 1
    addiu t2, 1
    bne   t2, t9, 1b
    addiu t1, 1                   // (delay slot)
    bgtz  t7, 0b
    addiu a1, 4                   // ++input_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_\colorid\()_ycc_convert_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3,  0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3,  2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4,  0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4,  2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4,  3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4,  1, 2, 3


/*****************************************************************************/
/*
 * jsimd_ycc_extrgb_convert_dspr2
 * jsimd_ycc_extbgr_convert_dspr2
 * jsimd_ycc_extrgbx_convert_dspr2
 * jsimd_ycc_extbgrx_convert_dspr2
 * jsimd_ycc_extxbgr_convert_dspr2
 * jsimd_ycc_extxrgb_convert_dspr2
 *
 * Colorspace conversion YCbCr -> RGB
 */

.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
                                            r_offs, g_offs, b_offs, a_offs

/* Store one RGB(A) pixel and advance the output pointer */
.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r_offs(\outptr)
    sb    \scratch1, \g_offs(\outptr)
    sb    \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li    t0, 0xFF                // opaque alpha
    sb    t0, \a_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = input_row
 * a3     = output_buf
 * 16(sp) = num_rows   (read at 48(sp): SAVE_REGS_ON_STACK pushes 32 bytes)
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw    s1, 48(sp)              // s1 = num_rows
    li    t3, 0x8000              // ONE_HALF (rounding)
    li    t4, 0x166e9             // FIX(1.40200)
    li    t5, 0x1c5a2             // FIX(1.77200)
    li    t6, 0xffff492e          // -FIX(0.71414)
    li    t7, 0xffffa7e6          // -FIX(0.34414)
    repl.ph t8, 128               // t8 = 128|128 (bias for paired clip)

/* Per-row loop */
0:
    lw    s0, 0(a3)               // s0 = outptr
    lw    t0, 0(a1)
    lw    t1, 4(a1)
    lw    t2, 8(a1)
    sll   s5, a2, 2
    addiu s1, -1                  // --num_rows
    lwx   s2, s5(t0)              // s2 = input_buf[0][input_row] (Y)
    lwx   s3, s5(t1)              // s3 = input_buf[1][input_row] (Cb)
    lwx   s4, s5(t2)              // s4 = input_buf[2][input_row] (Cr)
    addu  t9, s2, a0              // t9 = end address
    addiu a2, 1

/* Per-pixel loop */
1:
    lbu   s7, 0(s4)               // cr
    lbu   s6, 0(s3)               // cb
    lbu   s5, 0(s2)               // y
    addiu s2, 1
    addiu s4, 1
    addiu s7, -128                // cr - 128
    addiu s6, -128                // cb - 128
    mul   t2, t7, s6              // t2 = -FIX(0.34414) * (cb-128)
    mul   t0, t6, s7              // Crgtab[cr]
    sll   s7, 15
    mulq_rs.w t1, t4, s7          // Crrtab[cr]
    sll   s6, 15
    addu  t2, t3                  // Cbgtab[cb] (+ ONE_HALF)
    addu  t2, t0

    mulq_rs.w t0, t5, s6          // Cbbtab[cb]
    sra   t2, 16
    addu  t1, s5                  // red   = y + cred
    addu  t2, s5                  // green = y + cgreen (add y)
    ins   t2, t1, 16, 16          // pack red|green
    subu.ph t2, t2, t8            // bias down by 128 ...
    addu  t0, s5                  // blue  = y + cblue
    shll_s.ph t2, t2, 8           // ... saturate ...
    subu  t0, 128
    shra.ph t2, t2, 8
    shll_s.w t0, t0, 24           // same saturating clip for blue (word)
    addu.ph t2, t2, t8            // ... bias back: clip red/green & store
    sra   t0, t0, 24
    sra   t1, t2, 16              // t1 = red; t2 low half = green
    addiu t0, 128

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne   s2, t9, 1b
    addiu s3, 1                   // advance Cb pointer (delay slot)
    bgtz  s1, 0b
    addiu a3, 4                   // ++output_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_ycc_\colorid\()_convert_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*-------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3,  0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3,  2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4,  0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4,  2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4,  3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4,  1, 2, 3, 0


/*****************************************************************************/
/*
 * jsimd_extrgb_gray_convert_dspr2
 * jsimd_extbgr_gray_convert_dspr2
 * jsimd_extrgbx_gray_convert_dspr2
 * jsimd_extbgrx_gray_convert_dspr2
 *
jsimd_extxbgr_gray_convert_dspr2
 * jsimd_extxrgb_gray_convert_dspr2
 *
 * Colorspace conversion RGB -> GRAY
 */

.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
                                             r_offs, g_offs, b_offs

/* Load one pixel's R, G, B samples and advance the input pointer */
.macro DO_RGB_TO_GRAY r, g, b, inptr
    lbu   \r, \r_offs(\inptr)
    lbu   \g, \g_offs(\inptr)
    lbu   \b, \b_offs(\inptr)
    addiu \inptr, \pixel_size
.endm

LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
/*
 * a0     = cinfo->image_width
 * a1     = input_buf
 * a2     = output_buf
 * a3     = output_row
 * 16(sp) = num_rows   (read at 48(sp): SAVE_REGS_ON_STACK pushes 32 bytes)
 */
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li    s0, 0x4c8b              // s0 = FIX(0.29900)
    li    s1, 0x9646              // s1 = FIX(0.58700)
    li    s2, 0x1d2f              // s2 = FIX(0.11400)
    li    s7, 0x8000              // s7 = FIX(0.50000) (rounding)
    lw    s6, 48(sp)              // s6 = num_rows
    andi  t7, a0, 3               // t7 = image_width & 3 (residual pixels)

/* Per-row loop */
0:
    addiu s6, -1                  // s6 = num_rows, decremented per row
    lw    t0, 0(a1)               // t0 = inptr
    lw    t1, 0(a2)
    sll   t3, a3, 2
    lwx   t1, t3(t1)              // t1 = output_buf[0][output_row]
    addiu a3, 1
    addu  t9, t1, a0              // t9 = row end
    subu  t8, t9, t7              // t8 = end of the 4-pixel groups
    beq   t1, t8, 2f              // fewer than 4 pixels in the row
    nop

/* Main loop: 4 pixels per iteration, alternating accumulators ac0/ac1 */
1:
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo  s7, $ac0                // ac0 = ONE_HALF
    maddu $ac0, s2, t5            // ac0 += FIX(0.11400) * b
    maddu $ac0, s1, t4            // ac0 += FIX(0.58700) * g
    maddu $ac0, s0, t3            // ac0 += FIX(0.29900) * r
    mtlo  s7, $ac1
    maddu $ac1, s2, s5
    maddu $ac1, s1, s4
    maddu $ac1, s0, s3
    extr.w t6, $ac0, 16           // t6 = gray(pixel 0)

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo  s7, $ac0
    maddu $ac0, s2, t5
    maddu $ac0, s1, t4
    extr.w t2, $ac1, 16           // t2 = gray(pixel 1)
    maddu $ac0, s0, t3
    mtlo  s7, $ac1
    maddu $ac1, s2, s5
    maddu $ac1, s1, s4
    maddu $ac1, s0, s3
    extr.w t5, $ac0, 16           // t5 = gray(pixel 2)
    sb    t6, 0(t1)
    sb    t2, 1(t1)
    extr.w t3, $ac1, 16           // t3 = gray(pixel 3)
    addiu t1, 4
    sb    t5, -2(t1)
    sb    t3, -1(t1)
    bne   t1, t8, 1b
    nop

2:
    beqz  t7, 4f                  // no residual pixels
    nop

/* Residual loop: one pixel per iteration */
3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo  s7, $ac0
    maddu $ac0, s2, t5
    maddu $ac0, s1, t4
    maddu $ac0, s0, t3
    extr.w t6, $ac0, 16
    sb    t6, 0(t1)
    addiu t1, 1
    bne   t1, t9, 3b
    nop

4:
    bgtz  s6, 0b
    addiu a1, 4                   // ++input_buf (delay slot)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j     ra
    nop
END(jsimd_\colorid\()_gray_convert_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*-------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3,  0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3,  2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4,  0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4,  2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4,  3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4,  1, 2, 3


/*****************************************************************************/
/*
 * jsimd_h2v2_merged_upsample_dspr2
 * jsimd_h2v2_extrgb_merged_upsample_dspr2
 * jsimd_h2v2_extrgbx_merged_upsample_dspr2
 * jsimd_h2v2_extbgr_merged_upsample_dspr2
 * jsimd_h2v2_extbgrx_merged_upsample_dspr2
 * jsimd_h2v2_extxbgr_merged_upsample_dspr2
 * jsimd_h2v2_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v2 upsample routines
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two horizontally adjacent RGB(A) pixels and advance outptr */
.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    \scratch0, 0xFF         // opaque alpha for both pixels
    sb    \scratch0, \a1_offs(\outptr)
    sb    \scratch0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store one RGB(A) pixel (no pointer advance) */
.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = cinfo->sample_range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw    t9, 56(sp)              // cinfo->sample_range_limit
    lw    v0, 0(a1)
    lw    v1, 4(a1)
    lw    t0, 8(a1)
    sll   t1, a2, 3
    addiu t2, t1, 4
    sll   t3, a2, 2
    lw    t4, 0(a3)               // t4 = output_buf[0]
    lwx   t1, t1(v0)              // t1 = input_buf[0][in_row_group_ctr*2]
    lwx   t2, t2(v0)              // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx   t5, t3(v1)              // t5 = input_buf[1][in_row_group_ctr]
    lwx   t6, t3(t0)              // t6 = input_buf[2][in_row_group_ctr]
    lw    t7, 4(a3)               // t7 = output_buf[1]
    li    s1, 0xe6ea
    addiu t8, s1, 0x7fff          // t8 = 0x166e9     [FIX(1.40200)]
    addiu s0, t8, 0x5eb9          // s0 = 0x1c5a2     [FIX(1.77200)]
    addiu s1, zero, 0xa7e6        // s1 = 0xffffa7e6  [-FIX(0.34414)]
    xori  s2, s1, 0xeec8          // s2 = 0xffff492e  [-FIX(0.71414)]
    srl   t3, a0, 1               // t3 = output_width / 2 (chroma columns)
    blez  t3, 2f
    addu  t0, t5, t3              // t0 = end address (delay slot)

/* Main loop: one Cb/Cr pair -> 2 pixels in each of the two output rows */
1:
    lbu   t3, 0(t5)               // t3 = cb
    lbu   s3, 0(t6)               // s3 = cr
    addiu t5, t5, 1
    addiu t3, t3, -128            // (cb - 128)
    addiu s3, s3, -128            // (cr - 128)
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3            // ac1 = green chroma contribution
    sll   s3, s3, 15
    sll   t3, t3, 15
    mulq_rs.w s4, t8, s3          // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w s5, $ac1, 16         // s5 = cgreen
    mulq_rs.w s6, s0, t3          // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu   v0, 0(t1)               // v0 = y (row 0, pixel 0)
    addiu t6, t6, 1
    addiu t1, t1, 2
    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3              // range_limit[y+cred]
    addu  s3, t9, s3              // range_limit[y+cgreen]
    addu  v1, t9, v1              // range_limit[y+cblue]
    lbu   AT, 0(t3)
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, -1(t1)              // v0 = y (row 0, pixel 1)
    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)               // v0 = y (row 1, pixel 0)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   AT, 0(t3)
    lbu   s7, 0(s3)
    lbu   ra, 0(v1)
    lbu   v0, 1(t2)               // v0 = y (row 1, pixel 1)
    addiu t2, t2, 2
    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne   t0, t5, 1b
    nop

/* Odd output_width: emit the final column (one pixel per output row) */
2:
    andi  t0, a0, 1
    beqz  t0, 4f
    lbu   t3, 0(t5)               // t3 = cb (delay slot; runs either way)
    lbu   s3, 0(t6)               // s3 = cr
    addiu t3, t3, -128            // (cb - 128)
    addiu s3, s3, -128            // (cr - 128)
    mult  $ac1, s1, t3
    madd  $ac1, s2, s3
    sll   s3, s3, 15
    sll   t3, t3, 15
    lbu   v0, 0(t1)               // v0 = y (row 0)
    extr_r.w s5, $ac1, 16         // s5 = cgreen
    mulq_rs.w s4, t8, s3          // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w s6, s0, t3          // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3              // y+cred
    addu  s3, t9, s3              // y+cgreen
    addu  v1, t9, v1              // y+cblue
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)
    lbu   v0, 0(t2)               // v0 = y (row 1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu  t3, v0, s4              // y+cred
    addu  s3, v0, s5              // y+cgreen
    addu  v1, v0, s6              // y+cblue
    addu  t3, t9, t3
    addu  s3, t9, s3
    addu  v1, t9, v1
    lbu   t3, 0(t3)
    lbu   s3, 0(s3)
    lbu   v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j     ra
    nop

END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6,  0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6,  2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8,  0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8,  2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8,  3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8,  1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v1_merged_upsample_dspr2
 * jsimd_h2v1_extrgb_merged_upsample_dspr2
 * jsimd_h2v1_extrgbx_merged_upsample_dspr2
 * jsimd_h2v1_extbgr_merged_upsample_dspr2
 * jsimd_h2v1_extbgrx_merged_upsample_dspr2
 * jsimd_h2v1_extxbgr_merged_upsample_dspr2
 * jsimd_h2v1_extxrgb_merged_upsample_dspr2
 *
 * Merged h2v1 upsample routines
 */

.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
                                           r1_offs, g1_offs, \
                                           b1_offs, a1_offs, \
                                           r2_offs, g2_offs, \
                                           b2_offs, a2_offs

/* Store two horizontally adjacent RGB(A) pixels and advance outptr */
.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
                           scratch5 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
    sb    \scratch3, \r2_offs(\outptr)
    sb    \scratch4, \g2_offs(\outptr)
    sb    \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF                // opaque alpha for both pixels
    sb    t0, \a1_offs(\outptr)
    sb    t0, \a2_offs(\outptr)
.endif
    addiu \outptr, \pixel_size
.endm

/* Store one RGB(A) pixel (no pointer advance) */
.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
/*
 * a0     = cinfo->output_width
 * a1     = input_buf
 * a2     = in_row_group_ctr
 * a3     = output_buf
 * 16(sp) = range_limit
 */
    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    li    t0, 0xe6ea
    lw    t1, 0(a1)               // t1 = input_buf[0]
    lw    t2, 4(a1)               // t2 = input_buf[1]
    lw    t3, 8(a1)               // t3 = input_buf[2]
    lw    t8, 56(sp)              // t8 = range_limit
    addiu s1, t0, 0x7fff          // s1 = 0x166e9     [FIX(1.40200)]
    addiu s2, s1, 0x5eb9          // s2 = 0x1c5a2     [FIX(1.77200)]
    addiu s0, t0, 0x9916          // s0 = 0x8000      (ONE_HALF)
    addiu s4, zero, 0xa7e6        // s4 = 0xffffa7e6  [-FIX(0.34414)]
    xori  s3, s4, 0xeec8          // s3 = 0xffff492e  [-FIX(0.71414)]
    srl   t0, a0, 1               // t0 = output_width / 2 (chroma columns)
    sll   t4, a2, 2
    lwx   s5, t4(t1)              // s5 = inptr0 (Y)
    lwx   s6, t4(t2)              // s6 = inptr1 (Cb)
    lwx   s7, t4(t3)              // s7 = inptr2 (Cr)
    lw    t7, 0(a3)               // t7 = outptr
    blez  t0, 2f
    addu  t9, s6, t0              // t9 = end address (delay slot)

/* Main loop: one Cb/Cr pair -> two output pixels */
1:
    lbu   t2, 0(s6)               // t2 = cb
    lbu   t0, 0(s7)               // t0 = cr
    lbu   t1, 0(s5)               // t1 = y
    addiu t2, t2, -128            // t2 = cb - 128
    addiu t0, t0, -128            // t0 = cr - 128
    mult  $ac1, s4, t2
    madd  $ac1, s3, t0            // ac1 = green chroma contribution
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0          // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w t5, $ac1, 16         // t5 = cgreen
    mulq_rs.w t6, s2, t2          // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu s7, s7, 1
    addiu s6, s6, 1
    addu  t2, t1, t0              // t2 = y + cred
    addu  t3, t1, t5              // t3 = y + cgreen
    addu  t4, t1, t6              // t4 = y + cblue
    addu  t2, t8, t2              // range_limit[...]
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t1, 1(s5)               // t1 = y (second pixel)
    lbu   v0, 0(t2)
    lbu   v1, 0(t3)
    lbu   ra, 0(t4)
    addu  t2, t1, t0
    addu  t3, t1, t5
    addu  t4, t1, t6
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne   t9, s6, 1b
    addiu s5, s5, 2               // advance Y pointer (delay slot)

/* Odd output_width: emit the final pixel */
2:
    andi  t0, a0, 1
    beqz  t0, 4f
    nop
3:
    lbu   t2, 0(s6)               // cb
    lbu   t0, 0(s7)               // cr
    lbu   t1, 0(s5)               // y
    addiu t2, t2, -128            // (cb - 128)
    addiu t0, t0, -128            // (cr - 128)
    mul   t3, s4, t2              // t3 = C3 * (cb-128)
    mul   t4, s3, t0              // t4 = C4 * (cr-128)
    sll   t0, t0, 15
    sll   t2, t2, 15
    mulq_rs.w t0, s1, t0          // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w t6, s2, t2          // (C2*cb + ONE_HALF)>> SCALEBITS
    addu  t3, t3, s0              // + ONE_HALF
    addu  t3, t4, t3
    sra   t5, t3, 16              // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu  t2, t1, t0              // y + cred
    addu  t3, t1, t5              // y + cgreen
    addu  t4, t1, t6              // y + cblue
    addu  t2, t8, t2
    addu  t3, t8, t3
    addu  t4, t8, t4
    lbu   t2, 0(t2)
    lbu   t3, 0(t3)
    lbu   t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j     ra
    nop

END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6,  0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6,  2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8,  0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8,  2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8,  3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8,  1, 2, 3, 0, 5, 6, 7, 4


/*****************************************************************************/
/*
 * jsimd_h2v2_fancy_upsample_dspr2
 *
 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
848 */ 849LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) 850/* 851 * a0 = cinfo->max_v_samp_factor 852 * a1 = downsampled_width 853 * a2 = input_data 854 * a3 = output_data_ptr 855 */ 856 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 857 858 li s4, 0 859 lw s2, 0(a3) // s2 = *output_data_ptr 8600: 861 li t9, 2 862 lw s1, -4(a2) // s1 = inptr1 863 8641: 865 lw s0, 0(a2) // s0 = inptr0 866 lwx s3, s4(s2) 867 addiu s5, a1, -2 // s5 = downsampled_width - 2 868 srl t4, s5, 1 869 sll t4, t4, 1 870 lbu t0, 0(s0) 871 lbu t1, 1(s0) 872 lbu t2, 0(s1) 873 lbu t3, 1(s1) 874 addiu s0, 2 875 addiu s1, 2 876 addu t8, s0, t4 // t8 = end address 877 andi s5, s5, 1 // s5 = residual 878 sll t4, t0, 1 879 sll t6, t1, 1 880 addu t0, t0, t4 // t0 = (*inptr0++) * 3 881 addu t1, t1, t6 // t1 = (*inptr0++) * 3 882 addu t7, t0, t2 // t7 = thiscolsum 883 addu t6, t1, t3 // t5 = nextcolsum 884 sll t0, t7, 2 // t0 = thiscolsum * 4 885 subu t1, t0, t7 // t1 = thiscolsum * 3 886 shra_r.w t0, t0, 4 887 addiu t1, 7 888 addu t1, t1, t6 889 srl t1, t1, 4 890 sb t0, 0(s3) 891 sb t1, 1(s3) 892 beq t8, s0, 22f // skip to final iteration if width == 3 893 addiu s3, 2 8942: 895 lh t0, 0(s0) // t0 = A3|A2 896 lh t2, 0(s1) // t2 = B3|B2 897 addiu s0, 2 898 addiu s1, 2 899 preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2 900 preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2 901 shll.ph t1, t0, 1 902 sll t3, t6, 1 903 addu.ph t0, t1, t0 // t0 = A3*3|A2*3 904 addu t3, t3, t6 // t3 = this * 3 905 addu.ph t0, t0, t2 // t0 = next2|next1 906 addu t1, t3, t7 907 andi t7, t0, 0xFFFF // t7 = next1 908 sll t2, t7, 1 909 addu t2, t7, t2 // t2 = next1*3 910 addu t4, t2, t6 911 srl t6, t0, 16 // t6 = next2 912 shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4 913 addu t0, t3, t7 914 addiu t0, 7 915 srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4 916 shra_r.w t4, t4, 4 // t3 = (next1*3 + this + 8) >> 4 917 addu t2, t2, t6 918 addiu t2, 7 919 srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4 920 sb t1, 0(s3) 921 sb t0, 1(s3) 922 sb t4, 2(s3) 
923 sb t2, 3(s3) 924 bne t8, s0, 2b 925 addiu s3, 4 92622: 927 beqz s5, 4f 928 addu t8, s0, s5 9293: 930 lbu t0, 0(s0) 931 lbu t2, 0(s1) 932 addiu s0, 1 933 addiu s1, 1 934 sll t3, t6, 1 935 sll t1, t0, 1 936 addu t1, t0, t1 // t1 = inptr0 * 3 937 addu t3, t3, t6 // t3 = thiscolsum * 3 938 addu t5, t1, t2 939 addu t1, t3, t7 940 shra_r.w t1, t1, 4 941 addu t0, t3, t5 942 addiu t0, 7 943 srl t0, t0, 4 944 sb t1, 0(s3) 945 sb t0, 1(s3) 946 addiu s3, 2 947 move t7, t6 948 bne t8, s0, 3b 949 move t6, t5 9504: 951 sll t0, t6, 2 // t0 = thiscolsum * 4 952 subu t1, t0, t6 // t1 = thiscolsum * 3 953 addu t1, t1, t7 954 addiu s4, 4 955 shra_r.w t1, t1, 4 956 addiu t0, 7 957 srl t0, t0, 4 958 sb t1, 0(s3) 959 sb t0, 1(s3) 960 addiu t9, -1 961 addiu s3, 2 962 bnez t9, 1b 963 lw s1, 4(a2) 964 srl t0, s4, 2 965 subu t0, a0, t0 966 bgtz t0, 0b 967 addiu a2, 4 968 969 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 970 971 j ra 972 nop 973END(jsimd_h2v2_fancy_upsample_dspr2) 974 975 976/*****************************************************************************/ 977LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) 978/* 979 * a0 = cinfo->max_v_samp_factor 980 * a1 = downsampled_width 981 * a2 = input_data 982 * a3 = output_data_ptr 983 */ 984 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 985 986 .set at 987 988 beqz a0, 3f 989 sll t0, a0, 2 990 lw s1, 0(a3) 991 li s3, 0x10001 992 addu s0, s1, t0 9930: 994 addiu t8, a1, -2 995 srl t9, t8, 2 996 lw t7, 0(a2) 997 lw s2, 0(s1) 998 lbu t0, 0(t7) 999 lbu t1, 1(t7) // t1 = inptr[1] 1000 sll t2, t0, 1 1001 addu t2, t2, t0 // t2 = invalue*3 1002 addu t2, t2, t1 1003 shra_r.w t2, t2, 2 1004 sb t0, 0(s2) 1005 sb t2, 1(s2) 1006 beqz t9, 11f 1007 addiu s2, 2 10081: 1009 ulw t0, 0(t7) // t0 = |P3|P2|P1|P0| 1010 ulw t1, 1(t7) 1011 ulh t2, 4(t7) // t2 = |0|0|P5|P4| 1012 preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2| 1013 preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0| 1014 preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4| 1015 preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3| 1016 
preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1| 1017 shll.ph t5, t4, 1 1018 shll.ph t6, t1, 1 1019 addu.ph t5, t5, t4 // t5 = |P4*3|P3*3| 1020 addu.ph t6, t6, t1 // t6 = |P2*3|P1*3| 1021 addu.ph t4, t3, s3 1022 addu.ph t0, t0, s3 1023 addu.ph t4, t4, t5 1024 addu.ph t0, t0, t6 1025 shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2| 1026 shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0| 1027 addu.ph t2, t2, t5 1028 addu.ph t3, t3, t6 1029 shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4| 1030 shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2| 1031 shll.ph t2, t2, 8 1032 shll.ph t3, t3, 8 1033 or t2, t4, t2 1034 or t3, t3, t0 1035 addiu t9, -1 1036 usw t3, 0(s2) 1037 usw t2, 4(s2) 1038 addiu s2, 8 1039 bgtz t9, 1b 1040 addiu t7, 4 104111: 1042 andi t8, 3 1043 beqz t8, 22f 1044 addiu t7, 1 1045 10462: 1047 lbu t0, 0(t7) 1048 addiu t7, 1 1049 sll t1, t0, 1 1050 addu t2, t0, t1 // t2 = invalue 1051 lbu t3, -2(t7) 1052 lbu t4, 0(t7) 1053 addiu t3, 1 1054 addiu t4, 2 1055 addu t3, t3, t2 1056 addu t4, t4, t2 1057 srl t3, 2 1058 srl t4, 2 1059 sb t3, 0(s2) 1060 sb t4, 1(s2) 1061 addiu t8, -1 1062 bgtz t8, 2b 1063 addiu s2, 2 1064 106522: 1066 lbu t0, 0(t7) 1067 lbu t2, -1(t7) 1068 sll t1, t0, 1 1069 addu t1, t1, t0 // t1 = invalue * 3 1070 addu t1, t1, t2 1071 addiu t1, 1 1072 srl t1, t1, 2 1073 sb t1, 0(s2) 1074 sb t0, 1(s2) 1075 addiu s1, 4 1076 bne s1, s0, 0b 1077 addiu a2, 4 10783: 1079 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1080 1081 j ra 1082 nop 1083END(jsimd_h2v1_fancy_upsample_dspr2) 1084 1085 1086/*****************************************************************************/ 1087LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) 1088/* 1089 * a0 = cinfo->image_width 1090 * a1 = cinfo->max_v_samp_factor 1091 * a2 = compptr->v_samp_factor 1092 * a3 = compptr->width_in_blocks 1093 * 16(sp) = input_data 1094 * 20(sp) = output_data 1095 */ 1096 .set at 1097 1098 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 1099 1100 beqz a2, 7f 1101 lw s1, 44(sp) // s1 = output_data 1102 lw s0, 40(sp) // s0 = input_data 1103 srl s2, a0, 2 1104 
andi t9, a0, 2 1105 srl t7, t9, 1 1106 addu s2, t7, s2 1107 sll t0, a3, 3 // t0 = width_in_blocks*DCT 1108 srl t7, t0, 1 1109 subu s2, t7, s2 11100: 1111 andi t6, a0, 1 // t6 = temp_index 1112 addiu t6, -1 1113 lw t4, 0(s1) // t4 = outptr 1114 lw t5, 0(s0) // t5 = inptr0 1115 li s3, 0 // s3 = bias 1116 srl t7, a0, 1 // t7 = image_width1 1117 srl s4, t7, 2 1118 andi t8, t7, 3 11191: 1120 ulhu t0, 0(t5) 1121 ulhu t1, 2(t5) 1122 ulhu t2, 4(t5) 1123 ulhu t3, 6(t5) 1124 raddu.w.qb t0, t0 1125 raddu.w.qb t1, t1 1126 raddu.w.qb t2, t2 1127 raddu.w.qb t3, t3 1128 shra.ph t0, t0, 1 1129 shra_r.ph t1, t1, 1 1130 shra.ph t2, t2, 1 1131 shra_r.ph t3, t3, 1 1132 sb t0, 0(t4) 1133 sb t1, 1(t4) 1134 sb t2, 2(t4) 1135 sb t3, 3(t4) 1136 addiu s4, -1 1137 addiu t4, 4 1138 bgtz s4, 1b 1139 addiu t5, 8 1140 beqz t8, 3f 1141 addu s4, t4, t8 11422: 1143 ulhu t0, 0(t5) 1144 raddu.w.qb t0, t0 1145 addqh.w t0, t0, s3 1146 xori s3, s3, 1 1147 sb t0, 0(t4) 1148 addiu t4, 1 1149 bne t4, s4, 2b 1150 addiu t5, 2 11513: 1152 lbux t1, t6(t5) 1153 sll t1, 1 1154 addqh.w t2, t1, s3 // t2 = pixval1 1155 xori s3, s3, 1 1156 addqh.w t3, t1, s3 // t3 = pixval2 1157 blez s2, 5f 1158 append t3, t2, 8 1159 addu t5, t4, s2 // t5 = loop_end2 11604: 1161 ush t3, 0(t4) 1162 addiu s2, -1 1163 bgtz s2, 4b 1164 addiu t4, 2 11655: 1166 beqz t9, 6f 1167 nop 1168 sb t2, 0(t4) 11696: 1170 addiu s1, 4 1171 addiu a2, -1 1172 bnez a2, 0b 1173 addiu s0, 4 11747: 1175 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 1176 1177 j ra 1178 nop 1179END(jsimd_h2v1_downsample_dspr2) 1180 1181 1182/*****************************************************************************/ 1183LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) 1184/* 1185 * a0 = cinfo->image_width 1186 * a1 = cinfo->max_v_samp_factor 1187 * a2 = compptr->v_samp_factor 1188 * a3 = compptr->width_in_blocks 1189 * 16(sp) = input_data 1190 * 20(sp) = output_data 1191 */ 1192 .set at 1193 1194 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1195 1196 beqz a2, 8f 1197 lw 
s1, 52(sp) // s1 = output_data 1198 lw s0, 48(sp) // s0 = input_data 1199 1200 andi t6, a0, 1 // t6 = temp_index 1201 addiu t6, -1 1202 srl t7, a0, 1 // t7 = image_width1 1203 srl s4, t7, 2 1204 andi t8, t7, 3 1205 andi t9, a0, 2 1206 srl s2, a0, 2 1207 srl t7, t9, 1 1208 addu s2, t7, s2 1209 sll t0, a3, 3 // s2 = width_in_blocks*DCT 1210 srl t7, t0, 1 1211 subu s2, t7, s2 12120: 1213 lw t4, 0(s1) // t4 = outptr 1214 lw t5, 0(s0) // t5 = inptr0 1215 lw s7, 4(s0) // s7 = inptr1 1216 li s6, 1 // s6 = bias 12172: 1218 ulw t0, 0(t5) // t0 = |P3|P2|P1|P0| 1219 ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0| 1220 ulw t2, 4(t5) 1221 ulw t3, 4(s7) 1222 precrq.ph.w t7, t0, t1 // t2 = |P3|P2|Q3|Q2| 1223 ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0| 1224 raddu.w.qb t1, t7 1225 raddu.w.qb t0, t0 1226 shra_r.w t1, t1, 2 1227 addiu t0, 1 1228 srl t0, 2 1229 precrq.ph.w t7, t2, t3 1230 ins t2, t3, 16, 16 1231 raddu.w.qb t7, t7 1232 raddu.w.qb t2, t2 1233 shra_r.w t7, t7, 2 1234 addiu t2, 1 1235 srl t2, 2 1236 sb t0, 0(t4) 1237 sb t1, 1(t4) 1238 sb t2, 2(t4) 1239 sb t7, 3(t4) 1240 addiu t4, 4 1241 addiu t5, 8 1242 addiu s4, s4, -1 1243 bgtz s4, 2b 1244 addiu s7, 8 1245 beqz t8, 4f 1246 addu t8, t4, t8 12473: 1248 ulhu t0, 0(t5) 1249 ulhu t1, 0(s7) 1250 ins t0, t1, 16, 16 1251 raddu.w.qb t0, t0 1252 addu t0, t0, s6 1253 srl t0, 2 1254 xori s6, s6, 3 1255 sb t0, 0(t4) 1256 addiu t5, 2 1257 addiu t4, 1 1258 bne t8, t4, 3b 1259 addiu s7, 2 12604: 1261 lbux t1, t6(t5) 1262 sll t1, 1 1263 lbux t0, t6(s7) 1264 sll t0, 1 1265 addu t1, t1, t0 1266 addu t3, t1, s6 1267 srl t0, t3, 2 // t2 = pixval1 1268 xori s6, s6, 3 1269 addu t2, t1, s6 1270 srl t1, t2, 2 // t3 = pixval2 1271 blez s2, 6f 1272 append t1, t0, 8 12735: 1274 ush t1, 0(t4) 1275 addiu s2, -1 1276 bgtz s2, 5b 1277 addiu t4, 2 12786: 1279 beqz t9, 7f 1280 nop 1281 sb t0, 0(t4) 12827: 1283 addiu s1, 4 1284 addiu a2, -1 1285 bnez a2, 0b 1286 addiu s0, 8 12878: 1288 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1289 1290 j ra 1291 nop 
END(jsimd_h2v2_downsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * 2x2 downsampling with inter-block smoothing (jpeg smoothing_factor).
 * Only the function head is in this chunk; the per-column loop continues
 * below it.
 *
 * a0     = input_data
 * a1     = output_data
 * a2     = compptr->v_samp_factor
 * a3     = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 */
        .set at

        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        lw      s7, 52(sp)             // compptr->width_in_blocks
        lw      s0, 56(sp)             // cinfo->image_width
        lw      s6, 48(sp)             // cinfo->smoothing_factor
        sll     s7, 3                  // output_cols = width_in_blocks * DCTSIZE
        sll     v0, s7, 1
        subu    v0, v0, s0             // v0 = 2*output_cols - image_width (expand count)
        blez    v0, 2f                 // rows already wide enough: skip expansion
        move    v1, zero               // (delay slot) v1 = row index
        addiu   t0, a3, 2              // t0 = cinfo->max_v_samp_factor + 2
0:      /* edge expansion: extend each input row (including the row above and
         * the two rows below the sample group) to 2*output_cols pixels by
         * replicating its last valid pixel v0 times */
        addiu   t1, a0, -4             // bias pointer so index v1 reads row v1-1
        sll     t2, v1, 2
        lwx     t1, t2(t1)             // t1 = input_data[v1 - 1]
        move    t3, v0
        addu    t1, t1, s0             // seek past the last valid pixel
        lbu     t2, -1(t1)             // t2 = last valid pixel of this row
1:
        addiu   t3, t3, -1
        sb      t2, 0(t1)
        bgtz    t3, 1b
        addiu   t1, t1, 1              // (delay slot)
        addiu   v1, v1, 1
        bne     v1, t0, 0b
        nop
2:      /* precompute the two fixed-point smoothing weights */
        li      v0, 80
        mul     v0, s6, v0
        li      v1, 16384
        move    t4, zero               // t4 = outrow
        move    t5, zero               // t5 = inrow (word index into input_data)
        subu    t6, v1, v0             // t6 = 16384 - tmp_smoot_f * 80 (centre weight)
        sll     t7, s6, 4              // t7 = tmp_smoot_f * 16 (neighbour weight)
3:
/* Special case for first column: pretend column -1 is same as column 0 */
        sll     v0, t4, 2
        lwx     t8, v0(a1)             // outptr = output_data[outrow]
        sll     v1, t5, 2
        addiu   t9, v1, 4
        addiu   s0, v1, -4
        addiu   s1, v1, 8
        lwx     s2, v1(a0)             // inptr0 = input_data[inrow]
        lwx     t9, t9(a0)             // inptr1 = input_data[inrow+1]
        lwx     s0, s0(a0)             // above_ptr = input_data[inrow-1]
        lwx     s1, s1(a0)             // below_ptr = input_data[inrow+2]
        lh      v0, 0(s2)              // first pixel pair of inptr0
        lh      v1, 0(t9)              // first pixel pair of inptr1
        lh      t0, 0(s0)              // first pixel pair of above_ptr
        lh      t1, 0(s1)              // first pixel pair of below_ptr
        ins     v0, v1, 16, 16
        ins     t0, t1, 16, 16
        raddu.w.qb t2, v0              // t2 = sum of the centre 2x2 block
        raddu.w.qb s3, t0              // s3 = sum of above/below neighbour pairs
        lbu     v0, 0(s2)              // column 0 stands in for column -1 here
        lbu     v1, 2(s2)
        lbu     t0, 0(t9)
        lbu     t1, 2(t9)
addu v0, v0, v1 1367 mult $ac1, t2, t6 1368 addu t0, t0, t1 1369 lbu t2, 2(s0) 1370 addu t0, t0, v0 1371 lbu t3, 2(s1) 1372 addu s3, t0, s3 1373 lbu v0, 0(s0) 1374 lbu t0, 0(s1) 1375 sll s3, s3, 1 1376 addu v0, v0, t2 1377 addu t0, t0, t3 1378 addu t0, t0, v0 1379 addu s3, t0, s3 1380 madd $ac1, s3, t7 1381 extr_r.w v0, $ac1, 16 1382 addiu t8, t8, 1 1383 addiu s2, s2, 2 1384 addiu t9, t9, 2 1385 addiu s0, s0, 2 1386 addiu s1, s1, 2 1387 sb v0, -1(t8) 1388 addiu s4, s7, -2 1389 and s4, s4, 3 1390 addu s5, s4, t8 // end address 13914: 1392 lh v0, 0(s2) 1393 lh v1, 0(t9) 1394 lh t0, 0(s0) 1395 lh t1, 0(s1) 1396 ins v0, v1, 16, 16 1397 ins t0, t1, 16, 16 1398 raddu.w.qb t2, v0 1399 raddu.w.qb s3, t0 1400 lbu v0, -1(s2) 1401 lbu v1, 2(s2) 1402 lbu t0, -1(t9) 1403 lbu t1, 2(t9) 1404 addu v0, v0, v1 1405 mult $ac1, t2, t6 1406 addu t0, t0, t1 1407 lbu t2, 2(s0) 1408 addu t0, t0, v0 1409 lbu t3, 2(s1) 1410 addu s3, t0, s3 1411 lbu v0, -1(s0) 1412 lbu t0, -1(s1) 1413 sll s3, s3, 1 1414 addu v0, v0, t2 1415 addu t0, t0, t3 1416 addu t0, t0, v0 1417 addu s3, t0, s3 1418 madd $ac1, s3, t7 1419 extr_r.w t2, $ac1, 16 1420 addiu t8, t8, 1 1421 addiu s2, s2, 2 1422 addiu t9, t9, 2 1423 addiu s0, s0, 2 1424 sb t2, -1(t8) 1425 bne s5, t8, 4b 1426 addiu s1, s1, 2 1427 addiu s5, s7, -2 1428 subu s5, s5, s4 1429 addu s5, s5, t8 // end address 14305: 1431 lh v0, 0(s2) 1432 lh v1, 0(t9) 1433 lh t0, 0(s0) 1434 lh t1, 0(s1) 1435 ins v0, v1, 16, 16 1436 ins t0, t1, 16, 16 1437 raddu.w.qb t2, v0 1438 raddu.w.qb s3, t0 1439 lbu v0, -1(s2) 1440 lbu v1, 2(s2) 1441 lbu t0, -1(t9) 1442 lbu t1, 2(t9) 1443 addu v0, v0, v1 1444 mult $ac1, t2, t6 1445 addu t0, t0, t1 1446 lbu t2, 2(s0) 1447 addu t0, t0, v0 1448 lbu t3, 2(s1) 1449 addu s3, t0, s3 1450 lbu v0, -1(s0) 1451 lbu t0, -1(s1) 1452 sll s3, s3, 1 1453 addu v0, v0, t2 1454 addu t0, t0, t3 1455 lh v1, 2(t9) 1456 addu t0, t0, v0 1457 lh v0, 2(s2) 1458 addu s3, t0, s3 1459 lh t0, 2(s0) 1460 lh t1, 2(s1) 1461 madd $ac1, s3, t7 1462 extr_r.w t2, 
$ac1, 16 1463 ins t0, t1, 16, 16 1464 ins v0, v1, 16, 16 1465 raddu.w.qb s3, t0 1466 lbu v1, 4(s2) 1467 lbu t0, 1(t9) 1468 lbu t1, 4(t9) 1469 sb t2, 0(t8) 1470 raddu.w.qb t3, v0 1471 lbu v0, 1(s2) 1472 addu t0, t0, t1 1473 mult $ac1, t3, t6 1474 addu v0, v0, v1 1475 lbu t2, 4(s0) 1476 addu t0, t0, v0 1477 lbu v0, 1(s0) 1478 addu s3, t0, s3 1479 lbu t0, 1(s1) 1480 lbu t3, 4(s1) 1481 addu v0, v0, t2 1482 sll s3, s3, 1 1483 addu t0, t0, t3 1484 lh v1, 4(t9) 1485 addu t0, t0, v0 1486 lh v0, 4(s2) 1487 addu s3, t0, s3 1488 lh t0, 4(s0) 1489 lh t1, 4(s1) 1490 madd $ac1, s3, t7 1491 extr_r.w t2, $ac1, 16 1492 ins t0, t1, 16, 16 1493 ins v0, v1, 16, 16 1494 raddu.w.qb s3, t0 1495 lbu v1, 6(s2) 1496 lbu t0, 3(t9) 1497 lbu t1, 6(t9) 1498 sb t2, 1(t8) 1499 raddu.w.qb t3, v0 1500 lbu v0, 3(s2) 1501 addu t0, t0, t1 1502 mult $ac1, t3, t6 1503 addu v0, v0, v1 1504 lbu t2, 6(s0) 1505 addu t0, t0, v0 1506 lbu v0, 3(s0) 1507 addu s3, t0, s3 1508 lbu t0, 3(s1) 1509 lbu t3, 6(s1) 1510 addu v0, v0, t2 1511 sll s3, s3, 1 1512 addu t0, t0, t3 1513 lh v1, 6(t9) 1514 addu t0, t0, v0 1515 lh v0, 6(s2) 1516 addu s3, t0, s3 1517 lh t0, 6(s0) 1518 lh t1, 6(s1) 1519 madd $ac1, s3, t7 1520 extr_r.w t3, $ac1, 16 1521 ins t0, t1, 16, 16 1522 ins v0, v1, 16, 16 1523 raddu.w.qb s3, t0 1524 lbu v1, 8(s2) 1525 lbu t0, 5(t9) 1526 lbu t1, 8(t9) 1527 sb t3, 2(t8) 1528 raddu.w.qb t2, v0 1529 lbu v0, 5(s2) 1530 addu t0, t0, t1 1531 mult $ac1, t2, t6 1532 addu v0, v0, v1 1533 lbu t2, 8(s0) 1534 addu t0, t0, v0 1535 lbu v0, 5(s0) 1536 addu s3, t0, s3 1537 lbu t0, 5(s1) 1538 lbu t3, 8(s1) 1539 addu v0, v0, t2 1540 sll s3, s3, 1 1541 addu t0, t0, t3 1542 addiu t8, t8, 4 1543 addu t0, t0, v0 1544 addiu s2, s2, 8 1545 addu s3, t0, s3 1546 addiu t9, t9, 8 1547 madd $ac1, s3, t7 1548 extr_r.w t1, $ac1, 16 1549 addiu s0, s0, 8 1550 addiu s1, s1, 8 1551 bne s5, t8, 5b 1552 sb t1, -1(t8) 1553/* Special case for last column */ 1554 lh v0, 0(s2) 1555 lh v1, 0(t9) 1556 lh t0, 0(s0) 1557 lh t1, 0(s1) 1558 ins v0, v1, 
16, 16 1559 ins t0, t1, 16, 16 1560 raddu.w.qb t2, v0 1561 raddu.w.qb s3, t0 1562 lbu v0, -1(s2) 1563 lbu v1, 1(s2) 1564 lbu t0, -1(t9) 1565 lbu t1, 1(t9) 1566 addu v0, v0, v1 1567 mult $ac1, t2, t6 1568 addu t0, t0, t1 1569 lbu t2, 1(s0) 1570 addu t0, t0, v0 1571 lbu t3, 1(s1) 1572 addu s3, t0, s3 1573 lbu v0, -1(s0) 1574 lbu t0, -1(s1) 1575 sll s3, s3, 1 1576 addu v0, v0, t2 1577 addu t0, t0, t3 1578 addu t0, t0, v0 1579 addu s3, t0, s3 1580 madd $ac1, s3, t7 1581 extr_r.w t0, $ac1, 16 1582 addiu t5, t5, 2 1583 sb t0, 0(t8) 1584 addiu t4, t4, 1 1585 bne t4, a2, 3b 1586 addiu t5, t5, 2 1587 1588 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1589 1590 j ra 1591 nop 1592 1593END(jsimd_h2v2_smooth_downsample_dspr2) 1594 1595 1596/*****************************************************************************/ 1597LEAF_DSPR2(jsimd_int_upsample_dspr2) 1598/* 1599 * a0 = upsample->h_expand[compptr->component_index] 1600 * a1 = upsample->v_expand[compptr->component_index] 1601 * a2 = input_data 1602 * a3 = output_data_ptr 1603 * 16(sp) = cinfo->output_width 1604 * 20(sp) = cinfo->max_v_samp_factor 1605 */ 1606 .set at 1607 1608 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1609 1610 lw s0, 0(a3) // s0 = output_data 1611 lw s1, 32(sp) // s1 = cinfo->output_width 1612 lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor 1613 li t6, 0 // t6 = inrow 1614 beqz s2, 10f 1615 li s3, 0 // s3 = outrow 16160: 1617 addu t0, a2, t6 1618 addu t7, s0, s3 1619 lw t3, 0(t0) // t3 = inptr 1620 lw t8, 0(t7) // t8 = outptr 1621 beqz s1, 4f 1622 addu t5, t8, s1 // t5 = outend 16231: 1624 lb t2, 0(t3) // t2 = invalue = *inptr++ 1625 addiu t3, 1 1626 beqz a0, 3f 1627 move t0, a0 // t0 = h_expand 16282: 1629 sb t2, 0(t8) 1630 addiu t0, -1 1631 bgtz t0, 2b 1632 addiu t8, 1 16333: 1634 bgt t5, t8, 1b 1635 nop 16364: 1637 addiu t9, a1, -1 // t9 = v_expand - 1 1638 blez t9, 9f 1639 nop 16405: 1641 lw t3, 0(s0) 1642 lw t4, 4(s0) 1643 subu t0, s1, 0xF 1644 blez t0, 7f 1645 addu t5, t3, s1 // t5 = end 
        andi    t7, s1, 0xF            // t7 = residual bytes after 16-byte copies
        subu    t8, t5, t7             // t8 = end of the 16-byte-copy region
6:      /* duplicate the row in 16-byte chunks (unaligned-safe) */
        ulw     t0, 0(t3)
        ulw     t1, 4(t3)
        ulw     t2, 8(t3)
        usw     t0, 0(t4)
        ulw     t0, 12(t3)
        usw     t1, 4(t4)
        usw     t2, 8(t4)
        usw     t0, 12(t4)
        addiu   t3, 16
        bne     t3, t8, 6b
        addiu   t4, 16                 // (delay slot)
        beqz    t7, 8f
        nop
7:      /* copy the remaining residual bytes one at a time */
        lbu     t0, 0(t3)
        sb      t0, 0(t4)
        addiu   t3, 1
        bne     t3, t5, 7b
        addiu   t4, 1                  // (delay slot)
8:
        addiu   t9, -1                 // one fewer duplicate row to emit
        bgtz    t9, 5b
        addiu   s0, 8                  // (delay slot) advance source/dest row pair
9:
        addu    s3, s3, a1             // outrow += v_expand
        bne     s3, s2, 0b             // until outrow == max_v_samp_factor
        addiu   t6, 1                  // (delay slot) ++inrow
10:
        RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

        j       ra
        nop
END(jsimd_int_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
/*
 * Fast 2:1 horizontal upsampling: every input pixel is written twice.
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
        lw      t7, 0(a3)              // t7 = output_data
        andi    t8, a1, 0xf            // t8 = residual
        sll     t0, a0, 2
        blez    a0, 4f
        addu    t9, t7, t0             // (delay slot) t9 = output_data end address
0:
        lw      t5, 0(t7)              // t5 = outptr
        lw      t6, 0(a2)              // t6 = inptr
        addu    t3, t5, a1             // t3 = outptr + output_width (end address)
        subu    t3, t8                 // t3 = end address - residual
        beq     t5, t3, 2f             // no full 16-byte groups in this row?
        move    t4, t8                 // (delay slot) t4 = residual byte counter
1:      /* expand 8 input bytes -> 16 output bytes per iteration */
        ulw     t0, 0(t6)              // t0 = |P3|P2|P1|P0|
        ulw     t2, 4(t6)              // t2 = |P7|P6|P5|P4|
        srl     t1, t0, 16             // t1 = |X|X|P3|P2|
        ins     t0, t0, 16, 16         // t0 = |P1|P0|P1|P0|
        ins     t1, t1, 16, 16         // t1 = |P3|P2|P3|P2|
        ins     t0, t0, 8, 16          // t0 = |P1|P1|P0|P0|
        ins     t1, t1, 8, 16          // t1 = |P3|P3|P2|P2|
        usw     t0, 0(t5)
        usw     t1, 4(t5)
        srl     t0, t2, 16             // t0 = |X|X|P7|P6|
        ins     t2, t2, 16, 16         // t2 = |P5|P4|P5|P4|
        ins     t0, t0, 16, 16         // t0 = |P7|P6|P7|P6|
        ins     t2, t2, 8, 16          // t2 = |P5|P5|P4|P4|
        ins     t0, t0, 8, 16          // t0 = |P7|P7|P6|P6|
        usw     t2, 8(t5)
        usw     t0, 12(t5)
        addiu   t5, 16
        bne     t5, t3, 1b
        addiu   t6, 8                  // (delay slot)
        beqz    t8, 3f
        move    t4, t8                 // (delay slot) reload residual counter
2:      /* residual: double the remaining pixels one at a time */
        lbu     t1, 0(t6)
        sb      t1, 0(t5)
        sb      t1, 1(t5)
        addiu   t4, -2                 // two output bytes per input byte
        addiu   t6, 1
        bgtz    t4, 2b
        addiu   t5, 2                  // (delay slot)
3:      /* next row */
        addiu   t7, 4
        bne     t9, t7, 0b
        addiu   a2, 4                  // (delay slot)
4:
        j       ra
        nop
END(jsimd_h2v1_upsample_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
/*
 * Fast 2x2 upsampling: double each pixel horizontally, then duplicate the
 * finished row into the next output row (rows are processed in pairs).
 *
 * a0 = cinfo->max_v_samp_factor
 * a1 = cinfo->output_width
 * a2 = input_data
 * a3 = output_data_ptr
 */
        lw      t7, 0(a3)              // t7 = output_data
        blez    a0, 7f
        andi    t9, a1, 0xf            // (delay slot) t9 = residual
0:
        lw      t6, 0(a2)              // t6 = inptr
        lw      t5, 0(t7)              // t5 = outptr
        addu    t8, t5, a1             // t8 = outptr end address
        subu    t8, t9                 // t8 = end address - residual
        beq     t5, t8, 2f             // no full 16-byte groups in this row?
        move    t4, t9                 // (delay slot) t4 = residual byte counter
1:      /* horizontal doubling: 8 input bytes -> 16 output bytes */
        ulw     t0, 0(t6)
        srl     t1, t0, 16
        ins     t0, t0, 16, 16
        ins     t0, t0, 8, 16
        ins     t1, t1, 16, 16
        ins     t1, t1, 8, 16
        ulw     t2, 4(t6)
        usw     t0, 0(t5)
        usw     t1, 4(t5)
        srl     t3, t2, 16
        ins     t2, t2, 16, 16
        ins     t2, t2, 8, 16
        ins     t3, t3, 16, 16
        ins     t3, t3, 8, 16
        usw     t2, 8(t5)
        usw     t3, 12(t5)
        addiu   t5, 16
        bne     t5, t8, 1b
        addiu   t6, 8                  // (delay slot)
        beqz    t9, 3f
        move    t4, t9                 // (delay slot) reload residual counter
2:      /* residual horizontal doubling */
        lbu     t0, 0(t6)
        sb      t0, 0(t5)
        sb      t0, 1(t5)
        addiu   t4, -2                 // two output bytes per input byte
        addiu   t6, 1
        bgtz    t4, 2b
        addiu   t5, 2                  // (delay slot)
3:      /* duplicate the finished row into the second row of the pair */
        lw      t6, 0(t7)              // t6 = outptr[0] (source row)
        lw      t5, 4(t7)              // t5 = outptr[1] (destination row)
        addu    t4, t6, a1             // t4 = new end address
        beq     a1, t9, 5f             // whole width fits in the residual path?
        subu    t8, t4, t9             // (delay slot) end of 16-byte-copy region
4:      /* copy the row in 16-byte chunks (unaligned-safe) */
        ulw     t0, 0(t6)
        ulw     t1, 4(t6)
        ulw     t2, 8(t6)
        usw     t0, 0(t5)
        ulw     t0, 12(t6)
        usw     t1, 4(t5)
        usw     t2, 8(t5)
        usw     t0, 12(t5)
        addiu   t6, 16
        bne     t6, t8, 4b
        addiu   t5, 16                 // (delay slot)
        beqz    t9, 6f
        nop
5:      /* residual byte copy */
        lbu     t0, 0(t6)
        sb      t0, 0(t5)
        addiu   t6, 1
        bne     t6, t4, 5b
        addiu   t5, 1                  // (delay slot)
6:      /* next row pair: one input row feeds two output rows */
        addiu   t7, 8
        addiu   a0, -2
        bgtz    a0, 0b
        addiu   a2, 4                  // (delay slot)
7:
        j       ra
        nop
1826END(jsimd_h2v2_upsample_dspr2) 1827 1828 1829/*****************************************************************************/ 1830LEAF_DSPR2(jsimd_idct_islow_dspr2) 1831/* 1832 * a0 = coef_block 1833 * a1 = compptr->dcttable 1834 * a2 = output 1835 * a3 = range_limit 1836 */ 1837 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1838 1839 addiu sp, sp, -256 1840 move v0, sp 1841 addiu v1, zero, 8 // v1 = DCTSIZE = 8 18421: 1843 lh s4, 32(a0) // s4 = inptr[16] 1844 lh s5, 64(a0) // s5 = inptr[32] 1845 lh s6, 96(a0) // s6 = inptr[48] 1846 lh t1, 112(a0) // t1 = inptr[56] 1847 lh t7, 16(a0) // t7 = inptr[8] 1848 lh t5, 80(a0) // t5 = inptr[40] 1849 lh t3, 48(a0) // t3 = inptr[24] 1850 or s4, s4, t1 1851 or s4, s4, t3 1852 or s4, s4, t5 1853 or s4, s4, t7 1854 or s4, s4, s5 1855 or s4, s4, s6 1856 bnez s4, 2f 1857 addiu v1, v1, -1 1858 lh s5, 0(a1) // quantptr[DCTSIZE*0] 1859 lh s6, 0(a0) // inptr[DCTSIZE*0] 1860 mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0]) 1861 sll s5, s5, 2 1862 sw s5, 0(v0) 1863 sw s5, 32(v0) 1864 sw s5, 64(v0) 1865 sw s5, 96(v0) 1866 sw s5, 128(v0) 1867 sw s5, 160(v0) 1868 sw s5, 192(v0) 1869 b 3f 1870 sw s5, 224(v0) 18712: 1872 lh t0, 112(a1) 1873 lh t2, 48(a1) 1874 lh t4, 80(a1) 1875 lh t6, 16(a1) 1876 mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) 1877 mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) 1878 mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) 1879 mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) 1880 lh t4, 32(a1) 1881 lh t5, 32(a0) 1882 lh t6, 96(a1) 1883 lh t7, 96(a0) 1884 addu s0, t0, t1 // z3 = tmp0 + tmp2 1885 addu s1, t1, t2 // z2 = tmp1 + tmp2 1886 addu s2, t2, t3 // z4 = tmp1 + tmp3 1887 addu s3, s0, s2 // z3 + z4 1888 addiu t9, zero, 9633 // FIX_1_175875602 1889 mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 1890 addu t8, t0, t3 // z1 = tmp0 + tmp3 1891 addiu t9, zero, 2446 // FIX_0_298631336 1892 mul t0, t0, t9 // tmp0 = 
MULTIPLY(tmp0, FIX_0_298631336) 1893 addiu t9, zero, 16819 // FIX_2_053119869 1894 mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 1895 addiu t9, zero, 25172 // FIX_3_072711026 1896 mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 1897 addiu t9, zero, 12299 // FIX_1_501321110 1898 mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) 1899 addiu t9, zero, 16069 // FIX_1_961570560 1900 mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560) 1901 addiu t9, zero, 3196 // FIX_0_390180644 1902 mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644) 1903 addiu t9, zero, 7373 // FIX_0_899976223 1904 mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223) 1905 addiu t9, zero, 20995 // FIX_2_562915447 1906 mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447) 1907 subu s0, s3, s0 // z3 += z5 1908 addu t0, t0, s0 // tmp0 += z3 1909 addu t1, t1, s0 // tmp2 += z3 1910 subu s2, s3, s2 // z4 += z5 1911 addu t2, t2, s2 // tmp1 += z4 1912 addu t3, t3, s2 // tmp3 += z4 1913 subu t0, t0, t8 // tmp0 += z1 1914 subu t1, t1, s1 // tmp2 += z2 1915 subu t2, t2, s1 // tmp1 += z2 1916 subu t3, t3, t8 // tmp3 += z1 1917 mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) 1918 addiu t9, zero, 6270 // FIX_0_765366865 1919 mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) 1920 lh t4, 0(a1) 1921 lh t5, 0(a0) 1922 lh t6, 64(a1) 1923 lh t7, 64(a0) 1924 mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865) 1925 mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) 1926 mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) 1927 addiu t9, zero, 4433 // FIX_0_541196100 1928 addu s3, s0, s1 // z2 + z3 1929 mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) 1930 addiu t9, zero, 15137 // FIX_1_847759065 1931 mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065) 1932 addu t4, t5, t6 1933 subu t5, t5, t6 1934 sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS 1935 sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS 1936 addu t7, s3, s2 // 
tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 1937 subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) 1938 addu s0, t4, t7 1939 subu s1, t4, t7 1940 addu s2, t5, t6 1941 subu s3, t5, t6 1942 addu t4, s0, t3 1943 subu s0, s0, t3 1944 addu t3, s2, t1 1945 subu s2, s2, t1 1946 addu t1, s3, t2 1947 subu s3, s3, t2 1948 addu t2, s1, t0 1949 subu s1, s1, t0 1950 shra_r.w t4, t4, 11 1951 shra_r.w t3, t3, 11 1952 shra_r.w t1, t1, 11 1953 shra_r.w t2, t2, 11 1954 shra_r.w s1, s1, 11 1955 shra_r.w s3, s3, 11 1956 shra_r.w s2, s2, 11 1957 shra_r.w s0, s0, 11 1958 sw t4, 0(v0) 1959 sw t3, 32(v0) 1960 sw t1, 64(v0) 1961 sw t2, 96(v0) 1962 sw s1, 128(v0) 1963 sw s3, 160(v0) 1964 sw s2, 192(v0) 1965 sw s0, 224(v0) 19663: 1967 addiu a1, a1, 2 1968 addiu a0, a0, 2 1969 bgtz v1, 1b 1970 addiu v0, v0, 4 1971 move v0, sp 1972 addiu v1, zero, 8 19734: 1974 lw t0, 8(v0) // z2 = (JLONG)wsptr[2] 1975 lw t1, 24(v0) // z3 = (JLONG)wsptr[6] 1976 lw t2, 0(v0) // (JLONG)wsptr[0] 1977 lw t3, 16(v0) // (JLONG)wsptr[4] 1978 lw s4, 4(v0) // (JLONG)wsptr[1] 1979 lw s5, 12(v0) // (JLONG)wsptr[3] 1980 lw s6, 20(v0) // (JLONG)wsptr[5] 1981 lw s7, 28(v0) // (JLONG)wsptr[7] 1982 or s4, s4, t0 1983 or s4, s4, t1 1984 or s4, s4, t3 1985 or s4, s4, s7 1986 or s4, s4, s5 1987 or s4, s4, s6 1988 bnez s4, 5f 1989 addiu v1, v1, -1 1990 shra_r.w s5, t2, 5 1991 andi s5, s5, 0x3ff 1992 lbux s5, s5(a3) 1993 lw s1, 0(a2) 1994 replv.qb s5, s5 1995 usw s5, 0(s1) 1996 usw s5, 4(s1) 1997 b 6f 1998 nop 19995: 2000 addu t4, t0, t1 // z2 + z3 2001 addiu t8, zero, 4433 // FIX_0_541196100 2002 mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100) 2003 addiu t8, zero, 15137 // FIX_1_847759065 2004 mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065) 2005 addiu t8, zero, 6270 // FIX_0_765366865 2006 mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865) 2007 addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4] 2008 subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4] 2009 sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << 
CONST_BITS 2010 sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS 2011 subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065) 2012 subu t3, t2, t1 // tmp12 = tmp1 - tmp2 2013 addu t2, t2, t1 // tmp11 = tmp1 + tmp2 2014 addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) 2015 subu t1, t4, t5 // tmp13 = tmp0 - tmp3 2016 addu t0, t4, t5 // tmp10 = tmp0 + tmp3 2017 lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7] 2018 lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3] 2019 lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5] 2020 lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1] 2021 addu s0, t4, t6 // z3 = tmp0 + tmp2 2022 addiu t8, zero, 9633 // FIX_1_175875602 2023 addu s1, t5, t7 // z4 = tmp1 + tmp3 2024 addu s2, s0, s1 // z3 + z4 2025 mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602) 2026 addu s3, t4, t7 // z1 = tmp0 + tmp3 2027 addu t9, t5, t6 // z2 = tmp1 + tmp2 2028 addiu t8, zero, 16069 // FIX_1_961570560 2029 mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560) 2030 addiu t8, zero, 3196 // FIX_0_390180644 2031 mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644) 2032 addiu t8, zero, 2446 // FIX_0_298631336 2033 mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336) 2034 addiu t8, zero, 7373 // FIX_0_899976223 2035 mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223) 2036 addiu t8, zero, 16819 // FIX_2_053119869 2037 mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869) 2038 addiu t8, zero, 20995 // FIX_2_562915447 2039 mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447) 2040 addiu t8, zero, 25172 // FIX_3_072711026 2041 mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026) 2042 addiu t8, zero, 12299 // FIX_1_501321110 2043 mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110) 2044 subu s0, s2, s0 // z3 += z5 2045 subu s1, s2, s1 // z4 += z5 2046 addu t4, t4, s0 2047 subu t4, t4, s3 // tmp0 2048 addu t5, t5, s1 2049 subu t5, t5, t9 // tmp1 2050 addu t6, t6, s0 2051 subu t6, t6, t9 // tmp2 2052 addu t7, t7, s1 2053 subu t7, t7, s3 // tmp3 2054 addu s0, 
t0, t7 2055 subu t0, t0, t7 2056 addu t7, t2, t6 2057 subu t2, t2, t6 2058 addu t6, t3, t5 2059 subu t3, t3, t5 2060 addu t5, t1, t4 2061 subu t1, t1, t4 2062 shra_r.w s0, s0, 18 2063 shra_r.w t7, t7, 18 2064 shra_r.w t6, t6, 18 2065 shra_r.w t5, t5, 18 2066 shra_r.w t1, t1, 18 2067 shra_r.w t3, t3, 18 2068 shra_r.w t2, t2, 18 2069 shra_r.w t0, t0, 18 2070 andi s0, s0, 0x3ff 2071 andi t7, t7, 0x3ff 2072 andi t6, t6, 0x3ff 2073 andi t5, t5, 0x3ff 2074 andi t1, t1, 0x3ff 2075 andi t3, t3, 0x3ff 2076 andi t2, t2, 0x3ff 2077 andi t0, t0, 0x3ff 2078 lw s1, 0(a2) 2079 lbux s0, s0(a3) 2080 lbux t7, t7(a3) 2081 lbux t6, t6(a3) 2082 lbux t5, t5(a3) 2083 lbux t1, t1(a3) 2084 lbux t3, t3(a3) 2085 lbux t2, t2(a3) 2086 lbux t0, t0(a3) 2087 sb s0, 0(s1) 2088 sb t7, 1(s1) 2089 sb t6, 2(s1) 2090 sb t5, 3(s1) 2091 sb t1, 4(s1) 2092 sb t3, 5(s1) 2093 sb t2, 6(s1) 2094 sb t0, 7(s1) 20956: 2096 addiu v0, v0, 32 2097 bgtz v1, 4b 2098 addiu a2, a2, 4 2099 addiu sp, sp, 256 2100 2101 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2102 2103 j ra 2104 nop 2105 2106END(jsimd_idct_islow_dspr2) 2107 2108 2109/*****************************************************************************/ 2110LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) 2111/* 2112 * a0 = inptr 2113 * a1 = quantptr 2114 * a2 = wsptr 2115 * a3 = mips_idct_ifast_coefs 2116 */ 2117 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2118 2119 addiu t9, a0, 16 // end address 2120 or AT, a3, zero 2121 21220: 2123 lw s0, 0(a1) // quantptr[DCTSIZE*0] 2124 lw t0, 0(a0) // inptr[DCTSIZE*0] 2125 lw t1, 16(a0) // inptr[DCTSIZE*1] 2126 muleq_s.w.phl v0, t0, s0 // tmp0 ... 2127 lw t2, 32(a0) // inptr[DCTSIZE*2] 2128 lw t3, 48(a0) // inptr[DCTSIZE*3] 2129 lw t4, 64(a0) // inptr[DCTSIZE*4] 2130 lw t5, 80(a0) // inptr[DCTSIZE*5] 2131 muleq_s.w.phr t0, t0, s0 // ... tmp0 ... 
2132 lw t6, 96(a0) // inptr[DCTSIZE*6] 2133 lw t7, 112(a0) // inptr[DCTSIZE*7] 2134 or s4, t1, t2 2135 or s5, t3, t4 2136 bnez s4, 1f 2137 ins t0, v0, 16, 16 // ... tmp0 2138 bnez s5, 1f 2139 or s6, t5, t6 2140 or s6, s6, t7 2141 bnez s6, 1f 2142 sw t0, 0(a2) // wsptr[DCTSIZE*0] 2143 sw t0, 16(a2) // wsptr[DCTSIZE*1] 2144 sw t0, 32(a2) // wsptr[DCTSIZE*2] 2145 sw t0, 48(a2) // wsptr[DCTSIZE*3] 2146 sw t0, 64(a2) // wsptr[DCTSIZE*4] 2147 sw t0, 80(a2) // wsptr[DCTSIZE*5] 2148 sw t0, 96(a2) // wsptr[DCTSIZE*6] 2149 sw t0, 112(a2) // wsptr[DCTSIZE*7] 2150 addiu a0, a0, 4 2151 b 2f 2152 addiu a1, a1, 4 2153 21541: 2155 lw s1, 32(a1) // quantptr[DCTSIZE*2] 2156 lw s2, 64(a1) // quantptr[DCTSIZE*4] 2157 muleq_s.w.phl v0, t2, s1 // tmp1 ... 2158 muleq_s.w.phr t2, t2, s1 // ... tmp1 ... 2159 lw s0, 16(a1) // quantptr[DCTSIZE*1] 2160 lw s1, 48(a1) // quantptr[DCTSIZE*3] 2161 lw s3, 96(a1) // quantptr[DCTSIZE*6] 2162 muleq_s.w.phl v1, t4, s2 // tmp2 ... 2163 muleq_s.w.phr t4, t4, s2 // ... tmp2 ... 2164 lw s2, 80(a1) // quantptr[DCTSIZE*5] 2165 lw t8, 4(AT) // FIX(1.414213562) 2166 ins t2, v0, 16, 16 // ... tmp1 2167 muleq_s.w.phl v0, t6, s3 // tmp3 ... 2168 muleq_s.w.phr t6, t6, s3 // ... tmp3 ... 2169 ins t4, v1, 16, 16 // ... tmp2 2170 addq.ph s4, t0, t4 // tmp10 2171 subq.ph s5, t0, t4 // tmp11 2172 ins t6, v0, 16, 16 // ... tmp3 2173 subq.ph s6, t2, t6 // tmp12 ... 2174 addq.ph s7, t2, t6 // tmp13 2175 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2176 addq.ph t0, s4, s7 // tmp0 2177 subq.ph t6, s4, s7 // tmp3 2178 muleq_s.w.phl v0, t1, s0 // tmp4 ... 2179 muleq_s.w.phr t1, t1, s0 // ... tmp4 ... 2180 shll_s.ph s6, s6, 1 // x2 2181 lw s3, 112(a1) // quantptr[DCTSIZE*7] 2182 subq.ph s6, s6, s7 // ... tmp12 2183 muleq_s.w.phl v1, t7, s3 // tmp7 ... 2184 muleq_s.w.phr t7, t7, s3 // ... tmp7 ... 2185 ins t1, v0, 16, 16 // ... tmp4 2186 addq.ph t2, s5, s6 // tmp1 2187 subq.ph t4, s5, s6 // tmp2 2188 muleq_s.w.phl v0, t5, s2 // tmp6 ... 2189 muleq_s.w.phr t5, t5, s2 // ... tmp6 ... 
2190 ins t7, v1, 16, 16 // ... tmp7 2191 addq.ph s5, t1, t7 // z11 2192 subq.ph s6, t1, t7 // z12 2193 muleq_s.w.phl v1, t3, s1 // tmp5 ... 2194 muleq_s.w.phr t3, t3, s1 // ... tmp5 ... 2195 ins t5, v0, 16, 16 // ... tmp6 2196 ins t3, v1, 16, 16 // ... tmp5 2197 addq.ph s7, t5, t3 // z13 2198 subq.ph v0, t5, t3 // z10 2199 addq.ph t7, s5, s7 // tmp7 2200 subq.ph s5, s5, s7 // tmp11 ... 2201 addq.ph v1, v0, s6 // z5 ... 2202 mulq_s.ph s5, s5, t8 // ... tmp11 2203 lw t8, 8(AT) // FIX(1.847759065) 2204 lw s4, 0(AT) // FIX(1.082392200) 2205 addq.ph s0, t0, t7 2206 subq.ph s1, t0, t7 2207 mulq_s.ph v1, v1, t8 // ... z5 2208 shll_s.ph s5, s5, 1 // x2 2209 lw t8, 12(AT) // FIX(-2.613125930) 2210 sw s0, 0(a2) // wsptr[DCTSIZE*0] 2211 shll_s.ph v0, v0, 1 // x4 2212 mulq_s.ph v0, v0, t8 // tmp12 ... 2213 mulq_s.ph s4, s6, s4 // tmp10 ... 2214 shll_s.ph v1, v1, 1 // x2 2215 addiu a0, a0, 4 2216 addiu a1, a1, 4 2217 sw s1, 112(a2) // wsptr[DCTSIZE*7] 2218 shll_s.ph s6, v0, 1 // x4 2219 shll_s.ph s4, s4, 1 // x2 2220 addq.ph s6, s6, v1 // ... tmp12 2221 subq.ph t5, s6, t7 // tmp6 2222 subq.ph s4, s4, v1 // ... 
tmp10 2223 subq.ph t3, s5, t5 // tmp5 2224 addq.ph s2, t2, t5 2225 addq.ph t1, s4, t3 // tmp4 2226 subq.ph s3, t2, t5 2227 sw s2, 16(a2) // wsptr[DCTSIZE*1] 2228 sw s3, 96(a2) // wsptr[DCTSIZE*6] 2229 addq.ph v0, t4, t3 2230 subq.ph v1, t4, t3 2231 sw v0, 32(a2) // wsptr[DCTSIZE*2] 2232 sw v1, 80(a2) // wsptr[DCTSIZE*5] 2233 addq.ph v0, t6, t1 2234 subq.ph v1, t6, t1 2235 sw v0, 64(a2) // wsptr[DCTSIZE*4] 2236 sw v1, 48(a2) // wsptr[DCTSIZE*3] 2237 22382: 2239 bne a0, t9, 0b 2240 addiu a2, a2, 4 2241 2242 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2243 2244 j ra 2245 nop 2246 2247END(jsimd_idct_ifast_cols_dspr2) 2248 2249 2250/*****************************************************************************/ 2251LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) 2252/* 2253 * a0 = wsptr 2254 * a1 = output_buf 2255 * a2 = output_col 2256 * a3 = mips_idct_ifast_coefs 2257 */ 2258 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2259 2260 addiu t9, a0, 128 // end address 2261 lui s8, 0x8080 2262 ori s8, s8, 0x8080 2263 22640: 2265 lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs) 2266 lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a 2267 lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A 2268 lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c 2269 lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C 2270 lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e 2271 lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E 2272 lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g 2273 lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G 2274 precrq.ph.w t1, s0, t0 // B b 2275 ins t0, s0, 16, 16 // A a 2276 bnez t1, 1f 2277 or s0, t2, s2 2278 bnez s0, 1f 2279 or s0, t4, s4 2280 bnez s0, 1f 2281 or s0, t6, s6 2282 bnez s0, 1f 2283 shll_s.ph s0, t0, 2 // A a 2284 lw a3, 0(a1) 2285 lw AT, 4(a1) 2286 precrq.ph.w t0, s0, s0 // A A 2287 ins s0, s0, 16, 16 // a a 2288 addu a3, a3, a2 2289 addu AT, AT, a2 2290 precrq.qb.ph t0, t0, t0 // A A A A 2291 precrq.qb.ph s0, s0, s0 // a a a a 2292 addu.qb s0, s0, s8 2293 addu.qb t0, t0, s8 2294 sw s0, 0(a3) 
2295 sw s0, 4(a3) 2296 sw t0, 0(AT) 2297 sw t0, 4(AT) 2298 addiu a0, a0, 32 2299 bne a0, t9, 0b 2300 addiu a1, a1, 8 2301 b 2f 2302 nop 2303 23041: 2305 precrq.ph.w t3, s2, t2 2306 ins t2, s2, 16, 16 2307 precrq.ph.w t5, s4, t4 2308 ins t4, s4, 16, 16 2309 precrq.ph.w t7, s6, t6 2310 ins t6, s6, 16, 16 2311 lw t8, 4(AT) // FIX(1.414213562) 2312 addq.ph s4, t0, t4 // tmp10 2313 subq.ph s5, t0, t4 // tmp11 2314 subq.ph s6, t2, t6 // tmp12 ... 2315 addq.ph s7, t2, t6 // tmp13 2316 mulq_s.ph s6, s6, t8 // ... tmp12 ... 2317 addq.ph t0, s4, s7 // tmp0 2318 subq.ph t6, s4, s7 // tmp3 2319 shll_s.ph s6, s6, 1 // x2 2320 subq.ph s6, s6, s7 // ... tmp12 2321 addq.ph t2, s5, s6 // tmp1 2322 subq.ph t4, s5, s6 // tmp2 2323 addq.ph s5, t1, t7 // z11 2324 subq.ph s6, t1, t7 // z12 2325 addq.ph s7, t5, t3 // z13 2326 subq.ph v0, t5, t3 // z10 2327 addq.ph t7, s5, s7 // tmp7 2328 subq.ph s5, s5, s7 // tmp11 ... 2329 addq.ph v1, v0, s6 // z5 ... 2330 mulq_s.ph s5, s5, t8 // ... tmp11 2331 lw t8, 8(AT) // FIX(1.847759065) 2332 lw s4, 0(AT) // FIX(1.082392200) 2333 addq.ph s0, t0, t7 // tmp0 + tmp7 2334 subq.ph s7, t0, t7 // tmp0 - tmp7 2335 mulq_s.ph v1, v1, t8 // ... z5 2336 lw a3, 0(a1) 2337 lw t8, 12(AT) // FIX(-2.613125930) 2338 shll_s.ph s5, s5, 1 // x2 2339 addu a3, a3, a2 2340 shll_s.ph v0, v0, 1 // x4 2341 mulq_s.ph v0, v0, t8 // tmp12 ... 2342 mulq_s.ph s4, s6, s4 // tmp10 ... 2343 shll_s.ph v1, v1, 1 // x2 2344 addiu a0, a0, 32 2345 addiu a1, a1, 8 2346 shll_s.ph s6, v0, 1 // x4 2347 shll_s.ph s4, s4, 1 // x2 2348 addq.ph s6, s6, v1 // ... tmp12 2349 shll_s.ph s0, s0, 2 2350 subq.ph t5, s6, t7 // tmp6 2351 subq.ph s4, s4, v1 // ... 
tmp10 2352 subq.ph t3, s5, t5 // tmp5 2353 shll_s.ph s7, s7, 2 2354 addq.ph t1, s4, t3 // tmp4 2355 addq.ph s1, t2, t5 // tmp1 + tmp6 2356 subq.ph s6, t2, t5 // tmp1 - tmp6 2357 addq.ph s2, t4, t3 // tmp2 + tmp5 2358 subq.ph s5, t4, t3 // tmp2 - tmp5 2359 addq.ph s4, t6, t1 // tmp3 + tmp4 2360 subq.ph s3, t6, t1 // tmp3 - tmp4 2361 shll_s.ph s1, s1, 2 2362 shll_s.ph s2, s2, 2 2363 shll_s.ph s3, s3, 2 2364 shll_s.ph s4, s4, 2 2365 shll_s.ph s5, s5, 2 2366 shll_s.ph s6, s6, 2 2367 precrq.ph.w t0, s1, s0 // B A 2368 ins s0, s1, 16, 16 // b a 2369 precrq.ph.w t2, s3, s2 // D C 2370 ins s2, s3, 16, 16 // d c 2371 precrq.ph.w t4, s5, s4 // F E 2372 ins s4, s5, 16, 16 // f e 2373 precrq.ph.w t6, s7, s6 // H G 2374 ins s6, s7, 16, 16 // h g 2375 precrq.qb.ph t0, t2, t0 // D C B A 2376 precrq.qb.ph s0, s2, s0 // d c b a 2377 precrq.qb.ph t4, t6, t4 // H G F E 2378 precrq.qb.ph s4, s6, s4 // h g f e 2379 addu.qb s0, s0, s8 2380 addu.qb s4, s4, s8 2381 sw s0, 0(a3) // outptr[0/1/2/3] d c b a 2382 sw s4, 4(a3) // outptr[4/5/6/7] h g f e 2383 lw a3, -4(a1) 2384 addu.qb t0, t0, s8 2385 addu a3, a3, a2 2386 addu.qb t4, t4, s8 2387 sw t0, 0(a3) // outptr[0/1/2/3] D C B A 2388 bne a0, t9, 0b 2389 sw t4, 4(a3) // outptr[4/5/6/7] H G F E 2390 23912: 2392 2393 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2394 2395 j ra 2396 nop 2397 2398END(jsimd_idct_ifast_rows_dspr2) 2399 2400 2401/*****************************************************************************/ 2402LEAF_DSPR2(jsimd_fdct_islow_dspr2) 2403/* 2404 * a0 = data 2405 */ 2406 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2407 2408 lui t0, 6437 2409 ori t0, 2260 2410 lui t1, 9633 2411 ori t1, 11363 2412 lui t2, 0xd39e 2413 ori t2, 0xe6dc 2414 lui t3, 0xf72d 2415 ori t3, 9633 2416 lui t4, 2261 2417 ori t4, 9633 2418 lui t5, 0xd39e 2419 ori t5, 6437 2420 lui t6, 9633 2421 ori t6, 0xd39d 2422 lui t7, 0xe6dc 2423 ori t7, 2260 2424 lui t8, 4433 2425 ori t8, 10703 2426 lui t9, 0xd630 2427 ori t9, 
4433 2428 li s8, 8 2429 move a1, a0 24301: 2431 lw s0, 0(a1) // tmp0 = 1|0 2432 lw s1, 4(a1) // tmp1 = 3|2 2433 lw s2, 8(a1) // tmp2 = 5|4 2434 lw s3, 12(a1) // tmp3 = 7|6 2435 packrl.ph s1, s1, s1 // tmp1 = 2|3 2436 packrl.ph s3, s3, s3 // tmp3 = 6|7 2437 subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4 2438 subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7 2439 mult $0, $0 // ac0 = 0 2440 dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260 2441 dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363 2442 mult $ac1, $0, $0 // ac1 = 0 2443 dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436 2444 dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633 2445 mult $ac2, $0, $0 // ac2 = 0 2446 dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633 2447 dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437 2448 mult $ac3, $0, $0 // ac3 = 0 2449 dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363 2450 dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260 2451 addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3 2452 addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0 2453 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2454 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2455 extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11 2456 extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11 2457 addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10 2458 subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13 2459 sh s0, 2(a1) 2460 sh s1, 6(a1) 2461 sh s2, 10(a1) 2462 sh s3, 14(a1) 2463 mult $0, $0 // ac0 = 0 2464 dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703 2465 mult $ac1, $0, $0 // ac1 = 0 2466 dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433 2467 sra s4, s5, 16 // tmp4 = t11 2468 addiu a1, a1, 16 2469 addiu s8, s8, -1 2470 extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11 2471 extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11 2472 addu s2, s5, s4 // tmp2 = t10 + t11 2473 subu s3, s5, s4 // tmp3 = t10 - t11 2474 sll s2, s2, 2 // tmp2 = (t10 + t11) << 2 2475 sll s3, s3, 2 // tmp3 = (t10 - t11) << 
2 2476 sh s2, -16(a1) 2477 sh s3, -8(a1) 2478 sh s0, -12(a1) 2479 bgtz s8, 1b 2480 sh s1, -4(a1) 2481 li t0, 2260 2482 li t1, 11363 2483 li t2, 9633 2484 li t3, 6436 2485 li t4, 6437 2486 li t5, 2261 2487 li t6, 11362 2488 li t7, 2259 2489 li t8, 4433 2490 li t9, 10703 2491 li a1, 10704 2492 li s8, 8 2493 24942: 2495 lh a2, 0(a0) // 0 2496 lh a3, 16(a0) // 8 2497 lh v0, 32(a0) // 16 2498 lh v1, 48(a0) // 24 2499 lh s4, 64(a0) // 32 2500 lh s5, 80(a0) // 40 2501 lh s6, 96(a0) // 48 2502 lh s7, 112(a0) // 56 2503 addu s2, v0, s5 // tmp2 = 16 + 40 2504 subu s5, v0, s5 // tmp5 = 16 - 40 2505 addu s3, v1, s4 // tmp3 = 24 + 32 2506 subu s4, v1, s4 // tmp4 = 24 - 32 2507 addu s0, a2, s7 // tmp0 = 0 + 56 2508 subu s7, a2, s7 // tmp7 = 0 - 56 2509 addu s1, a3, s6 // tmp1 = 8 + 48 2510 subu s6, a3, s6 // tmp6 = 8 - 48 2511 addu a2, s0, s3 // tmp10 = tmp0 + tmp3 2512 subu v1, s0, s3 // tmp13 = tmp0 - tmp3 2513 addu a3, s1, s2 // tmp11 = tmp1 + tmp2 2514 subu v0, s1, s2 // tmp12 = tmp1 - tmp2 2515 mult s7, t1 // ac0 = tmp7 * c1 2516 madd s4, t0 // ac0 += tmp4 * c0 2517 madd s5, t4 // ac0 += tmp5 * c4 2518 madd s6, t2 // ac0 += tmp6 * c2 2519 mult $ac1, s7, t2 // ac1 = tmp7 * c2 2520 msub $ac1, s4, t3 // ac1 -= tmp4 * c3 2521 msub $ac1, s5, t6 // ac1 -= tmp5 * c6 2522 msub $ac1, s6, t7 // ac1 -= tmp6 * c7 2523 mult $ac2, s7, t4 // ac2 = tmp7 * c4 2524 madd $ac2, s4, t2 // ac2 += tmp4 * c2 2525 madd $ac2, s5, t5 // ac2 += tmp5 * c5 2526 msub $ac2, s6, t6 // ac2 -= tmp6 * c6 2527 mult $ac3, s7, t0 // ac3 = tmp7 * c0 2528 msub $ac3, s4, t1 // ac3 -= tmp4 * c1 2529 madd $ac3, s5, t2 // ac3 += tmp5 * c2 2530 msub $ac3, s6, t3 // ac3 -= tmp6 * c3 2531 extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15 2532 extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15 2533 extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15 2534 extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15 2535 addiu s8, s8, -1 2536 addu s4, a2, a3 // tmp4 = tmp10 + tmp11 2537 subu s5, a2, a3 // tmp5 = tmp10 - tmp11 
/*
 * NOTE(review): continuation of jsimd_fdct_islow_dspr2 (pass 2 tail).
 * The function entry and pass 1 appear earlier in this file.  Register
 * roles here follow the pass-2 loop: a0 walks the data columns, s8 is
 * the column counter, t8/t9/a1 hold the fixed-point constants c8/c9/c10.
 */
        sh      s0, 16(a0)
        sh      s1, 48(a0)
        sh      s2, 80(a0)
        sh      s3, 112(a0)
        mult    v0, t8                  // ac0 = tmp12 * c8
        madd    v1, t9                  // ac0 += tmp13 * c9
        mult    $ac1, v1, t8            // ac1 = tmp13 * c8
        msub    $ac1, v0, a1            // ac1 -= tmp12 * c10
        addiu   a0, a0, 2               // advance to next column (int16)
        extr_r.w s6, $ac0, 15           // tmp6 = (ac0 + 16384) >> 15
        extr_r.w s7, $ac1, 15           // tmp7 = (ac1 + 16384) >> 15
        shra_r.w s4, s4, 2              // tmp4 = (tmp4 + 2) >> 2
        shra_r.w s5, s5, 2              // tmp5 = (tmp5 + 2) >> 2
        sh      s4, -2(a0)
        sh      s5, 62(a0)
        sh      s6, 30(a0)
        bgtz    s8, 2b                  // loop over all 8 columns
        sh      s7, 94(a0)              // (branch delay slot)

        RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8

        jr      ra
        nop

END(jsimd_fdct_islow_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
/*
 * a0 = data
 *
 * Fast integer forward DCT on one 8x8 block of int16 coefficients,
 * in place.  Two passes: pass 0 processes the 8 rows (16 bytes each,
 * a0 .. a0+128), pass 1 processes the 8 columns (stride 16 bytes).
 * Constants are 8.8 fixed-point pairs packed two-per-word for the
 * DSPr2 dual-halfword MAC instructions (dpa.w.ph etc.).
 */
        .set at

        SAVE_REGS_ON_STACK 8, s0, s1

        li      a1, 0x014e014e          // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
        li      a2, 0x008b008b          // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
        li      a3, 0x00620062          // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
        li      s1, 0x00b500b5          // FIX_0_707106781 (181 << 16)|(181 & 0xffff)

        /* Pass 0: rows.  v0 walks the block row by row until v1. */
        move    v0, a0
        addiu   v1, v0, 128             // end address

0:
        lw      t0, 0(v0)               // tmp0 = 1|0
        lw      t1, 4(v0)               // tmp1 = 3|2
        lw      t2, 8(v0)               // tmp2 = 5|4
        lw      t3, 12(v0)              // tmp3 = 7|6
        packrl.ph t1, t1, t1            // tmp1 = 2|3
        packrl.ph t3, t3, t3            // tmp3 = 6|7
        subq.ph t7, t1, t2              // tmp7 = 2-5|3-4 = t5|t4
        subq.ph t5, t0, t3              // tmp5 = 1-6|0-7 = t6|t7
        addq.ph t6, t1, t2              // tmp6 = 2+5|3+4 = t2|t3
        addq.ph t4, t0, t3              // tmp4 = 1+6|0+7 = t1|t0
        addq.ph t8, t4, t6              // tmp5 = t1+t2|t0+t3 = t11|t10
        subq.ph t9, t4, t6              // tmp7 = t1-t2|t0-t3 = t12|t13
        sra     t4, t8, 16              // tmp4 = t11
        mult    $0, $0                  // ac0 = 0
        dpa.w.ph $ac0, t9, s1           // ac0 += t12*181 + t13*181
        mult    $ac1, $0, $0            // ac1 = 0
        dpa.w.ph $ac1, t7, a3           // ac1 += t4*98 + t5*98
        dpsx.w.ph $ac1, t5, a3          // ac1 += t6*98 + t7*98
        mult    $ac2, $0, $0            // ac2 = 0
        dpa.w.ph $ac2, t7, a2           // ac2 += t4*139 + t5*139
        mult    $ac3, $0, $0            // ac3 = 0
        dpa.w.ph $ac3, t5, a1           // ac3 += t6*334 + t7*334
        precrq.ph.w t0, t5, t7          // t0 = t5|t6
        addq.ph t2, t8, t4              // tmp2 = t10 + t11
        subq.ph t3, t8, t4              // tmp3 = t10 - t11
        extr.w  t4, $ac0, 8
        mult    $0, $0                  // ac0 = 0
        dpa.w.ph $ac0, t0, s1           // ac0 += t5*181 + t6*181
        extr.w  t0, $ac1, 8             // t0 = z5
        extr.w  t1, $ac2, 8             // t1 = MULTIPLY(tmp10, 139)
        extr.w  t7, $ac3, 8             // t2 = MULTIPLY(tmp12, 334)
        extr.w  t8, $ac0, 8             // t8 = z3 = MULTIPLY(tmp11, 181)
        add     t6, t1, t0              // t6 = z2
        add     t7, t7, t0              // t7 = z4
        subq.ph t0, t5, t8              // t0 = z13 = tmp7 - z3
        addq.ph t8, t5, t8              // t9 = z11 = tmp7 + z3
        addq.ph t1, t0, t6              // t1 = z13 + z2
        subq.ph t6, t0, t6              // t6 = z13 - z2
        addq.ph t0, t8, t7              // t0 = z11 + z4
        subq.ph t7, t8, t7              // t7 = z11 - z4
        addq.ph t5, t4, t9
        subq.ph t4, t9, t4
        sh      t2, 0(v0)
        sh      t5, 4(v0)
        sh      t3, 8(v0)
        sh      t4, 12(v0)
        sh      t1, 10(v0)
        sh      t6, 6(v0)
        sh      t0, 2(v0)
        sh      t7, 14(v0)
        addiu   v0, 16                  // next row
        bne     v1, v0, 0b
        nop                             // (branch delay slot)

        /* Pass 1: columns.  v0 walks 8 columns; elements are 16 bytes apart. */
        move    v0, a0
        addiu   v1, v0, 16

1:
        lh      t0, 0(v0)               // 0
        lh      t1, 16(v0)              // 8
        lh      t2, 32(v0)              // 16
        lh      t3, 48(v0)              // 24
        lh      t4, 64(v0)              // 32
        lh      t5, 80(v0)              // 40
        lh      t6, 96(v0)              // 48
        lh      t7, 112(v0)             // 56
        add     t8, t0, t7              // t8 = tmp0
        sub     t7, t0, t7              // t7 = tmp7
        add     t0, t1, t6              // t0 = tmp1
        sub     t1, t1, t6              // t1 = tmp6
        add     t6, t2, t5              // t6 = tmp2
        sub     t5, t2, t5              // t5 = tmp5
        add     t2, t3, t4              // t2 = tmp3
        sub     t3, t3, t4              // t3 = tmp4
        add     t4, t8, t2              // t4 = tmp10 = tmp0 + tmp3
        sub     t8, t8, t2              // t8 = tmp13 = tmp0 - tmp3
        sub     s0, t0, t6              // s0 = tmp12 = tmp1 - tmp2
        ins     t8, s0, 16, 16          // t8 = tmp12|tmp13
        add     t2, t0, t6              // t2 = tmp11 = tmp1 + tmp2
        mult    $0, $0                  // ac0 = 0
        dpa.w.ph $ac0, t8, s1           // ac0 += t12*181 + t13*181
        add     s0, t4, t2              // t8 = tmp10+tmp11
        sub     t4, t4, t2              // t4 = tmp10-tmp11
        sh      s0, 0(v0)
        sh      t4, 64(v0)
        extr.w  t2, $ac0, 8             // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
        addq.ph t4, t8, t2              // t9 = tmp13 + z1
        subq.ph t8, t8, t2              // t2 = tmp13 - z1
        sh      t4, 32(v0)
        sh      t8, 96(v0)
        add     t3, t3, t5              // t3 = tmp10 = tmp4 + tmp5
        add     t0, t5, t1              // t0 = tmp11 = tmp5 + tmp6
        add     t1, t1, t7              // t1 = tmp12 = tmp6 + tmp7
        andi    t4, a1, 0xffff          // low halfword of packed constant
        mul     s0, t1, t4
        sra     s0, s0, 8               // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
        ins     t1, t3, 16, 16          // t1 = tmp10|tmp12
        mult    $0, $0                  // ac0 = 0
        mulsa.w.ph $ac0, t1, a3         // ac0 += t10*98 - t12*98
        extr.w  t8, $ac0, 8             // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
        add     t2, t7, t8              // t2 = tmp7 + z5
        sub     t7, t7, t8              // t7 = tmp7 - z5
        andi    t4, a2, 0xffff
        mul     t8, t3, t4
        sra     t8, t8, 8               // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
        andi    t4, s1, 0xffff
        mul     t6, t0, t4
        sra     t6, t6, 8               // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
        add     t0, t6, t8              // t0 = z3 + z2
        sub     t1, t6, t8              // t1 = z3 - z2
        add     t3, t6, s0              // t3 = z3 + z4
        sub     t4, t6, s0              // t4 = z3 - z4
        sub     t5, t2, t1              // t5 = dataptr[5]
        sub     t6, t7, t0              // t6 = dataptr[3]
        add     t3, t2, t3              // t3 = dataptr[1]
        add     t4, t7, t4              // t4 = dataptr[7]
        sh      t5, 80(v0)
        sh      t6, 48(v0)
        sh      t3, 16(v0)
        sh      t4, 112(v0)
        addiu   v0, 2                   // next column
        bne     v0, v1, 1b
        nop                             // (branch delay slot)

        RESTORE_REGS_FROM_STACK 8, s0, s1

        j       ra
        nop
END(jsimd_fdct_ifast_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 *
 * Quantizes the 64 workspace coefficients into coef_block, two at a
 * time.  For each value: take abs (multiply by the sign, reconstructed
 * from the sign bit), multiply by the 16-bit reciprocal plus
 * correction, shift right by the per-coefficient scale, then restore
 * the sign.  NOTE(review): the divisors table is addressed at offsets
 * 0, 128, and 384 — presumably reciprocal / correction / shift rows of
 * the libjpeg-turbo divisor layout; confirm against the C setup code.
 */
        .set at

        SAVE_REGS_ON_STACK 16, s0, s1, s2

        addiu   v0, a2, 124             // v0 = workspace_end
        lh      t0, 0(a2)
        lh      t1, 0(a1)
        lh      t2, 128(a1)
        sra     t3, t0, 15              // t3 = sign mask (all 0s or all 1s)
        sll     t3, t3, 1
        addiu   t3, t3, 1               // t3 = sign = +1 or -1
        mul     t0, t0, t3              // t0 = abs(coef)
        lh      t4, 384(a1)
        lh      t5, 130(a1)
        lh      t6, 2(a2)
        lh      t7, 2(a1)
        lh      t8, 386(a1)

1:
        andi    t1, 0xffff
        add     t9, t0, t2
        andi    t9, 0xffff
        mul     v1, t9, t1              // (abs + correction) * reciprocal
        sra     s0, t6, 15              // sign of the odd coefficient
        sll     s0, s0, 1
        addiu   s0, s0, 1
        addiu   t9, t4, 16
        srav    v1, v1, t9              // descale
        mul     v1, v1, t3              // reapply sign
        mul     t6, t6, s0
        andi    t7, 0xffff
        addiu   a2, a2, 4
        addiu   a1, a1, 4
        add     s1, t6, t5
        andi    s1, 0xffff
        sh      v1, 0(a0)

        mul     s2, s1, t7
        addiu   s1, t8, 16
        srav    s2, s2, s1
        mul     s2, s2, s0
        lh      t0, 0(a2)
        lh      t1, 0(a1)
        sra     t3, t0, 15
        sll     t3, t3, 1
        addiu   t3, t3, 1
        mul     t0, t0, t3
        lh      t2, 128(a1)
        lh      t4, 384(a1)
        lh      t5, 130(a1)
        lh      t8, 386(a1)
        lh      t6, 2(a2)
        lh      t7, 2(a1)
        sh      s2, 2(a0)
        lh      t0, 0(a2)
        sra     t3, t0, 15
        sll     t3, t3, 1
        addiu   t3, t3, 1
        mul     t0, t0, t3
        bne     a2, v0, 1b
        addiu   a0, a0, 4               // (branch delay slot)

        /* Epilogue: quantize the final pair loaded above. */
        andi    t1, 0xffff
        add     t9, t0, t2
        andi    t9, 0xffff
        mul     v1, t9, t1
        sra     s0, t6, 15
        sll     s0, s0, 1
        addiu   s0, s0, 1
        addiu   t9, t4, 16
        srav    v1, v1, t9
        mul     v1, v1, t3
        mul     t6, t6, s0
        andi    t7, 0xffff
        sh      v1, 0(a0)
        add     s1, t6, t5
        andi    s1, 0xffff
        mul     s2, s1, t7
        addiu   s1, t8, 16
        addiu   a2, a2, 4
        addiu   a1, a1, 4
        srav    s2, s2, s1
        mul     s2, s2, s0
        sh      s2, 2(a0)

        RESTORE_REGS_FROM_STACK 16, s0, s1, s2

        j       ra
        nop

END(jsimd_quantize_dspr2)


#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_quantize_float_dspr2)
/*
 * a0 = coef_block
 * a1 = divisors
 * a2 = workspace
 *
 * Floating-point quantization, 8 coefficients per iteration (t0 counts
 * down 63..0 in steps of 8).  Rounding trick: computes
 * 16384.5 + value*divisor with madd.s, truncates toward zero, then
 * subtracts 16384 — yielding round-to-nearest without an FP rounding
 * mode change.
 */
        .set at

        li      t1, 0x46800100          // integer representation 16384.5
        mtc1    t1, f0
        li      t0, 63
0:
        lwc1    f2, 0(a2)
        lwc1    f10, 0(a1)
        lwc1    f4, 4(a2)
        lwc1    f12, 4(a1)
        lwc1    f6, 8(a2)
        lwc1    f14, 8(a1)
        lwc1    f8, 12(a2)
        lwc1    f16, 12(a1)
        madd.s  f2, f0, f2, f10         // f2 = 16384.5 + value * divisor
        madd.s  f4, f0, f4, f12
        madd.s  f6, f0, f6, f14
        madd.s  f8, f0, f8, f16
        lwc1    f10, 16(a1)
        lwc1    f12, 20(a1)
        trunc.w.s f2, f2
        trunc.w.s f4, f4
        trunc.w.s f6, f6
        trunc.w.s f8, f8
        lwc1    f14, 24(a1)
        lwc1    f16, 28(a1)
        mfc1    t1, f2
        mfc1    t2, f4
        mfc1    t3, f6
        mfc1    t4, f8
        lwc1    f2, 16(a2)
        lwc1    f4, 20(a2)
        lwc1    f6, 24(a2)
        lwc1    f8, 28(a2)
        madd.s  f2, f0, f2, f10
        madd.s  f4, f0, f4, f12
        madd.s  f6, f0, f6, f14
        madd.s  f8, f0, f8, f16
        addiu   t1, t1, -16384          // remove rounding bias
        addiu   t2, t2, -16384
        addiu   t3, t3, -16384
        addiu   t4, t4, -16384
        trunc.w.s f2, f2
        trunc.w.s f4, f4
        trunc.w.s f6, f6
        trunc.w.s f8, f8
        sh      t1, 0(a0)
        sh      t2, 2(a0)
        sh      t3, 4(a0)
        sh      t4, 6(a0)
        mfc1    t1, f2
        mfc1    t2, f4
        mfc1    t3, f6
        mfc1    t4, f8
        addiu   t0, t0, -8
        addiu   a2, a2, 32
        addiu   a1, a1, 32
        addiu   t1, t1, -16384
        addiu   t2, t2, -16384
        addiu   t3, t3, -16384
        addiu   t4, t4, -16384
        sh      t1, 8(a0)
        sh      t2, 10(a0)
        sh      t3, 12(a0)
        sh      t4, 14(a0)
        bgez    t0, 0b
        addiu   a0, a0, 16              // (branch delay slot)

        j       ra
        nop

END(jsimd_quantize_float_dspr2)

#endif


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_2x2_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * Reduced-size inverse DCT producing a 2x2 output block.  Pass 1 is
 * fully unrolled (one section per processed column, using a 40-byte
 * scratch area carved from the stack at v0); pass 2 combines the
 * scratch values and writes two bytes to each of two output rows,
 * saturating via shll_s.w/sra and level-shifting by +128.
 */
        .set at

        SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

        addiu   sp, sp, -40             // scratch workspace
        move    v0, sp
        addiu   s2, zero, 29692
        addiu   s3, zero, -10426
        addiu   s4, zero, 6967
        addiu   s5, zero, -5906
        lh      t0, 0(a1)               // t0 = inptr[DCTSIZE*0]
        lh      t5, 0(a0)               // t5 = quantptr[DCTSIZE*0]
        lh      t1, 48(a1)              // t1 = inptr[DCTSIZE*3]
        lh      t6, 48(a0)              // t6 = quantptr[DCTSIZE*3]
        mul     t4, t5, t0
        lh      t0, 16(a1)              // t0 = inptr[DCTSIZE*1]
        lh      t5, 16(a0)              // t5 = quantptr[DCTSIZE*1]
        mul     t6, t6, t1
        mul     t5, t5, t0
        lh      t2, 80(a1)              // t2 = inptr[DCTSIZE*5]
        lh      t7, 80(a0)              // t7 = quantptr[DCTSIZE*5]
        lh      t3, 112(a1)             // t3 = inptr[DCTSIZE*7]
        lh      t8, 112(a0)             // t8 = quantptr[DCTSIZE*7]
        mul     t7, t7, t2
        mult    zero, zero              // ac0 = 0
        mul     t8, t8, t3
        li      s0, 0x73FCD746          // s0 = (29692 << 16) | (-10426 & 0xffff)
        li      s1, 0x1B37E8EE          // s1 = (6967 << 16) | (-5906 & 0xffff)
        ins     t6, t5, 16, 16          // t6 = t5|t6
        sll     t4, t4, 15
        dpa.w.ph $ac0, t6, s0
        lh      t1, 2(a1)
        lh      t6, 2(a0)
        ins     t8, t7, 16, 16          // t8 = t7|t8
        dpa.w.ph $ac0, t8, s1
        mflo    t0, $ac0
        mul     t5, t6, t1
        lh      t1, 18(a1)
        lh      t6, 18(a0)
        lh      t2, 50(a1)
        lh      t7, 50(a0)
        mul     t6, t6, t1
        subu    t8, t4, t0
        mul     t7, t7, t2
        addu    t0, t4, t0
        shra_r.w t0, t0, 13
        lh      t1, 82(a1)
        lh      t2, 82(a0)
        lh      t3, 114(a1)
        lh      t4, 114(a0)
        shra_r.w t8, t8, 13
        mul     t1, t1, t2
        mul     t3, t3, t4
        sw      t0, 0(v0)
        sw      t8, 20(v0)
        /* column at offset 2 */
        sll     t4, t5, 15
        ins     t7, t6, 16, 16
        mult    zero, zero
        dpa.w.ph $ac0, t7, s0
        ins     t3, t1, 16, 16
        lh      t1, 6(a1)
        lh      t6, 6(a0)
        dpa.w.ph $ac0, t3, s1
        mflo    t0, $ac0
        mul     t5, t6, t1
        lh      t1, 22(a1)
        lh      t6, 22(a0)
        lh      t2, 54(a1)
        lh      t7, 54(a0)
        mul     t6, t6, t1
        subu    t8, t4, t0
        mul     t7, t7, t2
        addu    t0, t4, t0
        shra_r.w t0, t0, 13
        lh      t1, 86(a1)
        lh      t2, 86(a0)
        lh      t3, 118(a1)
        lh      t4, 118(a0)
        shra_r.w t8, t8, 13
        mul     t1, t1, t2
        mul     t3, t3, t4
        sw      t0, 4(v0)
        sw      t8, 24(v0)
        /* column at offset 6 */
        sll     t4, t5, 15
        ins     t7, t6, 16, 16
        mult    zero, zero
        dpa.w.ph $ac0, t7, s0
        ins     t3, t1, 16, 16
        lh      t1, 10(a1)
        lh      t6, 10(a0)
        dpa.w.ph $ac0, t3, s1
        mflo    t0, $ac0
        mul     t5, t6, t1
        lh      t1, 26(a1)
        lh      t6, 26(a0)
        lh      t2, 58(a1)
        lh      t7, 58(a0)
        mul     t6, t6, t1
        subu    t8, t4, t0
        mul     t7, t7, t2
        addu    t0, t4, t0
        shra_r.w t0, t0, 13
        lh      t1, 90(a1)
        lh      t2, 90(a0)
        lh      t3, 122(a1)
        lh      t4, 122(a0)
        shra_r.w t8, t8, 13
        mul     t1, t1, t2
        mul     t3, t3, t4
        sw      t0, 8(v0)
        sw      t8, 28(v0)
        /* column at offset 10 */
        sll     t4, t5, 15
        ins     t7, t6, 16, 16
        mult    zero, zero
        dpa.w.ph $ac0, t7, s0
        ins     t3, t1, 16, 16
        lh      t1, 14(a1)
        lh      t6, 14(a0)
        dpa.w.ph $ac0, t3, s1
        mflo    t0, $ac0
        mul     t5, t6, t1
        lh      t1, 30(a1)
        lh      t6, 30(a0)
        lh      t2, 62(a1)
        lh      t7, 62(a0)
        mul     t6, t6, t1
        subu    t8, t4, t0
        mul     t7, t7, t2
        addu    t0, t4, t0
        shra_r.w t0, t0, 13
        lh      t1, 94(a1)
        lh      t2, 94(a0)
        lh      t3, 126(a1)
        lh      t4, 126(a0)
        shra_r.w t8, t8, 13
        mul     t1, t1, t2
        mul     t3, t3, t4
        sw      t0, 12(v0)
        sw      t8, 32(v0)
        /* last column, then pass 2 */
        sll     t4, t5, 15
        ins     t7, t6, 16, 16
        mult    zero, zero
        dpa.w.ph $ac0, t7, s0
        ins     t3, t1, 16, 16
        dpa.w.ph $ac0, t3, s1
        mflo    t0, $ac0
        lw      t9, 0(a2)
        lw      t3, 0(v0)
        lw      t7, 4(v0)
        lw      t1, 8(v0)
        addu    t9, t9, a3
        sll     t3, t3, 15
        subu    t8, t4, t0
        addu    t0, t4, t0
        shra_r.w t0, t0, 13
        shra_r.w t8, t8, 13
        sw      t0, 16(v0)
        sw      t8, 36(v0)
        lw      t5, 12(v0)
        lw      t6, 16(v0)
        mult    t7, s2
        madd    t1, s3
        madd    t5, s4
        madd    t6, s5
        lw      t5, 24(v0)
        lw      t7, 28(v0)
        mflo    t0, $ac0
        lw      t8, 32(v0)
        lw      t2, 36(v0)
        mult    $ac1, t5, s2
        madd    $ac1, t7, s3
        madd    $ac1, t8, s4
        madd    $ac1, t2, s5
        addu    t1, t3, t0
        subu    t6, t3, t0
        shra_r.w t1, t1, 20
        shra_r.w t6, t6, 20
        mflo    t4, $ac1
        shll_s.w t1, t1, 24             // saturate to 8 bits
        shll_s.w t6, t6, 24
        sra     t1, t1, 24
        sra     t6, t6, 24
        addiu   t1, t1, 128             // level shift
        addiu   t6, t6, 128
        lw      t0, 20(v0)
        sb      t1, 0(t9)
        sb      t6, 1(t9)
        sll     t0, t0, 15
        lw      t9, 4(a2)
        addu    t1, t0, t4
        subu    t6, t0, t4
        addu    t9, t9, a3
        shra_r.w t1, t1, 20
        shra_r.w t6, t6, 20
        shll_s.w t1, t1, 24
        shll_s.w t6, t6, 24
        sra     t1, t1, 24
        sra     t6, t6, 24
        addiu   t1, t1, 128
        addiu   t6, t6, 128
        sb      t1, 0(t9)
        sb      t6, 1(t9)
        addiu   sp, sp, 40              // release scratch workspace

        RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

        j       ra
        nop

END(jsimd_idct_2x2_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_4x4_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 * 16(sp) = workspace[DCTSIZE*4];  // buffers data between passes
 *
 * Reduced-size inverse DCT producing a 4x4 output block.  Pass 1
 * (two loops: 4 iterations plus 3 for the odd halves) dequantizes and
 * transforms into the caller-supplied workspace; pass 2 is unrolled
 * four times, one section per output row, with saturation and +128
 * level shift before the byte stores.
 */
        .set at

        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        lw      v1, 48(sp)              // v1 = workspace
        move    t0, a1
        move    t1, v1
        li      t9, 4
        li      s0, 0x2e75f93e
        li      s1, 0x21f9ba79
        li      s2, 0xecc2efb0
        li      s3, 0x52031ccd

0:
        lh      s6, 32(t0)              // inptr[DCTSIZE*2]
        lh      t6, 32(a0)              // quantptr[DCTSIZE*2]
        lh      s7, 96(t0)              // inptr[DCTSIZE*6]
        lh      t7, 96(a0)              // quantptr[DCTSIZE*6]
        mul     t6, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh      s4, 0(t0)               // inptr[DCTSIZE*0]
        mul     t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh      s5, 0(a0)               // quantptr[0]
        li      s6, 15137
        li      s7, 6270
        mul     t2, s4, s5              // tmp0 = (inptr[0] * quantptr[0])
        mul     t6, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh      t5, 112(t0)             // inptr[DCTSIZE*7]
        mul     t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh      s4, 112(a0)             // quantptr[DCTSIZE*7]
        lh      v0, 80(t0)              // inptr[DCTSIZE*5]
        lh      s5, 80(a0)              // quantptr[DCTSIZE*5]
        lh      s6, 48(a0)              // quantptr[DCTSIZE*3]
        sll     t2, t2, 14              // tmp0 <<= (CONST_BITS+1)
        lh      s7, 16(a0)              // quantptr[DCTSIZE*1]
        lh      t8, 16(t0)              // inptr[DCTSIZE*1]
        subu    t6, t6, t7              // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
        lh      t7, 48(t0)              // inptr[DCTSIZE*3]
        mul     t5, s4, t5              // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
        mul     v0, s5, v0              // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
        mul     t7, s6, t7              // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
        mul     t8, s7, t8              // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
        addu    t3, t2, t6              // tmp10 = tmp0 + z2
        subu    t4, t2, t6              // tmp10 = tmp0 - z2
        mult    $ac0, zero, zero
        mult    $ac1, zero, zero
        ins     t5, v0, 16, 16
        ins     t7, t8, 16, 16
        addiu   t9, t9, -1
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        mflo    s4, $ac0
        mflo    s5, $ac1
        addiu   a0, a0, 2
        addiu   t1, t1, 4
        addiu   t0, t0, 2
        addu    t6, t4, s4
        subu    t5, t4, s4
        addu    s6, t3, s5
        subu    s7, t3, s5
        shra_r.w t6, t6, 12             // DESCALE(tmp12 + temp1, 12)
        shra_r.w t5, t5, 12             // DESCALE(tmp12 - temp1, 12)
        shra_r.w s6, s6, 12             // DESCALE(tmp10 + temp2, 12)
        shra_r.w s7, s7, 12             // DESCALE(tmp10 - temp2, 12)
        sw      t6, 28(t1)
        sw      t5, 60(t1)
        sw      s6, -4(t1)
        bgtz    t9, 0b
        sw      s7, 92(t1)              // (branch delay slot)
        // second loop three pass
        li      t9, 3
1:
        lh      s6, 34(t0)              // inptr[DCTSIZE*2]
        lh      t6, 34(a0)              // quantptr[DCTSIZE*2]
        lh      s7, 98(t0)              // inptr[DCTSIZE*6]
        lh      t7, 98(a0)              // quantptr[DCTSIZE*6]
        mul     t6, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh      s4, 2(t0)               // inptr[DCTSIZE*0]
        mul     t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh      s5, 2(a0)               // quantptr[DCTSIZE*0]
        li      s6, 15137
        li      s7, 6270
        mul     t2, s4, s5              // tmp0 = (inptr[0] * quantptr[0])
        mul     v0, s6, t6              // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
        lh      t5, 114(t0)             // inptr[DCTSIZE*7]
        mul     t7, s7, t7              // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
        lh      s4, 114(a0)             // quantptr[DCTSIZE*7]
        lh      s5, 82(a0)              // quantptr[DCTSIZE*5]
        lh      t6, 82(t0)              // inptr[DCTSIZE*5]
        sll     t2, t2, 14              // tmp0 <<= (CONST_BITS+1)
        lh      s6, 50(a0)              // quantptr[DCTSIZE*3]
        lh      t8, 18(t0)              // inptr[DCTSIZE*1]
        subu    v0, v0, t7              // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
        lh      t7, 50(t0)              // inptr[DCTSIZE*3]
        lh      s7, 18(a0)              // quantptr[DCTSIZE*1]
        mul     t5, s4, t5              // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
        mul     t6, s5, t6              // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
        mul     t7, s6, t7              // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
        mul     t8, s7, t8              // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
        addu    t3, t2, v0              // tmp10 = tmp0 + z2
        subu    t4, t2, v0              // tmp10 = tmp0 - z2
        mult    $ac0, zero, zero
        mult    $ac1, zero, zero
        ins     t5, t6, 16, 16
        ins     t7, t8, 16, 16
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        mflo    t5, $ac0
        mflo    t6, $ac1
        addiu   t9, t9, -1
        addiu   t0, t0, 2
        addiu   a0, a0, 2
        addiu   t1, t1, 4
        addu    s5, t4, t5
        subu    s4, t4, t5
        addu    s6, t3, t6
        subu    s7, t3, t6
        shra_r.w s5, s5, 12             // DESCALE(tmp12 + temp1, 12)
        shra_r.w s4, s4, 12             // DESCALE(tmp12 - temp1, 12)
        shra_r.w s6, s6, 12             // DESCALE(tmp10 + temp2, 12)
        shra_r.w s7, s7, 12             // DESCALE(tmp10 - temp2, 12)
        sw      s5, 32(t1)
        sw      s4, 64(t1)
        sw      s6, 0(t1)
        bgtz    t9, 1b
        sw      s7, 96(t1)              // (branch delay slot)
        /* Pass 2, row 0 */
        move    t1, v1
        li      s4, 15137
        lw      s6, 8(t1)               // wsptr[2]
        li      s5, 6270
        lw      s7, 24(t1)              // wsptr[6]
        mul     s4, s4, s6              // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
        lw      t2, 0(t1)               // wsptr[0]
        mul     s5, s5, s7              // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
        lh      t5, 28(t1)              // wsptr[7]
        lh      t6, 20(t1)              // wsptr[5]
        lh      t7, 12(t1)              // wsptr[3]
        lh      t8, 4(t1)               // wsptr[1]
        ins     t5, t6, 16, 16
        ins     t7, t8, 16, 16
        mult    $ac0, zero, zero
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        mult    $ac1, zero, zero
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        sll     t2, t2, 14              // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
        mflo    s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu    s4, s4, s5
        addu    t3, t2, s4              // tmp10 = tmp0 + z2
        mflo    s7, $ac1
        subu    t4, t2, s4              // tmp10 = tmp0 - z2
        addu    t7, t4, s6
        subu    t8, t4, s6
        addu    t5, t3, s7
        subu    t6, t3, s7
        shra_r.w t5, t5, 19             // DESCALE(tmp10 + temp2, 19)
        shra_r.w t6, t6, 19             // DESCALE(tmp10 - temp2, 19)
        shra_r.w t7, t7, 19             // DESCALE(tmp12 + temp1, 19)
        shra_r.w t8, t8, 19             // DESCALE(tmp12 - temp1, 19)
        sll     s4, t9, 2
        lw      v0, 0(a2)               // output_buf[ctr]
        shll_s.w t5, t5, 24
        shll_s.w t6, t6, 24
        shll_s.w t7, t7, 24
        shll_s.w t8, t8, 24
        sra     t5, t5, 24
        sra     t6, t6, 24
        sra     t7, t7, 24
        sra     t8, t8, 24
        addu    v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu   t5, t5, 128
        addiu   t6, t6, 128
        addiu   t7, t7, 128
        addiu   t8, t8, 128
        sb      t5, 0(v0)
        sb      t7, 1(v0)
        sb      t8, 2(v0)
        sb      t6, 3(v0)
        // 2
        li      s4, 15137
        lw      s6, 40(t1)              // wsptr[2]
        li      s5, 6270
        lw      s7, 56(t1)              // wsptr[6]
        mul     s4, s4, s6              // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
        lw      t2, 32(t1)              // wsptr[0]
        mul     s5, s5, s7              // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
        lh      t5, 60(t1)              // wsptr[7]
        lh      t6, 52(t1)              // wsptr[5]
        lh      t7, 44(t1)              // wsptr[3]
        lh      t8, 36(t1)              // wsptr[1]
        ins     t5, t6, 16, 16
        ins     t7, t8, 16, 16
        mult    $ac0, zero, zero
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        mult    $ac1, zero, zero
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        sll     t2, t2, 14              // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
        mflo    s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu    s4, s4, s5
        addu    t3, t2, s4              // tmp10 = tmp0 + z2
        mflo    s7, $ac1
        subu    t4, t2, s4              // tmp10 = tmp0 - z2
        addu    t7, t4, s6
        subu    t8, t4, s6
        addu    t5, t3, s7
        subu    t6, t3, s7
        shra_r.w t5, t5, 19             // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
        shra_r.w t6, t6, 19             // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
        shra_r.w t7, t7, 19             // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
        shra_r.w t8, t8, 19             // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
        sll     s4, t9, 2
        lw      v0, 4(a2)               // output_buf[ctr]
        shll_s.w t5, t5, 24
        shll_s.w t6, t6, 24
        shll_s.w t7, t7, 24
        shll_s.w t8, t8, 24
        sra     t5, t5, 24
        sra     t6, t6, 24
        sra     t7, t7, 24
        sra     t8, t8, 24
        addu    v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu   t5, t5, 128
        addiu   t6, t6, 128
        addiu   t7, t7, 128
        addiu   t8, t8, 128
        sb      t5, 0(v0)
        sb      t7, 1(v0)
        sb      t8, 2(v0)
        sb      t6, 3(v0)
        // 3
        li      s4, 15137
        lw      s6, 72(t1)              // wsptr[2]
        li      s5, 6270
        lw      s7, 88(t1)              // wsptr[6]
        mul     s4, s4, s6              // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
        lw      t2, 64(t1)              // wsptr[0]
        mul     s5, s5, s7              // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
        lh      t5, 92(t1)              // wsptr[7]
        lh      t6, 84(t1)              // wsptr[5]
        lh      t7, 76(t1)              // wsptr[3]
        lh      t8, 68(t1)              // wsptr[1]
        ins     t5, t6, 16, 16
        ins     t7, t8, 16, 16
        mult    $ac0, zero, zero
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        mult    $ac1, zero, zero
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        sll     t2, t2, 14              // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
        mflo    s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu    s4, s4, s5
        addu    t3, t2, s4              // tmp10 = tmp0 + z2
        mflo    s7, $ac1
        subu    t4, t2, s4              // tmp10 = tmp0 - z2
        addu    t7, t4, s6
        subu    t8, t4, s6
        addu    t5, t3, s7
        subu    t6, t3, s7
        shra_r.w t5, t5, 19             // DESCALE(tmp10 + temp2, 19)
        shra_r.w t6, t6, 19             // DESCALE(tmp10 - temp2, 19)
        shra_r.w t7, t7, 19             // DESCALE(tmp12 + temp1, 19)
        shra_r.w t8, t8, 19             // DESCALE(tmp12 - temp1, 19)
        sll     s4, t9, 2
        lw      v0, 8(a2)               // output_buf[ctr]
        shll_s.w t5, t5, 24
        shll_s.w t6, t6, 24
        shll_s.w t7, t7, 24
        shll_s.w t8, t8, 24
        sra     t5, t5, 24
        sra     t6, t6, 24
        sra     t7, t7, 24
        sra     t8, t8, 24
        addu    v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu   t5, t5, 128
        addiu   t6, t6, 128
        addiu   t7, t7, 128
        addiu   t8, t8, 128
        sb      t5, 0(v0)
        sb      t7, 1(v0)
        sb      t8, 2(v0)
        sb      t6, 3(v0)
        /* Pass 2, row 3 */
        li      s4, 15137
        lw      s6, 104(t1)             // wsptr[2]
        li      s5, 6270
        lw      s7, 120(t1)             // wsptr[6]
        mul     s4, s4, s6              // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
        lw      t2, 96(t1)              // wsptr[0]
        mul     s5, s5, s7              // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
        lh      t5, 124(t1)             // wsptr[7]
        lh      t6, 116(t1)             // wsptr[5]
        lh      t7, 108(t1)             // wsptr[3]
        lh      t8, 100(t1)             // wsptr[1]
        ins     t5, t6, 16, 16
        ins     t7, t8, 16, 16
        mult    $ac0, zero, zero
        dpa.w.ph $ac0, t5, s0
        dpa.w.ph $ac0, t7, s1
        mult    $ac1, zero, zero
        dpa.w.ph $ac1, t5, s2
        dpa.w.ph $ac1, t7, s3
        sll     t2, t2, 14              // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
        mflo    s6, $ac0
        // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
        subu    s4, s4, s5
        addu    t3, t2, s4              // tmp10 = tmp0 + z2;
        mflo    s7, $ac1
        subu    t4, t2, s4              // tmp10 = tmp0 - z2;
        addu    t7, t4, s6
        subu    t8, t4, s6
        addu    t5, t3, s7
        subu    t6, t3, s7
        shra_r.w t5, t5, 19             // DESCALE(tmp10 + temp2, 19)
        shra_r.w t6, t6, 19             // DESCALE(tmp10 - temp2, 19)
        shra_r.w t7, t7, 19             // DESCALE(tmp12 + temp1, 19)
        shra_r.w t8, t8, 19             // DESCALE(tmp12 - temp1, 19)
        sll     s4, t9, 2
        lw      v0, 12(a2)              // output_buf[ctr]
        shll_s.w t5, t5, 24
        shll_s.w t6, t6, 24
        shll_s.w t7, t7, 24
        shll_s.w t8, t8, 24
        sra     t5, t5, 24
        sra     t6, t6, 24
        sra     t7, t7, 24
        sra     t8, t8, 24
        addu    v0, v0, a3              // outptr = output_buf[ctr] + output_col
        addiu   t5, t5, 128
        addiu   t6, t6, 128
        addiu   t7, t7, 128
        addiu   t8, t8, 128
        sb      t5, 0(v0)
        sb      t7, 1(v0)
        sb      t8, 2(v0)
        sb      t6, 3(v0)

        RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        j       ra
        nop
END(jsimd_idct_4x4_dspr2)


/*****************************************************************************/
LEAF_DSPR2(jsimd_idct_6x6_dspr2)
/*
 * a0 = compptr->dct_table
 * a1 = coef_block
 * a2 = output_buf
 * a3 = output_col
 *
 * NOTE(review): this function continues past the end of this chunk;
 * only the start of pass 1 is visible here.
 */
        .set at

        SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

        addiu   sp, sp, -144            // local workspace between passes
        move    v0, sp
        addiu   v1, v0, 24
        addiu   t9, zero, 5793
        addiu   s0, zero, 10033
        addiu   s1, zero, 2998

1:
        lh      s2, 0(a0)               // q0 = quantptr[ 0]
        lh      s3, 32(a0)              // q1 = quantptr[16]
        lh      s4, 64(a0)              // q2 = quantptr[32]
        lh      t2, 64(a1)              // tmp2 = inptr[32]
        lh      t1, 32(a1)              // tmp1 = inptr[16]
        lh      t0, 0(a1)               // tmp0 = inptr[ 0]
        mul     t2, t2, s4              // tmp2 = tmp2 * q2
        mul     t1, t1, s3              // tmp1 = tmp1 * q1
        mul     t0, t0, s2              // tmp0 = tmp0 * q0
        lh      t6, 16(a1)              // z1 = inptr[ 8]
        lh      t8, 80(a1)              // z3 = inptr[40]
        lh      t7, 48(a1)              // z2 = inptr[24]
        lh      s2, 16(a0)              // q0 = quantptr[ 8]
        lh      s4, 80(a0)              // q2 = quantptr[40]
        lh      s3, 48(a0)              // q1 = quantptr[24]
        mul     t2, t2, t9              // tmp2 = tmp2 * 5793
        mul     t1, t1, s0              // tmp1 = tmp1 * 10033
        sll     t0, t0, 13              // tmp0 = tmp0 << 13
        mul     t6, t6, s2              // z1 = z1 * q0
        mul     t8, t8, s4              // z3 = z3 * q2
        mul     t7, t7, s3              // z2 = z2 * q1
        addu    t3, t0, t2              // tmp10 = tmp0 + tmp2
        sll     t2, t2, 1               // tmp2 = tmp2 << 2
        subu    t4, t0, t2              // tmp11 = tmp0 - tmp2;
        subu    t5, t3, t1              // tmp12 = tmp10 - tmp1
        addu    t3, t3, t1              // tmp10 = tmp10 + tmp1
        addu    t1, t6, t8              // tmp1 = z1 + z3
        mul     t1, t1, s1              // tmp1 = tmp1 * 2998
        shra_r.w t4, t4, 11             // tmp11 = (tmp11 + 1024) >> 11
        subu    t2, t6, t8              // tmp2 = z1 - z3
        subu    t2, t2, t7              // tmp2 = tmp2 - z2
        sll     t2, t2, 2               // tmp2 = tmp2 << 2
        addu    t0, t6, t7              // tmp0 = z1 + z2
        sll     t0, t0, 13              // tmp0 = tmp0 << 13
        subu    s2, t8, t7              // q0 = z3 - z2
        sll     s2, s2, 13
// q0 = q0 << 13 3535 addu t0, t0, t1 // tmp0 = tmp0 + tmp1 3536 addu t1, s2, t1 // tmp1 = q0 + tmp1 3537 addu s2, t4, t2 // q0 = tmp11 + tmp2 3538 subu s3, t4, t2 // q1 = tmp11 - tmp2 3539 addu t6, t3, t0 // z1 = tmp10 + tmp0 3540 subu t7, t3, t0 // z2 = tmp10 - tmp0 3541 addu t4, t5, t1 // tmp11 = tmp12 + tmp1 3542 subu t5, t5, t1 // tmp12 = tmp12 - tmp1 3543 shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 3544 shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 3545 shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 3546 shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 3547 sw s2, 24(v0) 3548 sw s3, 96(v0) 3549 sw t6, 0(v0) 3550 sw t7, 120(v0) 3551 sw t4, 48(v0) 3552 sw t5, 72(v0) 3553 addiu v0, v0, 4 3554 addiu a1, a1, 2 3555 bne v0, v1, 1b 3556 addiu a0, a0, 2 3557 3558 /* Pass 2: process 6 rows from work array, store into output array. */ 3559 move v0, sp 3560 addiu v1, v0, 144 3561 35622: 3563 lw t0, 0(v0) 3564 lw t2, 16(v0) 3565 lw s5, 0(a2) 3566 addiu t0, t0, 16 3567 sll t0, t0, 13 3568 mul t3, t2, t9 3569 lw t6, 4(v0) 3570 lw t8, 20(v0) 3571 lw t7, 12(v0) 3572 addu s5, s5, a3 3573 addu s6, t6, t8 3574 mul s6, s6, s1 3575 addu t1, t0, t3 3576 subu t4, t0, t3 3577 subu t4, t4, t3 3578 lw t3, 8(v0) 3579 mul t0, t3, s0 3580 addu s7, t6, t7 3581 sll s7, s7, 13 3582 addu s7, s6, s7 3583 subu t2, t8, t7 3584 sll t2, t2, 13 3585 addu t2, s6, t2 3586 subu s6, t6, t7 3587 subu s6, s6, t8 3588 sll s6, s6, 13 3589 addu t3, t1, t0 3590 subu t5, t1, t0 3591 addu t6, t3, s7 3592 subu t3, t3, s7 3593 addu t7, t4, s6 3594 subu t4, t4, s6 3595 addu t8, t5, t2 3596 subu t5, t5, t2 3597 shll_s.w t6, t6, 6 3598 shll_s.w t3, t3, 6 3599 shll_s.w t7, t7, 6 3600 shll_s.w t4, t4, 6 3601 shll_s.w t8, t8, 6 3602 shll_s.w t5, t5, 6 3603 sra t6, t6, 24 3604 addiu t6, t6, 128 3605 sra t3, t3, 24 3606 addiu t3, t3, 128 3607 sb t6, 0(s5) 3608 sra t7, t7, 24 3609 addiu t7, t7, 128 3610 sb t3, 5(s5) 3611 sra t4, t4, 24 3612 addiu t4, t4, 128 3613 sb t7, 1(s5) 3614 sra t8, t8, 24 3615 addiu t8, 
t8, 128 3616 sb t4, 4(s5) 3617 addiu v0, v0, 24 3618 sra t5, t5, 24 3619 addiu t5, t5, 128 3620 sb t8, 2(s5) 3621 addiu a2, a2, 4 3622 bne v0, v1, 2b 3623 sb t5, 3(s5) 3624 3625 addiu sp, sp, 144 3626 3627 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3628 3629 j ra 3630 nop 3631 3632END(jsimd_idct_6x6_dspr2) 3633 3634 3635/*****************************************************************************/ 3636LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) 3637/* 3638 * a0 = compptr->dct_table 3639 * a1 = coef_block 3640 * a2 = workspace 3641 */ 3642 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3643 3644 li a3, 8 3645 36461: 3647 // odd part 3648 lh t0, 48(a1) 3649 lh t1, 48(a0) 3650 lh t2, 16(a1) 3651 lh t3, 16(a0) 3652 lh t4, 80(a1) 3653 lh t5, 80(a0) 3654 lh t6, 112(a1) 3655 lh t7, 112(a0) 3656 mul t0, t0, t1 // z2 3657 mul t1, t2, t3 // z1 3658 mul t2, t4, t5 // z3 3659 mul t3, t6, t7 // z4 3660 li t4, 10703 // FIX(1.306562965) 3661 li t5, 4433 // FIX_0_541196100 3662 li t6, 7053 // FIX(0.860918669) 3663 mul t4, t0, t4 // tmp11 3664 mul t5, t0, t5 // -tmp14 3665 addu t7, t1, t2 // tmp10 3666 addu t8, t7, t3 // tmp10 + z4 3667 mul t6, t6, t8 // tmp15 3668 li t8, 2139 // FIX(0.261052384) 3669 mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) 3670 li t7, 2295 // FIX(0.280143716) 3671 mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) 3672 addu t9, t2, t3 // z3 + z4 3673 li s0, 8565 // FIX(1.045510580) 3674 mul t9, t9, s0 // -tmp13 3675 li s0, 12112 // FIX(1.478575242) 3676 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242) 3677 li s1, 12998 // FIX(1.586706681) 3678 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) 3679 li s2, 5540 // FIX(0.676326758) 3680 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) 3681 li s3, 16244 // FIX(1.982889723) 3682 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) 3683 subu t1, t1, t3 // z1-=z4 3684 subu t0, t0, t2 // z2-=z3 3685 addu t2, t0, t1 // z1+z2 3686 li t3, 4433 // FIX_0_541196100 3687 mul t2, t2, t3 // z3 3688 li t3, 6270 // 
FIX_0_765366865 3689 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) 3690 li t3, 15137 // FIX_0_765366865 3691 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) 3692 addu t8, t6, t8 // tmp12 3693 addu t3, t8, t4 // tmp12 + tmp11 3694 addu t3, t3, t7 // tmp10 3695 subu t8, t8, t9 // tmp12 + tmp13 3696 addu s0, t5, s0 3697 subu t8, t8, s0 // tmp12 3698 subu t9, t6, t9 3699 subu s1, s1, t4 3700 addu t9, t9, s1 // tmp13 3701 subu t6, t6, t5 3702 subu t6, t6, s2 3703 subu t6, t6, s3 // tmp15 3704 // even part start 3705 lh t4, 64(a1) 3706 lh t5, 64(a0) 3707 lh t7, 32(a1) 3708 lh s0, 32(a0) 3709 lh s1, 0(a1) 3710 lh s2, 0(a0) 3711 lh s3, 96(a1) 3712 lh v0, 96(a0) 3713 mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) 3714 mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) 3715 mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) 3716 mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) 3717 // odd part end 3718 addu t1, t2, t1 // tmp11 3719 subu t0, t2, t0 // tmp14 3720 // update counter and pointers 3721 addiu a3, a3, -1 3722 addiu a0, a0, 2 3723 addiu a1, a1, 2 3724 // even part rest 3725 li s1, 10033 3726 li s2, 11190 3727 mul t4, t4, s1 // z4 3728 mul s1, t5, s2 // z4 3729 sll t5, t5, 13 // z1 3730 sll t7, t7, 13 3731 addiu t7, t7, 1024 // z3 3732 sll s0, s0, 13 // z2 3733 addu s2, t7, t4 // tmp10 3734 subu t4, t7, t4 // tmp11 3735 subu s3, t5, s0 // tmp12 3736 addu t2, t7, s3 // tmp21 3737 subu s3, t7, s3 // tmp24 3738 addu t7, s1, s0 // tmp12 3739 addu v0, s2, t7 // tmp20 3740 subu s2, s2, t7 // tmp25 3741 subu s1, s1, t5 // z4 - z1 3742 subu s1, s1, s0 // tmp12 3743 addu s0, t4, s1 // tmp22 3744 subu t4, t4, s1 // tmp23 3745 // final output stage 3746 addu t5, v0, t3 3747 subu v0, v0, t3 3748 addu t3, t2, t1 3749 subu t2, t2, t1 3750 addu t1, s0, t8 3751 subu s0, s0, t8 3752 addu t8, t4, t9 3753 subu t4, t4, t9 3754 addu t9, s3, t0 3755 subu s3, s3, t0 3756 addu t0, s2, t6 3757 subu s2, s2, t6 3758 sra 
t5, t5, 11 3759 sra t3, t3, 11 3760 sra t1, t1, 11 3761 sra t8, t8, 11 3762 sra t9, t9, 11 3763 sra t0, t0, 11 3764 sra s2, s2, 11 3765 sra s3, s3, 11 3766 sra t4, t4, 11 3767 sra s0, s0, 11 3768 sra t2, t2, 11 3769 sra v0, v0, 11 3770 sw t5, 0(a2) 3771 sw t3, 32(a2) 3772 sw t1, 64(a2) 3773 sw t8, 96(a2) 3774 sw t9, 128(a2) 3775 sw t0, 160(a2) 3776 sw s2, 192(a2) 3777 sw s3, 224(a2) 3778 sw t4, 256(a2) 3779 sw s0, 288(a2) 3780 sw t2, 320(a2) 3781 sw v0, 352(a2) 3782 bgtz a3, 1b 3783 addiu a2, a2, 4 3784 3785 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3786 3787 j ra 3788 nop 3789 3790END(jsimd_idct_12x12_pass1_dspr2) 3791 3792 3793/*****************************************************************************/ 3794LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) 3795/* 3796 * a0 = workspace 3797 * a1 = output 3798 */ 3799 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3800 3801 li a3, 12 3802 38031: 3804 // Odd part 3805 lw t0, 12(a0) 3806 lw t1, 4(a0) 3807 lw t2, 20(a0) 3808 lw t3, 28(a0) 3809 li t4, 10703 // FIX(1.306562965) 3810 li t5, 4433 // FIX_0_541196100 3811 mul t4, t0, t4 // tmp11 3812 mul t5, t0, t5 // -tmp14 3813 addu t6, t1, t2 // tmp10 3814 li t7, 2139 // FIX(0.261052384) 3815 mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) 3816 addu t6, t6, t3 // tmp10 + z4 3817 li t8, 7053 // FIX(0.860918669) 3818 mul t6, t6, t8 // tmp15 3819 li t8, 2295 // FIX(0.280143716) 3820 mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) 3821 addu t9, t2, t3 // z3 + z4 3822 li s0, 8565 // FIX(1.045510580) 3823 mul t9, t9, s0 // -tmp13 3824 li s0, 12112 // FIX(1.478575242) 3825 mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) 3826 li s1, 12998 // FIX(1.586706681) 3827 mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) 3828 li s2, 5540 // FIX(0.676326758) 3829 mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) 3830 li s3, 16244 // FIX(1.982889723) 3831 mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) 3832 subu t1, t1, t3 // z1 -= z4 3833 subu t0, t0, t2 // z2 -= z3 3834 addu t2, t1, t0 // z1 + z2 
3835 li t3, 4433 // FIX_0_541196100 3836 mul t2, t2, t3 // z3 3837 li t3, 6270 // FIX_0_765366865 3838 mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) 3839 li t3, 15137 // FIX_1_847759065 3840 mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) 3841 addu t3, t6, t7 // tmp12 3842 addu t7, t3, t4 3843 addu t7, t7, t8 // tmp10 3844 subu t3, t3, t9 3845 subu t3, t3, t5 3846 subu t3, t3, s0 // tmp12 3847 subu t9, t6, t9 3848 subu t9, t9, t4 3849 addu t9, t9, s1 // tmp13 3850 subu t6, t6, t5 3851 subu t6, t6, s2 3852 subu t6, t6, s3 // tmp15 3853 addu t1, t2, t1 // tmp11 3854 subu t0, t2, t0 // tmp14 3855 // even part 3856 lw t2, 16(a0) // z4 3857 lw t4, 8(a0) // z1 3858 lw t5, 0(a0) // z3 3859 lw t8, 24(a0) // z2 3860 li s0, 10033 // FIX(1.224744871) 3861 li s1, 11190 // FIX(1.366025404) 3862 mul t2, t2, s0 // z4 3863 mul s0, t4, s1 // z4 3864 addiu t5, t5, 0x10 3865 sll t5, t5, 13 // z3 3866 sll t4, t4, 13 // z1 3867 sll t8, t8, 13 // z2 3868 subu s1, t4, t8 // tmp12 3869 addu s2, t5, t2 // tmp10 3870 subu t2, t5, t2 // tmp11 3871 addu s3, t5, s1 // tmp21 3872 subu s1, t5, s1 // tmp24 3873 addu t5, s0, t8 // tmp12 3874 addu v0, s2, t5 // tmp20 3875 subu t5, s2, t5 // tmp25 3876 subu t4, s0, t4 3877 subu t4, t4, t8 // tmp12 3878 addu t8, t2, t4 // tmp22 3879 subu t2, t2, t4 // tmp23 3880 // increment counter and pointers 3881 addiu a3, a3, -1 3882 addiu a0, a0, 32 3883 // Final stage 3884 addu t4, v0, t7 3885 subu v0, v0, t7 3886 addu t7, s3, t1 3887 subu s3, s3, t1 3888 addu t1, t8, t3 3889 subu t8, t8, t3 3890 addu t3, t2, t9 3891 subu t2, t2, t9 3892 addu t9, s1, t0 3893 subu s1, s1, t0 3894 addu t0, t5, t6 3895 subu t5, t5, t6 3896 sll t4, t4, 4 3897 sll t7, t7, 4 3898 sll t1, t1, 4 3899 sll t3, t3, 4 3900 sll t9, t9, 4 3901 sll t0, t0, 4 3902 sll t5, t5, 4 3903 sll s1, s1, 4 3904 sll t2, t2, 4 3905 sll t8, t8, 4 3906 sll s3, s3, 4 3907 sll v0, v0, 4 3908 shll_s.w t4, t4, 2 3909 shll_s.w t7, t7, 2 3910 shll_s.w t1, t1, 2 3911 shll_s.w t3, t3, 2 3912 shll_s.w t9, t9, 2 
3913 shll_s.w t0, t0, 2 3914 shll_s.w t5, t5, 2 3915 shll_s.w s1, s1, 2 3916 shll_s.w t2, t2, 2 3917 shll_s.w t8, t8, 2 3918 shll_s.w s3, s3, 2 3919 shll_s.w v0, v0, 2 3920 srl t4, t4, 24 3921 srl t7, t7, 24 3922 srl t1, t1, 24 3923 srl t3, t3, 24 3924 srl t9, t9, 24 3925 srl t0, t0, 24 3926 srl t5, t5, 24 3927 srl s1, s1, 24 3928 srl t2, t2, 24 3929 srl t8, t8, 24 3930 srl s3, s3, 24 3931 srl v0, v0, 24 3932 lw t6, 0(a1) 3933 addiu t4, t4, 0x80 3934 addiu t7, t7, 0x80 3935 addiu t1, t1, 0x80 3936 addiu t3, t3, 0x80 3937 addiu t9, t9, 0x80 3938 addiu t0, t0, 0x80 3939 addiu t5, t5, 0x80 3940 addiu s1, s1, 0x80 3941 addiu t2, t2, 0x80 3942 addiu t8, t8, 0x80 3943 addiu s3, s3, 0x80 3944 addiu v0, v0, 0x80 3945 sb t4, 0(t6) 3946 sb t7, 1(t6) 3947 sb t1, 2(t6) 3948 sb t3, 3(t6) 3949 sb t9, 4(t6) 3950 sb t0, 5(t6) 3951 sb t5, 6(t6) 3952 sb s1, 7(t6) 3953 sb t2, 8(t6) 3954 sb t8, 9(t6) 3955 sb s3, 10(t6) 3956 sb v0, 11(t6) 3957 bgtz a3, 1b 3958 addiu a1, a1, 4 3959 3960 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3961 3962 jr ra 3963 nop 3964 3965END(jsimd_idct_12x12_pass2_dspr2) 3966 3967 3968/*****************************************************************************/ 3969LEAF_DSPR2(jsimd_convsamp_dspr2) 3970/* 3971 * a0 = sample_data 3972 * a1 = start_col 3973 * a2 = workspace 3974 */ 3975 lw t0, 0(a0) 3976 li t7, 0xff80ff80 3977 addu t0, t0, a1 3978 ulw t1, 0(t0) 3979 ulw t2, 4(t0) 3980 preceu.ph.qbr t3, t1 3981 preceu.ph.qbl t4, t1 3982 lw t0, 4(a0) 3983 preceu.ph.qbr t5, t2 3984 preceu.ph.qbl t6, t2 3985 addu t0, t0, a1 3986 addu.ph t3, t3, t7 3987 addu.ph t4, t4, t7 3988 ulw t1, 0(t0) 3989 ulw t2, 4(t0) 3990 addu.ph t5, t5, t7 3991 addu.ph t6, t6, t7 3992 usw t3, 0(a2) 3993 usw t4, 4(a2) 3994 preceu.ph.qbr t3, t1 3995 preceu.ph.qbl t4, t1 3996 usw t5, 8(a2) 3997 usw t6, 12(a2) 3998 3999 lw t0, 8(a0) 4000 preceu.ph.qbr t5, t2 4001 preceu.ph.qbl t6, t2 4002 addu t0, t0, a1 4003 addu.ph t3, t3, t7 4004 addu.ph t4, t4, t7 4005 ulw t1, 0(t0) 4006 ulw t2, 4(t0) 4007 
addu.ph t5, t5, t7 4008 addu.ph t6, t6, t7 4009 usw t3, 16(a2) 4010 usw t4, 20(a2) 4011 preceu.ph.qbr t3, t1 4012 preceu.ph.qbl t4, t1 4013 usw t5, 24(a2) 4014 usw t6, 28(a2) 4015 4016 lw t0, 12(a0) 4017 preceu.ph.qbr t5, t2 4018 preceu.ph.qbl t6, t2 4019 addu t0, t0, a1 4020 addu.ph t3, t3, t7 4021 addu.ph t4, t4, t7 4022 ulw t1, 0(t0) 4023 ulw t2, 4(t0) 4024 addu.ph t5, t5, t7 4025 addu.ph t6, t6, t7 4026 usw t3, 32(a2) 4027 usw t4, 36(a2) 4028 preceu.ph.qbr t3, t1 4029 preceu.ph.qbl t4, t1 4030 usw t5, 40(a2) 4031 usw t6, 44(a2) 4032 4033 lw t0, 16(a0) 4034 preceu.ph.qbr t5, t2 4035 preceu.ph.qbl t6, t2 4036 addu t0, t0, a1 4037 addu.ph t3, t3, t7 4038 addu.ph t4, t4, t7 4039 ulw t1, 0(t0) 4040 ulw t2, 4(t0) 4041 addu.ph t5, t5, t7 4042 addu.ph t6, t6, t7 4043 usw t3, 48(a2) 4044 usw t4, 52(a2) 4045 preceu.ph.qbr t3, t1 4046 preceu.ph.qbl t4, t1 4047 usw t5, 56(a2) 4048 usw t6, 60(a2) 4049 4050 lw t0, 20(a0) 4051 preceu.ph.qbr t5, t2 4052 preceu.ph.qbl t6, t2 4053 addu t0, t0, a1 4054 addu.ph t3, t3, t7 4055 addu.ph t4, t4, t7 4056 ulw t1, 0(t0) 4057 ulw t2, 4(t0) 4058 addu.ph t5, t5, t7 4059 addu.ph t6, t6, t7 4060 usw t3, 64(a2) 4061 usw t4, 68(a2) 4062 preceu.ph.qbr t3, t1 4063 preceu.ph.qbl t4, t1 4064 usw t5, 72(a2) 4065 usw t6, 76(a2) 4066 4067 lw t0, 24(a0) 4068 preceu.ph.qbr t5, t2 4069 preceu.ph.qbl t6, t2 4070 addu t0, t0, a1 4071 addu.ph t3, t3, t7 4072 addu.ph t4, t4, t7 4073 ulw t1, 0(t0) 4074 ulw t2, 4(t0) 4075 addu.ph t5, t5, t7 4076 addu.ph t6, t6, t7 4077 usw t3, 80(a2) 4078 usw t4, 84(a2) 4079 preceu.ph.qbr t3, t1 4080 preceu.ph.qbl t4, t1 4081 usw t5, 88(a2) 4082 usw t6, 92(a2) 4083 4084 lw t0, 28(a0) 4085 preceu.ph.qbr t5, t2 4086 preceu.ph.qbl t6, t2 4087 addu t0, t0, a1 4088 addu.ph t3, t3, t7 4089 addu.ph t4, t4, t7 4090 ulw t1, 0(t0) 4091 ulw t2, 4(t0) 4092 addu.ph t5, t5, t7 4093 addu.ph t6, t6, t7 4094 usw t3, 96(a2) 4095 usw t4, 100(a2) 4096 preceu.ph.qbr t3, t1 4097 preceu.ph.qbl t4, t1 4098 usw t5, 104(a2) 4099 usw t6, 108(a2) 4100 
preceu.ph.qbr t5, t2 4101 preceu.ph.qbl t6, t2 4102 addu.ph t3, t3, t7 4103 addu.ph t4, t4, t7 4104 addu.ph t5, t5, t7 4105 addu.ph t6, t6, t7 4106 usw t3, 112(a2) 4107 usw t4, 116(a2) 4108 usw t5, 120(a2) 4109 usw t6, 124(a2) 4110 4111 j ra 4112 nop 4113 4114END(jsimd_convsamp_dspr2) 4115 4116 4117#ifndef __mips_soft_float 4118 4119/*****************************************************************************/ 4120LEAF_DSPR2(jsimd_convsamp_float_dspr2) 4121/* 4122 * a0 = sample_data 4123 * a1 = start_col 4124 * a2 = workspace 4125 */ 4126 .set at 4127 4128 lw t0, 0(a0) 4129 addu t0, t0, a1 4130 lbu t1, 0(t0) 4131 lbu t2, 1(t0) 4132 lbu t3, 2(t0) 4133 lbu t4, 3(t0) 4134 lbu t5, 4(t0) 4135 lbu t6, 5(t0) 4136 lbu t7, 6(t0) 4137 lbu t8, 7(t0) 4138 addiu t1, t1, -128 4139 addiu t2, t2, -128 4140 addiu t3, t3, -128 4141 addiu t4, t4, -128 4142 addiu t5, t5, -128 4143 addiu t6, t6, -128 4144 addiu t7, t7, -128 4145 addiu t8, t8, -128 4146 mtc1 t1, f2 4147 mtc1 t2, f4 4148 mtc1 t3, f6 4149 mtc1 t4, f8 4150 mtc1 t5, f10 4151 mtc1 t6, f12 4152 mtc1 t7, f14 4153 mtc1 t8, f16 4154 cvt.s.w f2, f2 4155 cvt.s.w f4, f4 4156 cvt.s.w f6, f6 4157 cvt.s.w f8, f8 4158 cvt.s.w f10, f10 4159 cvt.s.w f12, f12 4160 cvt.s.w f14, f14 4161 cvt.s.w f16, f16 4162 lw t0, 4(a0) 4163 swc1 f2, 0(a2) 4164 swc1 f4, 4(a2) 4165 swc1 f6, 8(a2) 4166 addu t0, t0, a1 4167 swc1 f8, 12(a2) 4168 swc1 f10, 16(a2) 4169 swc1 f12, 20(a2) 4170 swc1 f14, 24(a2) 4171 swc1 f16, 28(a2) 4172 // elemr 1 4173 lbu t1, 0(t0) 4174 lbu t2, 1(t0) 4175 lbu t3, 2(t0) 4176 lbu t4, 3(t0) 4177 lbu t5, 4(t0) 4178 lbu t6, 5(t0) 4179 lbu t7, 6(t0) 4180 lbu t8, 7(t0) 4181 addiu t1, t1, -128 4182 addiu t2, t2, -128 4183 addiu t3, t3, -128 4184 addiu t4, t4, -128 4185 addiu t5, t5, -128 4186 addiu t6, t6, -128 4187 addiu t7, t7, -128 4188 addiu t8, t8, -128 4189 mtc1 t1, f2 4190 mtc1 t2, f4 4191 mtc1 t3, f6 4192 mtc1 t4, f8 4193 mtc1 t5, f10 4194 mtc1 t6, f12 4195 mtc1 t7, f14 4196 mtc1 t8, f16 4197 cvt.s.w f2, f2 4198 cvt.s.w f4, f4 
4199 cvt.s.w f6, f6 4200 cvt.s.w f8, f8 4201 cvt.s.w f10, f10 4202 cvt.s.w f12, f12 4203 cvt.s.w f14, f14 4204 cvt.s.w f16, f16 4205 lw t0, 8(a0) 4206 swc1 f2, 32(a2) 4207 swc1 f4, 36(a2) 4208 swc1 f6, 40(a2) 4209 addu t0, t0, a1 4210 swc1 f8, 44(a2) 4211 swc1 f10, 48(a2) 4212 swc1 f12, 52(a2) 4213 swc1 f14, 56(a2) 4214 swc1 f16, 60(a2) 4215 // elemr 2 4216 lbu t1, 0(t0) 4217 lbu t2, 1(t0) 4218 lbu t3, 2(t0) 4219 lbu t4, 3(t0) 4220 lbu t5, 4(t0) 4221 lbu t6, 5(t0) 4222 lbu t7, 6(t0) 4223 lbu t8, 7(t0) 4224 addiu t1, t1, -128 4225 addiu t2, t2, -128 4226 addiu t3, t3, -128 4227 addiu t4, t4, -128 4228 addiu t5, t5, -128 4229 addiu t6, t6, -128 4230 addiu t7, t7, -128 4231 addiu t8, t8, -128 4232 mtc1 t1, f2 4233 mtc1 t2, f4 4234 mtc1 t3, f6 4235 mtc1 t4, f8 4236 mtc1 t5, f10 4237 mtc1 t6, f12 4238 mtc1 t7, f14 4239 mtc1 t8, f16 4240 cvt.s.w f2, f2 4241 cvt.s.w f4, f4 4242 cvt.s.w f6, f6 4243 cvt.s.w f8, f8 4244 cvt.s.w f10, f10 4245 cvt.s.w f12, f12 4246 cvt.s.w f14, f14 4247 cvt.s.w f16, f16 4248 lw t0, 12(a0) 4249 swc1 f2, 64(a2) 4250 swc1 f4, 68(a2) 4251 swc1 f6, 72(a2) 4252 addu t0, t0, a1 4253 swc1 f8, 76(a2) 4254 swc1 f10, 80(a2) 4255 swc1 f12, 84(a2) 4256 swc1 f14, 88(a2) 4257 swc1 f16, 92(a2) 4258 // elemr 3 4259 lbu t1, 0(t0) 4260 lbu t2, 1(t0) 4261 lbu t3, 2(t0) 4262 lbu t4, 3(t0) 4263 lbu t5, 4(t0) 4264 lbu t6, 5(t0) 4265 lbu t7, 6(t0) 4266 lbu t8, 7(t0) 4267 addiu t1, t1, -128 4268 addiu t2, t2, -128 4269 addiu t3, t3, -128 4270 addiu t4, t4, -128 4271 addiu t5, t5, -128 4272 addiu t6, t6, -128 4273 addiu t7, t7, -128 4274 addiu t8, t8, -128 4275 mtc1 t1, f2 4276 mtc1 t2, f4 4277 mtc1 t3, f6 4278 mtc1 t4, f8 4279 mtc1 t5, f10 4280 mtc1 t6, f12 4281 mtc1 t7, f14 4282 mtc1 t8, f16 4283 cvt.s.w f2, f2 4284 cvt.s.w f4, f4 4285 cvt.s.w f6, f6 4286 cvt.s.w f8, f8 4287 cvt.s.w f10, f10 4288 cvt.s.w f12, f12 4289 cvt.s.w f14, f14 4290 cvt.s.w f16, f16 4291 lw t0, 16(a0) 4292 swc1 f2, 96(a2) 4293 swc1 f4, 100(a2) 4294 swc1 f6, 104(a2) 4295 addu t0, t0, a1 4296 
swc1 f8, 108(a2) 4297 swc1 f10, 112(a2) 4298 swc1 f12, 116(a2) 4299 swc1 f14, 120(a2) 4300 swc1 f16, 124(a2) 4301 // elemr 4 4302 lbu t1, 0(t0) 4303 lbu t2, 1(t0) 4304 lbu t3, 2(t0) 4305 lbu t4, 3(t0) 4306 lbu t5, 4(t0) 4307 lbu t6, 5(t0) 4308 lbu t7, 6(t0) 4309 lbu t8, 7(t0) 4310 addiu t1, t1, -128 4311 addiu t2, t2, -128 4312 addiu t3, t3, -128 4313 addiu t4, t4, -128 4314 addiu t5, t5, -128 4315 addiu t6, t6, -128 4316 addiu t7, t7, -128 4317 addiu t8, t8, -128 4318 mtc1 t1, f2 4319 mtc1 t2, f4 4320 mtc1 t3, f6 4321 mtc1 t4, f8 4322 mtc1 t5, f10 4323 mtc1 t6, f12 4324 mtc1 t7, f14 4325 mtc1 t8, f16 4326 cvt.s.w f2, f2 4327 cvt.s.w f4, f4 4328 cvt.s.w f6, f6 4329 cvt.s.w f8, f8 4330 cvt.s.w f10, f10 4331 cvt.s.w f12, f12 4332 cvt.s.w f14, f14 4333 cvt.s.w f16, f16 4334 lw t0, 20(a0) 4335 swc1 f2, 128(a2) 4336 swc1 f4, 132(a2) 4337 swc1 f6, 136(a2) 4338 addu t0, t0, a1 4339 swc1 f8, 140(a2) 4340 swc1 f10, 144(a2) 4341 swc1 f12, 148(a2) 4342 swc1 f14, 152(a2) 4343 swc1 f16, 156(a2) 4344 // elemr 5 4345 lbu t1, 0(t0) 4346 lbu t2, 1(t0) 4347 lbu t3, 2(t0) 4348 lbu t4, 3(t0) 4349 lbu t5, 4(t0) 4350 lbu t6, 5(t0) 4351 lbu t7, 6(t0) 4352 lbu t8, 7(t0) 4353 addiu t1, t1, -128 4354 addiu t2, t2, -128 4355 addiu t3, t3, -128 4356 addiu t4, t4, -128 4357 addiu t5, t5, -128 4358 addiu t6, t6, -128 4359 addiu t7, t7, -128 4360 addiu t8, t8, -128 4361 mtc1 t1, f2 4362 mtc1 t2, f4 4363 mtc1 t3, f6 4364 mtc1 t4, f8 4365 mtc1 t5, f10 4366 mtc1 t6, f12 4367 mtc1 t7, f14 4368 mtc1 t8, f16 4369 cvt.s.w f2, f2 4370 cvt.s.w f4, f4 4371 cvt.s.w f6, f6 4372 cvt.s.w f8, f8 4373 cvt.s.w f10, f10 4374 cvt.s.w f12, f12 4375 cvt.s.w f14, f14 4376 cvt.s.w f16, f16 4377 lw t0, 24(a0) 4378 swc1 f2, 160(a2) 4379 swc1 f4, 164(a2) 4380 swc1 f6, 168(a2) 4381 addu t0, t0, a1 4382 swc1 f8, 172(a2) 4383 swc1 f10, 176(a2) 4384 swc1 f12, 180(a2) 4385 swc1 f14, 184(a2) 4386 swc1 f16, 188(a2) 4387 // elemr 6 4388 lbu t1, 0(t0) 4389 lbu t2, 1(t0) 4390 lbu t3, 2(t0) 4391 lbu t4, 3(t0) 4392 lbu t5, 4(t0) 
4393 lbu t6, 5(t0) 4394 lbu t7, 6(t0) 4395 lbu t8, 7(t0) 4396 addiu t1, t1, -128 4397 addiu t2, t2, -128 4398 addiu t3, t3, -128 4399 addiu t4, t4, -128 4400 addiu t5, t5, -128 4401 addiu t6, t6, -128 4402 addiu t7, t7, -128 4403 addiu t8, t8, -128 4404 mtc1 t1, f2 4405 mtc1 t2, f4 4406 mtc1 t3, f6 4407 mtc1 t4, f8 4408 mtc1 t5, f10 4409 mtc1 t6, f12 4410 mtc1 t7, f14 4411 mtc1 t8, f16 4412 cvt.s.w f2, f2 4413 cvt.s.w f4, f4 4414 cvt.s.w f6, f6 4415 cvt.s.w f8, f8 4416 cvt.s.w f10, f10 4417 cvt.s.w f12, f12 4418 cvt.s.w f14, f14 4419 cvt.s.w f16, f16 4420 lw t0, 28(a0) 4421 swc1 f2, 192(a2) 4422 swc1 f4, 196(a2) 4423 swc1 f6, 200(a2) 4424 addu t0, t0, a1 4425 swc1 f8, 204(a2) 4426 swc1 f10, 208(a2) 4427 swc1 f12, 212(a2) 4428 swc1 f14, 216(a2) 4429 swc1 f16, 220(a2) 4430 // elemr 7 4431 lbu t1, 0(t0) 4432 lbu t2, 1(t0) 4433 lbu t3, 2(t0) 4434 lbu t4, 3(t0) 4435 lbu t5, 4(t0) 4436 lbu t6, 5(t0) 4437 lbu t7, 6(t0) 4438 lbu t8, 7(t0) 4439 addiu t1, t1, -128 4440 addiu t2, t2, -128 4441 addiu t3, t3, -128 4442 addiu t4, t4, -128 4443 addiu t5, t5, -128 4444 addiu t6, t6, -128 4445 addiu t7, t7, -128 4446 addiu t8, t8, -128 4447 mtc1 t1, f2 4448 mtc1 t2, f4 4449 mtc1 t3, f6 4450 mtc1 t4, f8 4451 mtc1 t5, f10 4452 mtc1 t6, f12 4453 mtc1 t7, f14 4454 mtc1 t8, f16 4455 cvt.s.w f2, f2 4456 cvt.s.w f4, f4 4457 cvt.s.w f6, f6 4458 cvt.s.w f8, f8 4459 cvt.s.w f10, f10 4460 cvt.s.w f12, f12 4461 cvt.s.w f14, f14 4462 cvt.s.w f16, f16 4463 swc1 f2, 224(a2) 4464 swc1 f4, 228(a2) 4465 swc1 f6, 232(a2) 4466 swc1 f8, 236(a2) 4467 swc1 f10, 240(a2) 4468 swc1 f12, 244(a2) 4469 swc1 f14, 248(a2) 4470 swc1 f16, 252(a2) 4471 4472 j ra 4473 nop 4474 4475END(jsimd_convsamp_float_dspr2) 4476 4477#endif 4478 4479/*****************************************************************************/ 4480