1/* 2 * MIPS DSPr2 optimizations for libjpeg-turbo 3 * 4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California. 5 * All Rights Reserved. 6 * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com> 7 * Darko Laus <darko.laus@imgtec.com> 8 * Copyright (C) 2015, D. R. Commander. All Rights Reserved. 9 * 10 * This software is provided 'as-is', without any express or implied 11 * warranty. In no event will the authors be held liable for any damages 12 * arising from the use of this software. 13 * 14 * Permission is granted to anyone to use this software for any purpose, 15 * including commercial applications, and to alter it and redistribute it 16 * freely, subject to the following restrictions: 17 * 18 * 1. The origin of this software must not be misrepresented; you must not 19 * claim that you wrote the original software. If you use this software 20 * in a product, an acknowledgment in the product documentation would be 21 * appreciated but is not required. 22 * 2. Altered source versions must be plainly marked as such, and must not be 23 * misrepresented as being the original software. 24 * 3. This notice may not be removed or altered from any source distribution. 25 */ 26 27#include "jsimd_dspr2_asm.h" 28 29 30/*****************************************************************************/ 31LEAF_DSPR2(jsimd_c_null_convert_dspr2) 32/* 33 * a0 = cinfo->image_width 34 * a1 = input_buf 35 * a2 = output_buf 36 * a3 = output_row 37 * 16(sp) = num_rows 38 * 20(sp) = cinfo->num_components 39 * 40 * Null conversion for compression 41 */ 42 SAVE_REGS_ON_STACK 8, s0, s1 43 44 lw t9, 24(sp) /* t9 = num_rows */ 45 lw s0, 28(sp) /* s0 = cinfo->num_components */ 46 andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */ 47 beqz t0, 4f /* no residual */ 48 nop 490: 50 addiu t9, t9, -1 51 bltz t9, 7f 52 li t1, 0 531: 54 sll t3, t1, 2 55 lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ 56 lw t2, 0(a1) /* t2 = inptr = *input_buf */ 57 sll t4, a3, 2 58 lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ 59 addu t2, t2, t1 60 addu s1, t5, a0 61 addu t6, t5, t0 622: 63 lbu t3, 0(t2) 64 addiu t5, t5, 1 65 sb t3, -1(t5) 66 bne t6, t5, 2b 67 addu t2, t2, s0 683: 69 lbu t3, 0(t2) 70 addu t4, t2, s0 71 addu t7, t4, s0 72 addu t8, t7, s0 73 addu t2, t8, s0 74 lbu t4, 0(t4) 75 lbu t7, 0(t7) 76 lbu t8, 0(t8) 77 addiu t5, t5, 4 78 sb t3, -4(t5) 79 sb t4, -3(t5) 80 sb t7, -2(t5) 81 bne s1, t5, 3b 82 sb t8, -1(t5) 83 addiu t1, t1, 1 84 bne t1, s0, 1b 85 nop 86 addiu a1, a1, 4 87 bgez t9, 0b 88 addiu a3, a3, 1 89 b 7f 90 nop 914: 92 addiu t9, t9, -1 93 bltz t9, 7f 94 li t1, 0 955: 96 sll t3, t1, 2 97 lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */ 98 lw t2, 0(a1) /* t2 = inptr = *input_buf */ 99 sll t4, a3, 2 100 lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */ 101 addu t2, t2, t1 102 addu s1, t5, a0 103 addu t6, t5, t0 1046: 105 lbu t3, 0(t2) 106 addu t4, t2, s0 107 addu t7, t4, s0 108 addu t8, t7, s0 109 addu t2, t8, s0 110 lbu t4, 0(t4) 111 lbu t7, 0(t7) 112 lbu t8, 0(t8) 113 addiu t5, t5, 4 114 sb t3, -4(t5) 115 sb t4, -3(t5) 116 sb t7, -2(t5) 117 bne s1, t5, 6b 118 sb t8, -1(t5) 119 addiu t1, t1, 1 120 bne t1, s0, 5b 121 nop 122 addiu a1, a1, 4 123 bgez t9, 4b 124 addiu a3, a3, 1 1257: 126 RESTORE_REGS_FROM_STACK 8, s0, s1 127 128 j ra 129 nop 130 131END(jsimd_c_null_convert_dspr2) 132 133 134/*****************************************************************************/ 135/* 136 * jsimd_extrgb_ycc_convert_dspr2 137 * jsimd_extbgr_ycc_convert_dspr2 138 * jsimd_extrgbx_ycc_convert_dspr2 139 * jsimd_extbgrx_ycc_convert_dspr2 140 * jsimd_extxbgr_ycc_convert_dspr2 141 * jsimd_extxrgb_ycc_convert_dspr2 142 * 143 * Colorspace conversion RGB -> YCbCr 144 */ 145 146.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \ 147 r_offs, g_offs, b_offs 148 149.macro DO_RGB_TO_YCC r, g, b, inptr 150 lbu \r, \r_offs(\inptr) 151 lbu \g, \g_offs(\inptr) 152 lbu \b, \b_offs(\inptr) 153 addiu \inptr, \pixel_size 154.endm 155 156LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2) 157/* 158 * a0 = cinfo->image_width 159 * a1 = input_buf 160 * a2 = output_buf 161 * a3 = output_row 162 * 16(sp) = num_rows 163 */ 164 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 165 166 lw t7, 48(sp) /* t7 = num_rows */ 167 li s0, 0x4c8b /* FIX(0.29900) */ 168 li s1, 0x9646 /* FIX(0.58700) */ 169 li s2, 0x1d2f /* FIX(0.11400) */ 170 li s3, 0xffffd4cd /* -FIX(0.16874) */ 171 li s4, 0xffffab33 /* -FIX(0.33126) */ 172 li s5, 0x8000 /* FIX(0.50000) */ 173 li s6, 0xffff94d1 /* -FIX(0.41869) */ 174 li s7, 0xffffeb2f /* -FIX(0.08131) */ 175 li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */ 176 1770: 178 addiu t7, -1 /* --num_rows */ 179 lw t6, 0(a1) /* t6 = input_buf[0] */ 180 lw t0, 0(a2) 181 lw t1, 4(a2) 182 lw t2, 8(a2) 183 sll t3, a3, 2 184 lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */ 185 lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */ 186 lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */ 187 188 addu t9, t2, a0 /* t9 = end address */ 189 addiu a3, 1 190 1911: 192 DO_RGB_TO_YCC t3, t4, t5, t6 193 194 mtlo s5, $ac0 195 mtlo t8, $ac1 196 mtlo t8, $ac2 197 maddu $ac0, s2, t5 198 maddu $ac1, s5, t5 199 maddu $ac2, s5, t3 200 maddu $ac0, s0, t3 201 maddu $ac1, s3, t3 202 maddu $ac2, s6, t4 203 maddu $ac0, s1, t4 204 maddu $ac1, s4, t4 205 maddu $ac2, s7, t5 206 extr.w t3, $ac0, 16 207 extr.w t4, $ac1, 16 208 extr.w t5, $ac2, 16 209 sb t3, 0(t0) 210 sb t4, 0(t1) 211 sb t5, 0(t2) 212 addiu t0, 1 213 addiu t2, 1 214 bne t2, t9, 1b 215 addiu t1, 1 216 bgtz t7, 0b 217 addiu a1, 4 218 219 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 220 221 j ra 222 nop 223END(jsimd_\colorid\()_ycc_convert_dspr2) 224 225.purgem DO_RGB_TO_YCC 226 227.endm 228 229/*-------------------------------------id -- pix R G B */ 230GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 231GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 232GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 233GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 234GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 235GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 236 237 238/*****************************************************************************/ 239/* 240 * jsimd_ycc_extrgb_convert_dspr2 241 * jsimd_ycc_extbgr_convert_dspr2 242 * jsimd_ycc_extrgbx_convert_dspr2 243 * jsimd_ycc_extbgrx_convert_dspr2 244 * jsimd_ycc_extxbgr_convert_dspr2 245 * jsimd_ycc_extxrgb_convert_dspr2 246 * 247 * Colorspace conversion YCbCr -> RGB 248 */ 249 250.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \ 251 r_offs, g_offs, b_offs, a_offs 252 253.macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr 254 sb \scratch0, \r_offs(\outptr) 255 sb \scratch1, \g_offs(\outptr) 256 sb \scratch2, \b_offs(\outptr) 257.if (\pixel_size == 4) 258 li t0, 0xFF 259 sb t0, \a_offs(\outptr) 260.endif 261 addiu \outptr, \pixel_size 262.endm 263 264LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2) 265/* 266 * a0 = cinfo->image_width 267 * a1 = input_buf 268 * a2 = input_row 269 * a3 = output_buf 270 * 16(sp) = num_rows 271 */ 272 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 273 274 lw s1, 48(sp) 275 li t3, 0x8000 276 li t4, 0x166e9 /* FIX(1.40200) */ 277 li t5, 0x1c5a2 /* FIX(1.77200) */ 278 li t6, 0xffff492e /* -FIX(0.71414) */ 279 li t7, 0xffffa7e6 /* -FIX(0.34414) */ 280 repl.ph t8, 128 281 2820: 283 lw s0, 0(a3) 284 lw t0, 0(a1) 285 lw t1, 4(a1) 286 lw t2, 8(a1) 287 sll s5, a2, 2 288 addiu s1, -1 289 lwx s2, s5(t0) 290 lwx s3, s5(t1) 291 lwx s4, s5(t2) 292 addu t9, s2, a0 293 addiu a2, 1 294 2951: 296 lbu s7, 0(s4) /* cr */ 297 lbu s6, 0(s3) /* cb */ 298 lbu s5, 0(s2) /* y */ 299 addiu s2, 1 300 addiu s4, 1 301 addiu s7, -128 302 addiu s6, -128 303 mul t2, t7, s6 304 mul t0, t6, s7 /* Crgtab[cr] */ 305 sll s7, 15 306 mulq_rs.w t1, t4, s7 /* Crrtab[cr] */ 307 sll s6, 15 308 addu t2, t3 /* Cbgtab[cb] */ 309 addu t2, t0 310 311 mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */ 312 sra t2, 16 313 addu t1, s5 314 addu t2, s5 /* add y */ 315 ins t2, t1, 16, 16 316 subu.ph t2, t2, t8 317 addu t0, s5 318 shll_s.ph t2, t2, 8 319 subu t0, 128 320 shra.ph t2, t2, 8 321 shll_s.w t0, t0, 24 322 addu.ph t2, t2, t8 /* clip & store */ 323 sra t0, t0, 24 324 sra t1, t2, 16 325 addiu t0, 128 326 327 STORE_YCC_TO_RGB t1, t2, t0, s0 328 329 bne s2, t9, 1b 330 addiu s3, 1 331 bgtz s1, 0b 332 addiu a3, 4 333 334 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 335 336 j ra 337 nop 338END(jsimd_ycc_\colorid\()_convert_dspr2) 339 340.purgem STORE_YCC_TO_RGB 341 342.endm 343 344/*-------------------------------------id -- pix R G B A */ 345GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3 346GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3 347GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3 348GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3 349GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0 350GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0 351 352 353/*****************************************************************************/ 354/* 355 * jsimd_extrgb_gray_convert_dspr2 356 * jsimd_extbgr_gray_convert_dspr2 357 * jsimd_extrgbx_gray_convert_dspr2 358 * jsimd_extbgrx_gray_convert_dspr2 359 * jsimd_extxbgr_gray_convert_dspr2 360 * jsimd_extxrgb_gray_convert_dspr2 361 * 362 * Colorspace conversion RGB -> GRAY 363 */ 364 365.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \ 366 r_offs, g_offs, b_offs 367 368.macro DO_RGB_TO_GRAY r, g, b, inptr 369 lbu \r, \r_offs(\inptr) 370 lbu \g, \g_offs(\inptr) 371 lbu \b, \b_offs(\inptr) 372 addiu \inptr, \pixel_size 373.endm 374 375LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2) 376/* 377 * a0 = cinfo->image_width 378 * a1 = input_buf 379 * a2 = output_buf 380 * a3 = output_row 381 * 16(sp) = num_rows 382 */ 383 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 384 385 li s0, 0x4c8b /* s0 = FIX(0.29900) */ 386 li s1, 0x9646 /* s1 = FIX(0.58700) */ 387 li s2, 0x1d2f /* s2 = FIX(0.11400) */ 388 li s7, 0x8000 /* s7 = FIX(0.50000) */ 389 lw s6, 48(sp) 390 andi t7, a0, 3 391 3920: 393 addiu s6, -1 /* s6 = num_rows */ 394 lw t0, 0(a1) 395 lw t1, 0(a2) 396 sll t3, a3, 2 397 lwx t1, t3(t1) 398 addiu a3, 1 399 addu t9, t1, a0 400 subu t8, t9, t7 401 beq t1, t8, 2f 402 nop 403 4041: 405 DO_RGB_TO_GRAY t3, t4, t5, t0 406 DO_RGB_TO_GRAY s3, s4, s5, t0 407 408 mtlo s7, $ac0 409 maddu $ac0, s2, t5 410 maddu $ac0, s1, t4 411 maddu $ac0, s0, t3 412 mtlo s7, $ac1 413 maddu $ac1, s2, s5 414 maddu $ac1, s1, s4 415 maddu $ac1, s0, s3 416 extr.w t6, $ac0, 16 417 418 DO_RGB_TO_GRAY t3, t4, t5, t0 419 DO_RGB_TO_GRAY s3, s4, s5, t0 420 421 mtlo s7, $ac0 422 maddu $ac0, s2, t5 423 maddu $ac0, s1, t4 424 extr.w t2, $ac1, 16 425 maddu $ac0, s0, t3 426 mtlo s7, $ac1 427 maddu $ac1, s2, s5 428 maddu $ac1, s1, s4 429 maddu $ac1, s0, s3 430 extr.w t5, $ac0, 16 431 sb t6, 0(t1) 432 sb t2, 1(t1) 433 extr.w t3, $ac1, 16 434 addiu t1, 4 435 sb t5, -2(t1) 436 sb t3, -1(t1) 437 bne t1, t8, 1b 438 nop 439 4402: 441 beqz t7, 4f 442 nop 443 4443: 445 DO_RGB_TO_GRAY t3, t4, t5, t0 446 447 mtlo s7, $ac0 448 maddu $ac0, s2, t5 449 maddu $ac0, s1, t4 450 maddu $ac0, s0, t3 451 extr.w t6, $ac0, 16 452 sb t6, 0(t1) 453 addiu t1, 1 454 bne t1, t9, 3b 455 nop 456 4574: 458 bgtz s6, 0b 459 addiu a1, 4 460 461 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 462 463 j ra 464 nop 465END(jsimd_\colorid\()_gray_convert_dspr2) 466 467.purgem DO_RGB_TO_GRAY 468 469.endm 470 471/*-------------------------------------id -- pix R G B */ 472GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2 473GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0 474GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2 475GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0 476GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1 477GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3 478 479 480/*****************************************************************************/ 481/* 482 * jsimd_h2v2_merged_upsample_dspr2 483 * jsimd_h2v2_extrgb_merged_upsample_dspr2 484 * jsimd_h2v2_extrgbx_merged_upsample_dspr2 485 * jsimd_h2v2_extbgr_merged_upsample_dspr2 486 * jsimd_h2v2_extbgrx_merged_upsample_dspr2 487 * jsimd_h2v2_extxbgr_merged_upsample_dspr2 488 * jsimd_h2v2_extxrgb_merged_upsample_dspr2 489 * 490 * Merged h2v2 upsample routines 491 */ 492.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ 493 r1_offs, g1_offs, \ 494 b1_offs, a1_offs, \ 495 r2_offs, g2_offs, \ 496 b2_offs, a2_offs 497 498.macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ 499 scratch5 outptr 500 sb \scratch0, \r1_offs(\outptr) 501 sb \scratch1, \g1_offs(\outptr) 502 sb \scratch2, \b1_offs(\outptr) 503 sb \scratch3, \r2_offs(\outptr) 504 sb \scratch4, \g2_offs(\outptr) 505 sb \scratch5, \b2_offs(\outptr) 506.if (\pixel_size == 8) 507 li \scratch0, 0xFF 508 sb \scratch0, \a1_offs(\outptr) 509 sb \scratch0, \a2_offs(\outptr) 510.endif 511 addiu \outptr, \pixel_size 512.endm 513 514.macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr 515 sb \scratch0, \r1_offs(\outptr) 516 sb \scratch1, \g1_offs(\outptr) 517 sb \scratch2, \b1_offs(\outptr) 518 519.if (\pixel_size == 8) 520 li t0, 0xFF 521 sb t0, \a1_offs(\outptr) 522.endif 523.endm 524 525LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) 526/* 527 * a0 = cinfo->output_width 528 * a1 = input_buf 529 * a2 = in_row_group_ctr 530 * a3 = output_buf 531 * 16(sp) = cinfo->sample_range_limit 532 */ 533 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 534 535 lw t9, 56(sp) /* cinfo->sample_range_limit */ 536 lw v0, 0(a1) 537 lw v1, 4(a1) 538 lw t0, 8(a1) 539 sll t1, a2, 3 540 addiu t2, t1, 4 541 sll t3, a2, 2 542 lw t4, 0(a3) /* t4 = output_buf[0] */ 543 lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */ 544 lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */ 545 lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */ 546 lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */ 547 lw t7, 4(a3) /* t7 = output_buf[1] */ 548 li s1, 0xe6ea 549 addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */ 550 addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */ 551 addiu s1, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ 552 xori s2, s1, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ 553 srl t3, a0, 1 554 blez t3, 2f 555 addu t0, t5, t3 /* t0 = end address */ 556 1: 557 lbu t3, 0(t5) 558 lbu s3, 0(t6) 559 addiu t5, t5, 1 560 addiu t3, t3, -128 /* (cb - 128) */ 561 addiu s3, s3, -128 /* (cr - 128) */ 562 mult $ac1, s1, t3 563 madd $ac1, s2, s3 564 sll s3, s3, 15 565 sll t3, t3, 15 566 mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ 567 extr_r.w s5, $ac1, 16 568 mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ 569 lbu v0, 0(t1) 570 addiu t6, t6, 1 571 addiu t1, t1, 2 572 addu t3, v0, s4 /* y+cred */ 573 addu s3, v0, s5 /* y+cgreen */ 574 addu v1, v0, s6 /* y+cblue */ 575 addu t3, t9, t3 /* y+cred */ 576 addu s3, t9, s3 /* y+cgreen */ 577 addu v1, t9, v1 /* y+cblue */ 578 lbu AT, 0(t3) 579 lbu s7, 0(s3) 580 lbu ra, 0(v1) 581 lbu v0, -1(t1) 582 addu t3, v0, s4 /* y+cred */ 583 addu s3, v0, s5 /* y+cgreen */ 584 addu v1, v0, s6 /* y+cblue */ 585 addu t3, t9, t3 /* y+cred */ 586 addu s3, t9, s3 /* y+cgreen */ 587 addu v1, t9, v1 /* y+cblue */ 588 lbu t3, 0(t3) 589 lbu s3, 0(s3) 590 lbu v1, 0(v1) 591 lbu v0, 0(t2) 592 593 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 594 595 addu t3, v0, s4 /* y+cred */ 596 addu s3, v0, s5 /* y+cgreen */ 597 addu v1, v0, s6 /* y+cblue */ 598 addu t3, t9, t3 /* y+cred */ 599 addu s3, t9, s3 /* y+cgreen */ 600 addu v1, t9, v1 /* y+cblue */ 601 lbu AT, 0(t3) 602 lbu s7, 0(s3) 603 lbu ra, 0(v1) 604 lbu v0, 1(t2) 605 addiu t2, t2, 2 606 addu t3, v0, s4 /* y+cred */ 607 addu s3, v0, s5 /* y+cgreen */ 608 addu v1, v0, s6 /* y+cblue */ 609 addu t3, t9, t3 /* y+cred */ 610 addu s3, t9, s3 /* y+cgreen */ 611 addu v1, t9, v1 /* y+cblue */ 612 lbu t3, 0(t3) 613 lbu s3, 0(s3) 614 lbu v1, 0(v1) 615 616 STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 617 618 bne t0, t5, 1b 619 nop 6202: 621 andi t0, a0, 1 622 beqz t0, 4f 623 lbu t3, 0(t5) 624 lbu s3, 0(t6) 625 addiu t3, t3, -128 /* (cb - 128) */ 626 addiu s3, s3, -128 /* (cr - 128) */ 627 mult $ac1, s1, t3 628 madd $ac1, s2, s3 629 sll s3, s3, 15 630 sll t3, t3, 15 631 lbu v0, 0(t1) 632 extr_r.w s5, $ac1, 16 633 mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */ 634 mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */ 635 addu t3, v0, s4 /* y+cred */ 636 addu s3, v0, s5 /* y+cgreen */ 637 addu v1, v0, s6 /* y+cblue */ 638 addu t3, t9, t3 /* y+cred */ 639 addu s3, t9, s3 /* y+cgreen */ 640 addu v1, t9, v1 /* y+cblue */ 641 lbu t3, 0(t3) 642 lbu s3, 0(s3) 643 lbu v1, 0(v1) 644 lbu v0, 0(t2) 645 646 STORE_H2V2_1_PIXEL t3, s3, v1, t4 647 648 addu t3, v0, s4 /* y+cred */ 649 addu s3, v0, s5 /* y+cgreen */ 650 addu v1, v0, s6 /* y+cblue */ 651 addu t3, t9, t3 /* y+cred */ 652 addu s3, t9, s3 /* y+cgreen */ 653 addu v1, t9, v1 /* y+cblue */ 654 lbu t3, 0(t3) 655 lbu s3, 0(s3) 656 lbu v1, 0(v1) 657 658 STORE_H2V2_1_PIXEL t3, s3, v1, t7 6594: 660 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 661 662 j ra 663 nop 664 665END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2) 666 667.purgem STORE_H2V2_1_PIXEL 668.purgem STORE_H2V2_2_PIXELS 669.endm 670 671/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ 672GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 673GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 674GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 675GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 676GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 677GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 678 679 680/*****************************************************************************/ 681/* 682 * jsimd_h2v1_merged_upsample_dspr2 683 * jsimd_h2v1_extrgb_merged_upsample_dspr2 684 * jsimd_h2v1_extrgbx_merged_upsample_dspr2 685 * jsimd_h2v1_extbgr_merged_upsample_dspr2 686 * jsimd_h2v1_extbgrx_merged_upsample_dspr2 687 * jsimd_h2v1_extxbgr_merged_upsample_dspr2 688 * jsimd_h2v1_extxrgb_merged_upsample_dspr2 689 * 690 * Merged h2v1 upsample routines 691 */ 692 693.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \ 694 r1_offs, g1_offs, \ 695 b1_offs, a1_offs, \ 696 r2_offs, g2_offs, \ 697 b2_offs, a2_offs 698 699.macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \ 700 scratch5 outptr 701 sb \scratch0, \r1_offs(\outptr) 702 sb \scratch1, \g1_offs(\outptr) 703 sb \scratch2, \b1_offs(\outptr) 704 sb \scratch3, \r2_offs(\outptr) 705 sb \scratch4, \g2_offs(\outptr) 706 sb \scratch5, \b2_offs(\outptr) 707.if (\pixel_size == 8) 708 li t0, 0xFF 709 sb t0, \a1_offs(\outptr) 710 sb t0, \a2_offs(\outptr) 711.endif 712 addiu \outptr, \pixel_size 713.endm 714 715.macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr 716 sb \scratch0, \r1_offs(\outptr) 717 sb \scratch1, \g1_offs(\outptr) 718 sb \scratch2, \b1_offs(\outptr) 719.if (\pixel_size == 8) 720 li t0, 0xFF 721 sb t0, \a1_offs(\outptr) 722.endif 723.endm 724 725LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) 726/* 727 * a0 = cinfo->output_width 728 * a1 = input_buf 729 * a2 = in_row_group_ctr 730 * a3 = output_buf 731 * 16(sp) = range_limit 732 */ 733 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 734 735 li t0, 0xe6ea 736 lw t1, 0(a1) /* t1 = input_buf[0] */ 737 lw t2, 4(a1) /* t2 = input_buf[1] */ 738 lw t3, 8(a1) /* t3 = input_buf[2] */ 739 lw t8, 56(sp) /* t8 = range_limit */ 740 addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */ 741 addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */ 742 addiu s0, t0, 0x9916 /* s0 = 0x8000 */ 743 addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */ 744 xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */ 745 srl t0, a0, 1 746 sll t4, a2, 2 747 lwx s5, t4(t1) /* s5 = inptr0 */ 748 lwx s6, t4(t2) /* s6 = inptr1 */ 749 lwx s7, t4(t3) /* s7 = inptr2 */ 750 lw t7, 0(a3) /* t7 = outptr */ 751 blez t0, 2f 752 addu t9, s6, t0 /* t9 = end address */ 7531: 754 lbu t2, 0(s6) /* t2 = cb */ 755 lbu t0, 0(s7) /* t0 = cr */ 756 lbu t1, 0(s5) /* t1 = y */ 757 addiu t2, t2, -128 /* t2 = cb - 128 */ 758 addiu t0, t0, -128 /* t0 = cr - 128 */ 759 mult $ac1, s4, t2 760 madd $ac1, s3, t0 761 sll t0, t0, 15 762 sll t2, t2, 15 763 mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */ 764 extr_r.w t5, $ac1, 16 765 mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */ 766 addiu s7, s7, 1 767 addiu s6, s6, 1 768 addu t2, t1, t0 /* t2 = y + cred */ 769 addu t3, t1, t5 /* t3 = y + cgreen */ 770 addu t4, t1, t6 /* t4 = y + cblue */ 771 addu t2, t8, t2 772 addu t3, t8, t3 773 addu t4, t8, t4 774 lbu t1, 1(s5) 775 lbu v0, 0(t2) 776 lbu v1, 0(t3) 777 lbu ra, 0(t4) 778 addu t2, t1, t0 779 addu t3, t1, t5 780 addu t4, t1, t6 781 addu t2, t8, t2 782 addu t3, t8, t3 783 addu t4, t8, t4 784 lbu t2, 0(t2) 785 lbu t3, 0(t3) 786 lbu t4, 0(t4) 787 788 STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7 789 790 bne t9, s6, 1b 791 addiu s5, s5, 2 7922: 793 andi t0, a0, 1 794 beqz t0, 4f 795 nop 7963: 797 lbu t2, 0(s6) 798 lbu t0, 0(s7) 799 lbu t1, 0(s5) 800 addiu t2, t2, -128 /* (cb - 128) */ 801 addiu t0, t0, -128 /* (cr - 128) */ 802 mul t3, s4, t2 803 mul t4, s3, t0 804 sll t0, t0, 15 805 sll t2, t2, 15 806 mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */ 807 mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */ 808 addu t3, t3, s0 809 addu t3, t4, t3 810 sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */ 811 addu t2, t1, t0 /* y + cred */ 812 addu t3, t1, t5 /* y + cgreen */ 813 addu t4, t1, t6 /* y + cblue */ 814 addu t2, t8, t2 815 addu t3, t8, t3 816 addu t4, t8, t4 817 lbu t2, 0(t2) 818 lbu t3, 0(t3) 819 lbu t4, 0(t4) 820 821 STORE_H2V1_1_PIXEL t2, t3, t4, t7 8224: 823 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra 824 825 j ra 826 nop 827 828END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2) 829 830.purgem STORE_H2V1_1_PIXEL 831.purgem STORE_H2V1_2_PIXELS 832.endm 833 834/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */ 835GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6 836GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6 837GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7 838GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7 839GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4 840GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4 841 842 843/*****************************************************************************/ 844/* 845 * jsimd_h2v2_fancy_upsample_dspr2 846 * 847 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. 848 */ 849LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2) 850/* 851 * a0 = cinfo->max_v_samp_factor 852 * a1 = downsampled_width 853 * a2 = input_data 854 * a3 = output_data_ptr 855 */ 856 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 857 858 li s4, 0 859 lw s2, 0(a3) /* s2 = *output_data_ptr */ 8600: 861 li t9, 2 862 lw s1, -4(a2) /* s1 = inptr1 */ 863 8641: 865 lw s0, 0(a2) /* s0 = inptr0 */ 866 lwx s3, s4(s2) 867 addiu s5, a1, -2 /* s5 = downsampled_width - 2 */ 868 srl t4, s5, 1 869 sll t4, t4, 1 870 lbu t0, 0(s0) 871 lbu t1, 1(s0) 872 lbu t2, 0(s1) 873 lbu t3, 1(s1) 874 addiu s0, 2 875 addiu s1, 2 876 addu t8, s0, t4 /* t8 = end address */ 877 andi s5, s5, 1 /* s5 = residual */ 878 sll t4, t0, 1 879 sll t6, t1, 1 880 addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */ 881 addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */ 882 addu t7, t0, t2 /* t7 = thiscolsum */ 883 addu t6, t1, t3 /* t5 = nextcolsum */ 884 sll t0, t7, 2 /* t0 = thiscolsum * 4 */ 885 subu t1, t0, t7 /* t1 = thiscolsum * 3 */ 886 shra_r.w t0, t0, 4 887 addiu t1, 7 888 addu t1, t1, t6 889 srl t1, t1, 4 890 sb t0, 0(s3) 891 sb t1, 1(s3) 892 beq t8, s0, 22f /* skip to final iteration if width == 3 */ 893 addiu s3, 2 8942: 895 lh t0, 0(s0) /* t0 = A3|A2 */ 896 lh t2, 0(s1) /* t2 = B3|B2 */ 897 addiu s0, 2 898 addiu s1, 2 899 preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */ 900 preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */ 901 shll.ph t1, t0, 1 902 sll t3, t6, 1 903 addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */ 904 addu t3, t3, t6 /* t3 = this * 3 */ 905 addu.ph t0, t0, t2 /* t0 = next2|next1 */ 906 addu t1, t3, t7 907 andi t7, t0, 0xFFFF /* t7 = next1 */ 908 sll t2, t7, 1 909 addu t2, t7, t2 /* t2 = next1*3 */ 910 addu t4, t2, t6 911 srl t6, t0, 16 /* t6 = next2 */ 912 shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */ 913 addu t0, t3, t7 914 addiu t0, 7 915 srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */ 916 shra_r.w t4, t4, 4 /* t3 = (next1*3 + this + 8) >> 4 */ 917 addu t2, t2, t6 918 addiu t2, 7 919 srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */ 920 sb t1, 0(s3) 921 sb t0, 1(s3) 922 sb t4, 2(s3) 923 sb t2, 3(s3) 924 bne t8, s0, 2b 925 addiu s3, 4 92622: 927 beqz s5, 4f 928 addu t8, s0, s5 9293: 930 lbu t0, 0(s0) 931 lbu t2, 0(s1) 932 addiu s0, 1 933 addiu s1, 1 934 sll t3, t6, 1 935 sll t1, t0, 1 936 addu t1, t0, t1 /* t1 = inptr0 * 3 */ 937 addu t3, t3, t6 /* t3 = thiscolsum * 3 */ 938 addu t5, t1, t2 939 addu t1, t3, t7 940 shra_r.w t1, t1, 4 941 addu t0, t3, t5 942 addiu t0, 7 943 srl t0, t0, 4 944 sb t1, 0(s3) 945 sb t0, 1(s3) 946 addiu s3, 2 947 move t7, t6 948 bne t8, s0, 3b 949 move t6, t5 9504: 951 sll t0, t6, 2 /* t0 = thiscolsum * 4 */ 952 subu t1, t0, t6 /* t1 = thiscolsum * 3 */ 953 addu t1, t1, t7 954 addiu s4, 4 955 shra_r.w t1, t1, 4 956 addiu t0, 7 957 srl t0, t0, 4 958 sb t1, 0(s3) 959 sb t0, 1(s3) 960 addiu t9, -1 961 addiu s3, 2 962 bnez t9, 1b 963 lw s1, 4(a2) 964 srl t0, s4, 2 965 subu t0, a0, t0 966 bgtz t0, 0b 967 addiu a2, 4 968 969 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 970 971 j ra 972 nop 973END(jsimd_h2v2_fancy_upsample_dspr2) 974 975 976/*****************************************************************************/ 977LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2) 978/* 979 * a0 = cinfo->max_v_samp_factor 980 * a1 = downsampled_width 981 * a2 = input_data 982 * a3 = output_data_ptr 983 */ 984 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 985 986 .set at 987 988 beqz a0, 3f 989 sll t0, a0, 2 990 lw s1, 0(a3) 991 li s3, 0x10001 992 addu s0, s1, t0 9930: 994 addiu t8, a1, -2 995 srl t9, t8, 2 996 lw t7, 0(a2) 997 lw s2, 0(s1) 998 lbu t0, 0(t7) 999 lbu t1, 1(t7) /* t1 = inptr[1] */ 1000 sll t2, t0, 1 1001 addu t2, t2, t0 /* t2 = invalue*3 */ 1002 addu t2, t2, t1 1003 shra_r.w t2, t2, 2 1004 sb t0, 0(s2) 1005 sb t2, 1(s2) 1006 beqz t9, 11f 1007 addiu s2, 2 10081: 1009 ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */ 1010 ulw t1, 1(t7) 1011 ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */ 1012 preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */ 1013 preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */ 1014 preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */ 1015 preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */ 1016 preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */ 1017 shll.ph t5, t4, 1 1018 shll.ph t6, t1, 1 1019 addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */ 1020 addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */ 1021 addu.ph t4, t3, s3 1022 addu.ph t0, t0, s3 1023 addu.ph t4, t4, t5 1024 addu.ph t0, t0, t6 1025 shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */ 1026 shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */ 1027 addu.ph t2, t2, t5 1028 addu.ph t3, t3, t6 1029 shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */ 1030 shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */ 1031 shll.ph t2, t2, 8 1032 shll.ph t3, t3, 8 1033 or t2, t4, t2 1034 or t3, t3, t0 1035 addiu t9, -1 1036 usw t3, 0(s2) 1037 usw t2, 4(s2) 1038 addiu s2, 8 1039 bgtz t9, 1b 1040 addiu t7, 4 104111: 1042 andi t8, 3 1043 beqz t8, 22f 1044 addiu t7, 1 1045 10462: 1047 lbu t0, 0(t7) 1048 addiu t7, 1 1049 sll t1, t0, 1 1050 addu t2, t0, t1 /* t2 = invalue */ 1051 lbu t3, -2(t7) 1052 lbu t4, 0(t7) 1053 addiu t3, 1 1054 addiu t4, 2 1055 addu t3, t3, t2 1056 addu t4, t4, t2 1057 srl t3, 2 1058 srl t4, 2 1059 sb t3, 0(s2) 1060 sb t4, 1(s2) 1061 addiu t8, -1 1062 bgtz t8, 2b 1063 addiu s2, 2 1064 106522: 1066 lbu t0, 0(t7) 1067 lbu t2, -1(t7) 1068 sll t1, t0, 1 1069 addu t1, t1, t0 /* t1 = invalue * 3 */ 1070 addu t1, t1, t2 1071 addiu t1, 1 1072 srl t1, t1, 2 1073 sb t1, 0(s2) 1074 sb t0, 1(s2) 1075 addiu s1, 4 1076 bne s1, s0, 0b 1077 addiu a2, 4 10783: 1079 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1080 1081 j ra 1082 nop 1083END(jsimd_h2v1_fancy_upsample_dspr2) 1084 1085 1086/*****************************************************************************/ 1087LEAF_DSPR2(jsimd_h2v1_downsample_dspr2) 1088/* 1089 * a0 = cinfo->image_width 1090 * a1 = cinfo->max_v_samp_factor 1091 * a2 = compptr->v_samp_factor 1092 * a3 = compptr->width_in_blocks 1093 * 16(sp) = input_data 1094 * 20(sp) = output_data 1095 */ 1096 .set at 1097 1098 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4 1099 1100 beqz a2, 7f 1101 lw s1, 44(sp) /* s1 = output_data */ 1102 lw s0, 40(sp) /* s0 = input_data */ 1103 srl s2, a0, 2 1104 andi t9, a0, 2 1105 srl t7, t9, 1 1106 addu s2, t7, s2 1107 sll t0, a3, 3 /* t0 = width_in_blocks*DCT */ 1108 srl t7, t0, 1 1109 subu s2, t7, s2 11100: 1111 andi t6, a0, 1 /* t6 = temp_index */ 1112 addiu t6, -1 1113 lw t4, 0(s1) /* t4 = outptr */ 1114 lw t5, 0(s0) /* t5 = inptr0 */ 1115 li s3, 0 /* s3 = bias */ 1116 srl t7, a0, 1 /* t7 = image_width1 */ 1117 srl s4, t7, 2 1118 andi t8, t7, 3 11191: 1120 ulhu t0, 0(t5) 1121 ulhu t1, 2(t5) 1122 ulhu t2, 4(t5) 1123 ulhu t3, 6(t5) 1124 raddu.w.qb t0, t0 1125 raddu.w.qb t1, t1 1126 raddu.w.qb t2, t2 1127 raddu.w.qb t3, t3 1128 shra.ph t0, t0, 1 1129 shra_r.ph t1, t1, 1 1130 shra.ph t2, t2, 1 1131 shra_r.ph t3, t3, 1 1132 sb t0, 0(t4) 1133 sb t1, 1(t4) 1134 sb t2, 2(t4) 1135 sb t3, 3(t4) 1136 addiu s4, -1 1137 addiu t4, 4 1138 bgtz s4, 1b 1139 addiu t5, 8 1140 beqz t8, 3f 1141 addu s4, t4, t8 11422: 1143 ulhu t0, 0(t5) 1144 raddu.w.qb t0, t0 1145 addqh.w t0, t0, s3 1146 xori s3, s3, 1 1147 sb t0, 0(t4) 1148 addiu t4, 1 1149 bne t4, s4, 2b 1150 addiu t5, 2 11513: 1152 lbux t1, t6(t5) 1153 sll t1, 1 1154 addqh.w t2, t1, s3 /* t2 = pixval1 */ 1155 xori s3, s3, 1 1156 addqh.w t3, t1, s3 /* t3 = pixval2 */ 1157 blez s2, 5f 1158 append t3, t2, 8 1159 addu t5, t4, s2 /* t5 = loop_end2 */ 11604: 1161 ush t3, 0(t4) 1162 addiu s2, -1 1163 bgtz s2, 4b 1164 addiu t4, 2 11655: 1166 beqz t9, 6f 1167 nop 1168 sb t2, 0(t4) 11696: 1170 addiu s1, 4 1171 addiu a2, -1 1172 bnez a2, 0b 1173 addiu s0, 4 11747: 1175 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4 1176 1177 j ra 1178 nop 1179END(jsimd_h2v1_downsample_dspr2) 1180 1181 1182/*****************************************************************************/ 1183LEAF_DSPR2(jsimd_h2v2_downsample_dspr2) 1184/* 1185 * a0 = cinfo->image_width 1186 * a1 = cinfo->max_v_samp_factor 1187 * a2 = compptr->v_samp_factor 1188 * a3 = compptr->width_in_blocks 1189 * 16(sp) = input_data 1190 * 20(sp) = output_data 1191 */ 1192 .set at 1193 1194 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1195 1196 beqz a2, 8f 1197 lw s1, 52(sp) /* s1 = output_data */ 1198 lw s0, 48(sp) /* s0 = input_data */ 1199 1200 andi t6, a0, 1 /* t6 = temp_index */ 1201 addiu t6, -1 1202 srl t7, a0, 1 /* t7 = image_width1 */ 1203 srl s4, t7, 2 1204 andi t8, t7, 3 1205 andi t9, a0, 2 1206 srl s2, a0, 2 1207 srl t7, t9, 1 1208 addu s2, t7, s2 1209 sll t0, a3, 3 /* s2 = width_in_blocks*DCT */ 1210 srl t7, t0, 1 1211 subu s2, t7, s2 12120: 1213 lw t4, 0(s1) /* t4 = outptr */ 1214 lw t5, 0(s0) /* t5 = inptr0 */ 1215 lw s7, 4(s0) /* s7 = inptr1 */ 1216 li s6, 1 /* s6 = bias */ 12172: 1218 ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */ 1219 ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */ 1220 ulw t2, 4(t5) 1221 ulw t3, 4(s7) 1222 precrq.ph.w t7, t0, t1 /* t2 = |P3|P2|Q3|Q2| */ 1223 ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */ 1224 raddu.w.qb t1, t7 1225 raddu.w.qb t0, t0 1226 shra_r.w t1, t1, 2 1227 addiu t0, 1 1228 srl t0, 2 1229 precrq.ph.w t7, t2, t3 1230 ins t2, t3, 16, 16 1231 raddu.w.qb t7, t7 1232 raddu.w.qb t2, t2 1233 shra_r.w t7, t7, 2 1234 addiu t2, 1 1235 srl t2, 2 1236 sb t0, 0(t4) 1237 sb t1, 1(t4) 1238 sb t2, 2(t4) 1239 sb t7, 3(t4) 1240 addiu t4, 4 1241 addiu t5, 8 1242 addiu s4, s4, -1 1243 bgtz s4, 2b 1244 addiu s7, 8 1245 beqz t8, 4f 1246 addu t8, t4, t8 12473: 1248 ulhu t0, 0(t5) 1249 ulhu t1, 0(s7) 1250 ins t0, t1, 16, 16 1251 raddu.w.qb t0, t0 1252 addu t0, t0, s6 1253 srl t0, 2 1254 xori s6, s6, 3 1255 sb t0, 0(t4) 1256 addiu t5, 2 1257 addiu t4, 1 1258 bne t8, t4, 3b 1259 addiu s7, 2 12604: 1261 lbux t1, t6(t5) 1262 sll t1, 1 1263 lbux t0, t6(s7) 1264 sll t0, 1 1265 addu t1, t1, t0 1266 addu t3, t1, s6 1267 srl t0, t3, 2 /* t2 = pixval1 */ 1268 xori s6, s6, 3 1269 addu t2, t1, s6 1270 srl t1, t2, 2 /* t3 = pixval2 */ 1271 blez s2, 6f 1272 append t1, t0, 8 12735: 1274 ush t1, 0(t4) 1275 addiu s2, -1 1276 bgtz s2, 5b 1277 addiu t4, 2 12786: 1279 beqz t9, 7f 1280 nop 1281 sb t0, 0(t4) 12827: 1283 addiu s1, 4 1284 addiu a2, -1 1285 bnez a2, 0b 1286 addiu s0, 8 12878: 1288 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1289 1290 j ra 1291 nop 1292END(jsimd_h2v2_downsample_dspr2) 1293 1294 1295/*****************************************************************************/ 1296LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2) 1297/* 1298 * a0 = input_data 1299 * a1 = output_data 1300 * a2 = compptr->v_samp_factor 1301 * a3 = cinfo->max_v_samp_factor 1302 * 16(sp) = cinfo->smoothing_factor 1303 * 20(sp) = compptr->width_in_blocks 1304 * 24(sp) = cinfo->image_width 1305 */ 1306 .set at 1307 1308 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1309 1310 lw s7, 52(sp) /* compptr->width_in_blocks */ 1311 lw s0, 56(sp) /* cinfo->image_width */ 1312 lw s6, 48(sp) /* cinfo->smoothing_factor */ 1313 sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */ 1314 sll v0, s7, 1 1315 subu v0, v0, s0 1316 blez v0, 2f 1317 move v1, zero 1318 addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */ 13190: 1320 addiu t1, a0, -4 1321 sll t2, v1, 2 1322 lwx t1, t2(t1) 1323 move t3, v0 1324 addu t1, t1, s0 1325 lbu t2, -1(t1) 13261: 1327 addiu t3, t3, -1 1328 sb t2, 0(t1) 1329 bgtz t3, 1b 1330 addiu t1, t1, 1 1331 addiu v1, v1, 1 1332 bne v1, t0, 0b 1333 nop 13342: 1335 li v0, 80 1336 mul v0, s6, v0 1337 li v1, 16384 1338 move t4, zero 1339 move t5, zero 1340 subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */ 1341 sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */ 13423: 1343/* Special case for first column: pretend column -1 is same as column 0 */ 1344 sll v0, t4, 2 1345 lwx t8, v0(a1) /* outptr = output_data[outrow] */ 1346 sll v1, t5, 2 1347 addiu t9, v1, 4 1348 addiu s0, v1, -4 1349 addiu s1, v1, 8 1350 lwx s2, v1(a0) /* inptr0 = input_data[inrow] */ 1351 lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */ 1352 lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */ 1353 lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */ 1354 lh v0, 0(s2) 1355 lh v1, 0(t9) 1356 lh t0, 0(s0) 1357 lh t1, 0(s1) 1358 ins v0, v1, 16, 16 1359 ins t0, t1, 16, 16 1360 raddu.w.qb t2, v0 1361 raddu.w.qb s3, t0 1362 lbu v0, 0(s2) 1363 lbu v1, 2(s2) 1364 lbu t0, 0(t9) 1365 lbu t1, 2(t9) 1366 addu v0, v0, v1 1367 mult $ac1, t2, t6 1368 addu t0, t0, t1 1369 lbu t2, 2(s0) 1370 addu t0, t0, v0 1371 lbu t3, 2(s1) 1372 addu s3, t0, s3 1373 lbu v0, 0(s0) 1374 lbu t0, 0(s1) 1375 sll s3, s3, 1 1376 addu v0, v0, t2 1377 addu t0, t0, t3 1378 addu t0, t0, v0 1379 addu s3, t0, s3 1380 madd $ac1, s3, t7 1381 extr_r.w v0, $ac1, 16 1382 addiu t8, t8, 1 1383 addiu s2, s2, 2 1384 addiu t9, t9, 2 1385 addiu s0, s0, 2 1386 addiu s1, s1, 2 1387 sb v0, -1(t8) 1388 addiu s4, s7, -2 1389 and s4, s4, 3 1390 addu s5, s4, t8 /* end address */ 13914: 1392 lh v0, 0(s2) 1393 lh v1, 0(t9) 1394 lh t0, 0(s0) 1395 lh t1, 0(s1) 1396 ins v0, v1, 16, 16 1397 ins t0, t1, 16, 16 1398 raddu.w.qb t2, v0 1399 raddu.w.qb s3, t0 1400 lbu v0, -1(s2) 1401 lbu v1, 2(s2) 1402 lbu t0, -1(t9) 1403 lbu t1, 2(t9) 1404 addu v0, v0, v1 1405 mult $ac1, t2, t6 1406 addu t0, t0, t1 1407 lbu t2, 2(s0) 1408 addu t0, t0, v0 1409 lbu t3, 2(s1) 1410 addu s3, t0, s3 1411 lbu v0, -1(s0) 1412 lbu t0, -1(s1) 1413 sll s3, s3, 1 1414 addu v0, v0, t2 1415 addu t0, t0, t3 1416 addu t0, t0, v0 1417 addu s3, t0, s3 1418 madd $ac1, s3, t7 1419 extr_r.w t2, $ac1, 16 1420 addiu t8, t8, 1 1421 addiu s2, s2, 2 1422 addiu t9, t9, 2 1423 addiu s0, s0, 2 1424 sb t2, -1(t8) 1425 bne s5, t8, 4b 1426 addiu s1, s1, 2 1427 addiu s5, s7, -2 1428 subu s5, s5, s4 1429 addu s5, s5, t8 /* end address */ 14305: 1431 lh v0, 0(s2) 1432 lh v1, 0(t9) 1433 lh t0, 0(s0) 1434 lh t1, 0(s1) 1435 ins v0, v1, 16, 16 1436 ins t0, t1, 16, 16 1437 raddu.w.qb t2, v0 1438 raddu.w.qb s3, t0 1439 lbu v0, -1(s2) 1440 lbu v1, 2(s2) 1441 lbu t0, -1(t9) 1442 lbu t1, 2(t9) 1443 addu v0, v0, v1 1444 mult $ac1, t2, t6 1445 addu t0, t0, t1 1446 lbu t2, 2(s0) 1447 addu t0, t0, v0 1448 lbu t3, 2(s1) 1449 addu s3, t0, s3 1450 lbu v0, -1(s0) 1451 lbu t0, -1(s1) 1452 sll s3, s3, 1 1453 addu v0, v0, t2 1454 addu t0, t0, t3 1455 lh v1, 2(t9) 1456 addu t0, t0, v0 1457 lh v0, 2(s2) 1458 addu s3, t0, s3 1459 lh t0, 2(s0) 1460 lh t1, 2(s1) 1461 madd $ac1, s3, t7 1462 extr_r.w t2, $ac1, 16 1463 ins t0, t1, 16, 16 1464 ins v0, v1, 16, 16 1465 raddu.w.qb s3, t0 1466 lbu v1, 4(s2) 1467 lbu t0, 1(t9) 1468 lbu t1, 4(t9) 1469 sb t2, 0(t8) 1470 raddu.w.qb t3, v0 1471 lbu v0, 1(s2) 1472 addu t0, t0, t1 1473 mult $ac1, t3, t6 1474 addu v0, v0, v1 1475 lbu t2, 4(s0) 1476 addu t0, t0, v0 1477 lbu v0, 1(s0) 1478 addu s3, t0, s3 1479 lbu t0, 1(s1) 1480 lbu t3, 4(s1) 1481 addu v0, v0, t2 1482 sll s3, s3, 1 1483 addu t0, t0, t3 1484 lh v1, 4(t9) 1485 addu t0, t0, v0 1486 lh v0, 4(s2) 1487 addu s3, t0, s3 1488 lh t0, 4(s0) 1489 lh t1, 4(s1) 1490 madd $ac1, s3, t7 1491 extr_r.w t2, $ac1, 16 1492 ins t0, t1, 16, 16 1493 ins v0, v1, 16, 16 1494 raddu.w.qb s3, t0 1495 lbu v1, 6(s2) 1496 lbu t0, 3(t9) 1497 lbu t1, 6(t9) 1498 sb t2, 1(t8) 1499 raddu.w.qb t3, v0 1500 lbu v0, 3(s2) 1501 addu t0, t0, t1 1502 mult $ac1, t3, t6 1503 addu v0, v0, v1 1504 lbu t2, 6(s0) 1505 addu t0, t0, v0 1506 lbu v0, 3(s0) 1507 addu s3, t0, s3 1508 lbu t0, 3(s1) 1509 lbu t3, 6(s1) 1510 addu v0, v0, t2 1511 sll s3, s3, 1 1512 addu t0, t0, t3 1513 lh v1, 6(t9) 1514 addu t0, t0, v0 1515 lh v0, 6(s2) 1516 addu s3, t0, s3 1517 lh t0, 6(s0) 1518 lh t1, 6(s1) 1519 madd $ac1, s3, t7 1520 extr_r.w t3, $ac1, 16 1521 ins t0, t1, 16, 16 1522 ins v0, v1, 16, 16 1523 raddu.w.qb s3, t0 1524 lbu v1, 8(s2) 1525 lbu t0, 5(t9) 1526 lbu t1, 8(t9) 1527 sb t3, 2(t8) 1528 raddu.w.qb t2, v0 1529 lbu v0, 5(s2) 1530 addu t0, t0, t1 1531 mult $ac1, t2, t6 1532 addu v0, v0, v1 1533 lbu t2, 8(s0) 1534 addu t0, t0, v0 1535 lbu v0, 5(s0) 1536 addu s3, t0, s3 1537 lbu t0, 5(s1) 1538 lbu t3, 8(s1) 1539 addu v0, v0, t2 1540 sll s3, s3, 1 1541 addu t0, t0, t3 1542 addiu t8, t8, 4 1543 addu t0, t0, v0 1544 addiu s2, s2, 8 1545 addu s3, t0, s3 1546 addiu t9, t9, 8 1547 madd $ac1, s3, t7 1548 extr_r.w t1, $ac1, 16 1549 addiu s0, s0, 8 1550 addiu s1, s1, 8 1551 bne s5, t8, 5b 1552 sb t1, -1(t8) 1553/* Special case for last column */ 1554 lh v0, 0(s2) 1555 lh v1, 0(t9) 1556 lh t0, 0(s0) 1557 lh t1, 0(s1) 1558 ins v0, v1, 16, 16 1559 ins t0, t1, 16, 16 1560 raddu.w.qb t2, v0 1561 raddu.w.qb s3, t0 1562 lbu v0, -1(s2) 1563 lbu v1, 1(s2) 1564 lbu t0, -1(t9) 1565 lbu t1, 1(t9) 1566 addu v0, v0, v1 1567 mult $ac1, t2, t6 1568 addu t0, t0, t1 1569 lbu t2, 1(s0) 1570 addu t0, t0, v0 1571 lbu t3, 1(s1) 1572 addu s3, t0, s3 1573 lbu v0, -1(s0) 1574 lbu t0, -1(s1) 1575 sll s3, s3, 1 1576 addu v0, v0, t2 1577 addu t0, t0, t3 1578 addu t0, t0, v0 1579 addu s3, t0, s3 1580 madd $ac1, s3, t7 1581 extr_r.w t0, $ac1, 16 1582 addiu t5, t5, 2 1583 sb t0, 0(t8) 1584 addiu t4, t4, 1 1585 bne t4, a2, 3b 1586 addiu t5, t5, 2 1587 1588 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1589 1590 j ra 1591 nop 1592 1593END(jsimd_h2v2_smooth_downsample_dspr2) 1594 1595 1596/*****************************************************************************/ 1597LEAF_DSPR2(jsimd_int_upsample_dspr2) 1598/* 1599 * a0 = upsample->h_expand[compptr->component_index] 1600 * a1 = upsample->v_expand[compptr->component_index] 1601 * a2 = input_data 1602 * a3 = output_data_ptr 1603 * 16(sp) = cinfo->output_width 1604 * 20(sp) = cinfo->max_v_samp_factor 1605 */ 1606 .set at 1607 1608 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 1609 1610 lw s0, 0(a3) /* s0 = output_data */ 1611 lw s1, 32(sp) /* s1 = cinfo->output_width */ 1612 lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */ 1613 li t6, 0 /* t6 = inrow */ 1614 beqz s2, 10f 1615 li s3, 0 /* s3 = outrow */ 16160: 1617 addu t0, a2, t6 1618 addu t7, s0, s3 1619 lw t3, 0(t0) /* t3 = inptr */ 1620 lw t8, 0(t7) /* t8 = outptr */ 1621 beqz s1, 4f 1622 addu t5, t8, s1 /* t5 = outend */ 16231: 1624 lb t2, 0(t3) /* t2 = invalue = *inptr++ */ 1625 addiu t3, 1 1626 beqz a0, 3f 1627 move t0, a0 /* t0 = h_expand */ 16282: 1629 sb t2, 0(t8) 1630 addiu t0, -1 1631 bgtz t0, 2b 1632 addiu t8, 1 16333: 1634 bgt t5, t8, 1b 1635 nop 16364: 1637 addiu t9, a1, -1 /* t9 = v_expand - 1 */ 1638 blez t9, 9f 1639 nop 16405: 1641 lw t3, 0(s0) 1642 lw t4, 4(s0) 1643 subu t0, s1, 0xF 1644 blez t0, 7f 1645 addu t5, t3, s1 /* t5 = end address */ 1646 andi t7, s1, 0xF /* t7 = residual */ 1647 subu t8, t5, t7 16486: 1649 ulw t0, 0(t3) 1650 ulw t1, 4(t3) 1651 ulw t2, 8(t3) 1652 usw t0, 0(t4) 1653 ulw t0, 12(t3) 1654 usw t1, 4(t4) 1655 usw t2, 8(t4) 1656 usw t0, 12(t4) 1657 addiu t3, 16 1658 bne t3, t8, 6b 1659 addiu t4, 16 1660 beqz t7, 8f 1661 nop 16627: 1663 lbu t0, 0(t3) 1664 sb t0, 0(t4) 1665 addiu t3, 1 1666 bne t3, t5, 7b 1667 addiu t4, 1 16688: 1669 addiu t9, -1 1670 bgtz t9, 5b 1671 addiu s0, 8 16729: 1673 addu s3, s3, a1 1674 bne s3, s2, 0b 1675 addiu t6, 1 167610: 1677 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 1678 1679 j ra 1680 nop 1681END(jsimd_int_upsample_dspr2) 1682 1683 1684/*****************************************************************************/ 1685LEAF_DSPR2(jsimd_h2v1_upsample_dspr2) 1686/* 1687 * a0 = cinfo->max_v_samp_factor 1688 * a1 = cinfo->output_width 1689 * a2 = input_data 1690 * a3 = output_data_ptr 1691 */ 1692 lw t7, 0(a3) /* t7 = output_data */ 1693 andi t8, a1, 0xf /* t8 = residual */ 1694 sll t0, a0, 2 1695 blez a0, 4f 1696 addu t9, t7, t0 /* t9 = output_data end address */ 16970: 1698 lw t5, 0(t7) /* t5 = outptr */ 1699 lw t6, 0(a2) /* t6 = inptr */ 1700 addu t3, t5, a1 /* t3 = outptr + output_width (end address) */ 1701 subu t3, t8 /* t3 = end address - residual */ 1702 beq t5, t3, 2f 1703 move t4, t8 17041: 1705 ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */ 1706 ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */ 1707 srl t1, t0, 16 /* t1 = |X|X|P3|P2| */ 1708 ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */ 1709 ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */ 1710 ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */ 1711 ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */ 1712 usw t0, 0(t5) 1713 usw t1, 4(t5) 1714 srl t0, t2, 16 /* t0 = |X|X|P7|P6| */ 1715 ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */ 1716 ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */ 1717 ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */ 1718 ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */ 1719 usw t2, 8(t5) 1720 usw t0, 12(t5) 1721 addiu t5, 16 1722 bne t5, t3, 1b 1723 addiu t6, 8 1724 beqz t8, 3f 1725 move t4, t8 17262: 1727 lbu t1, 0(t6) 1728 sb t1, 0(t5) 1729 sb t1, 1(t5) 1730 addiu t4, -2 1731 addiu t6, 1 1732 bgtz t4, 2b 1733 addiu t5, 2 17343: 1735 addiu t7, 4 1736 bne t9, t7, 0b 1737 addiu a2, 4 17384: 1739 j ra 1740 nop 1741END(jsimd_h2v1_upsample_dspr2) 1742 1743 1744/*****************************************************************************/ 1745LEAF_DSPR2(jsimd_h2v2_upsample_dspr2) 1746/* 1747 * a0 = cinfo->max_v_samp_factor 1748 * a1 = cinfo->output_width 1749 * a2 = input_data 1750 * a3 = output_data_ptr 1751 */ 1752 lw t7, 0(a3) 1753 blez a0, 7f 1754 andi t9, a1, 0xf /* t9 = residual */ 17550: 1756 lw t6, 0(a2) /* t6 = inptr */ 1757 lw t5, 0(t7) /* t5 = outptr */ 1758 addu t8, t5, a1 /* t8 = outptr end address */ 1759 subu t8, t9 /* t8 = end address - residual */ 1760 beq t5, t8, 2f 1761 move t4, t9 17621: 1763 ulw t0, 0(t6) 1764 srl t1, t0, 16 1765 ins t0, t0, 16, 16 1766 ins t0, t0, 8, 16 1767 ins t1, t1, 16, 16 1768 ins t1, t1, 8, 16 1769 ulw t2, 4(t6) 1770 usw t0, 0(t5) 1771 usw t1, 4(t5) 1772 srl t3, t2, 16 1773 ins t2, t2, 16, 16 1774 ins t2, t2, 8, 16 1775 ins t3, t3, 16, 16 1776 ins t3, t3, 8, 16 1777 usw t2, 8(t5) 1778 usw t3, 12(t5) 1779 addiu t5, 16 1780 bne t5, t8, 1b 1781 addiu t6, 8 1782 beqz t9, 3f 1783 move t4, t9 17842: 1785 lbu t0, 0(t6) 1786 sb t0, 0(t5) 1787 sb t0, 1(t5) 1788 addiu t4, -2 1789 addiu t6, 1 1790 bgtz t4, 2b 1791 addiu t5, 2 17923: 1793 lw t6, 0(t7) /* t6 = outptr[0] */ 1794 lw t5, 4(t7) /* t5 = outptr[1] */ 1795 addu t4, t6, a1 /* t4 = new end address */ 1796 beq a1, t9, 5f 1797 subu t8, t4, t9 17984: 1799 ulw t0, 0(t6) 1800 ulw t1, 4(t6) 1801 ulw t2, 8(t6) 1802 usw t0, 0(t5) 1803 ulw t0, 12(t6) 1804 usw t1, 4(t5) 1805 usw t2, 8(t5) 1806 usw t0, 12(t5) 1807 addiu t6, 16 1808 bne t6, t8, 4b 1809 addiu t5, 16 1810 beqz t9, 6f 1811 nop 18125: 1813 lbu t0, 0(t6) 1814 sb t0, 0(t5) 1815 addiu t6, 1 1816 bne t6, t4, 5b 1817 addiu t5, 1 18186: 1819 addiu t7, 8 1820 addiu a0, -2 1821 bgtz a0, 0b 1822 addiu a2, 4 18237: 1824 j ra 1825 nop 1826END(jsimd_h2v2_upsample_dspr2) 1827 1828 1829/*****************************************************************************/ 1830LEAF_DSPR2(jsimd_idct_islow_dspr2) 1831/* 1832 * a0 = coef_block 1833 * a1 = compptr->dcttable 1834 * a2 = output 1835 * a3 = range_limit 1836 */ 1837 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 1838 1839 addiu sp, sp, -256 1840 move v0, sp 1841 addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */ 18421: 1843 lh s4, 32(a0) /* s4 = inptr[16] */ 1844 lh s5, 64(a0) /* s5 = inptr[32] */ 1845 lh s6, 96(a0) /* s6 = inptr[48] */ 1846 lh t1, 112(a0) /* t1 = inptr[56] */ 1847 lh t7, 16(a0) /* t7 = inptr[8] */ 1848 lh t5, 80(a0) /* t5 = inptr[40] */ 1849 lh t3, 48(a0) /* t3 = inptr[24] */ 1850 or s4, s4, t1 1851 or s4, s4, t3 1852 or s4, s4, t5 1853 or s4, s4, t7 1854 or s4, s4, s5 1855 or s4, s4, s6 1856 bnez s4, 2f 1857 addiu v1, v1, -1 1858 lh s5, 0(a1) /* quantptr[DCTSIZE*0] */ 1859 lh s6, 0(a0) /* inptr[DCTSIZE*0] */ 1860 mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */ 1861 sll s5, s5, 2 1862 sw s5, 0(v0) 1863 sw s5, 32(v0) 1864 sw s5, 64(v0) 1865 sw s5, 96(v0) 1866 sw s5, 128(v0) 1867 sw s5, 160(v0) 1868 sw s5, 192(v0) 1869 b 3f 1870 sw s5, 224(v0) 18712: 1872 lh t0, 112(a1) 1873 lh t2, 48(a1) 1874 lh t4, 80(a1) 1875 lh t6, 16(a1) 1876 mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7], 1877 quantptr[DCTSIZE*7]) */ 1878 mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3], 1879 quantptr[DCTSIZE*3]) */ 1880 mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5], 1881 quantptr[DCTSIZE*5]) */ 1882 mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1], 1883 quantptr[DCTSIZE*1]) */ 1884 lh t4, 32(a1) 1885 lh t5, 32(a0) 1886 lh t6, 96(a1) 1887 lh t7, 96(a0) 1888 addu s0, t0, t1 /* z3 = tmp0 + tmp2 */ 1889 addu s1, t1, t2 /* z2 = tmp1 + tmp2 */ 1890 addu s2, t2, t3 /* z4 = tmp1 + tmp3 */ 1891 addu s3, s0, s2 /* z3 + z4 */ 1892 addiu t9, zero, 9633 /* FIX_1_175875602 */ 1893 mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 1894 addu t8, t0, t3 /* z1 = tmp0 + tmp3 */ 1895 addiu t9, zero, 2446 /* FIX_0_298631336 */ 1896 mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 1897 addiu t9, zero, 16819 /* FIX_2_053119869 */ 1898 mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 1899 addiu t9, zero, 25172 /* FIX_3_072711026 */ 1900 mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 1901 addiu t9, zero, 12299 /* FIX_1_501321110 */ 1902 mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 1903 addiu t9, zero, 16069 /* FIX_1_961570560 */ 1904 mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ 1905 addiu t9, zero, 3196 /* FIX_0_390180644 */ 1906 mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ 1907 addiu t9, zero, 7373 /* FIX_0_899976223 */ 1908 mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ 1909 addiu t9, zero, 20995 /* FIX_2_562915447 */ 1910 mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ 1911 subu s0, s3, s0 /* z3 += z5 */ 1912 addu t0, t0, s0 /* tmp0 += z3 */ 1913 addu t1, t1, s0 /* tmp2 += z3 */ 1914 subu s2, s3, s2 /* z4 += z5 */ 1915 addu t2, t2, s2 /* tmp1 += z4 */ 1916 addu t3, t3, s2 /* tmp3 += z4 */ 1917 subu t0, t0, t8 /* tmp0 += z1 */ 1918 subu t1, t1, s1 /* tmp2 += z2 */ 1919 subu t2, t2, s1 /* tmp1 += z2 */ 1920 subu t3, t3, t8 /* tmp3 += z1 */ 1921 mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2], 1922 quantptr[DCTSIZE*2]) */ 1923 addiu t9, zero, 6270 /* FIX_0_765366865 */ 1924 mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6], 1925 quantptr[DCTSIZE*6]) */ 1926 lh t4, 0(a1) 1927 lh t5, 0(a0) 1928 lh t6, 64(a1) 1929 lh t7, 64(a0) 1930 mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */ 1931 mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0], 1932 quantptr[DCTSIZE*0]) */ 1933 mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4], 1934 quantptr[DCTSIZE*4]) */ 1935 addiu t9, zero, 4433 /* FIX_0_541196100 */ 1936 addu s3, s0, s1 /* z2 + z3 */ 1937 mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ 1938 addiu t9, zero, 15137 /* FIX_1_847759065 */ 1939 mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */ 1940 addu t4, t5, t6 1941 subu t5, t5, t6 1942 sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */ 1943 sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */ 1944 addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */ 1945 subu t6, s3, t8 /* tmp2 = 1946 z1 + MULTIPLY(z3, -FIX_1_847759065) */ 1947 addu s0, t4, t7 1948 subu s1, t4, t7 1949 addu s2, t5, t6 1950 subu s3, t5, t6 1951 addu t4, s0, t3 1952 subu s0, s0, t3 1953 addu t3, s2, t1 1954 subu s2, s2, t1 1955 addu t1, s3, t2 1956 subu s3, s3, t2 1957 addu t2, s1, t0 1958 subu s1, s1, t0 1959 shra_r.w t4, t4, 11 1960 shra_r.w t3, t3, 11 1961 shra_r.w t1, t1, 11 1962 shra_r.w t2, t2, 11 1963 shra_r.w s1, s1, 11 1964 shra_r.w s3, s3, 11 1965 shra_r.w s2, s2, 11 1966 shra_r.w s0, s0, 11 1967 sw t4, 0(v0) 1968 sw t3, 32(v0) 1969 sw t1, 64(v0) 1970 sw t2, 96(v0) 1971 sw s1, 128(v0) 1972 sw s3, 160(v0) 1973 sw s2, 192(v0) 1974 sw s0, 224(v0) 19753: 1976 addiu a1, a1, 2 1977 addiu a0, a0, 2 1978 bgtz v1, 1b 1979 addiu v0, v0, 4 1980 move v0, sp 1981 addiu v1, zero, 8 19824: 1983 lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */ 1984 lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */ 1985 lw t2, 0(v0) /* (JLONG)wsptr[0] */ 1986 lw t3, 16(v0) /* (JLONG)wsptr[4] */ 1987 lw s4, 4(v0) /* (JLONG)wsptr[1] */ 1988 lw s5, 12(v0) /* (JLONG)wsptr[3] */ 1989 lw s6, 20(v0) /* (JLONG)wsptr[5] */ 1990 lw s7, 28(v0) /* (JLONG)wsptr[7] */ 1991 or s4, s4, t0 1992 or s4, s4, t1 1993 or s4, s4, t3 1994 or s4, s4, s7 1995 or s4, s4, s5 1996 or s4, s4, s6 1997 bnez s4, 5f 1998 addiu v1, v1, -1 1999 shra_r.w s5, t2, 5 2000 andi s5, s5, 0x3ff 2001 lbux s5, s5(a3) 2002 lw s1, 0(a2) 2003 replv.qb s5, s5 2004 usw s5, 0(s1) 2005 usw s5, 4(s1) 2006 b 6f 2007 nop 20085: 2009 addu t4, t0, t1 /* z2 + z3 */ 2010 addiu t8, zero, 4433 /* FIX_0_541196100 */ 2011 mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */ 2012 addiu t8, zero, 15137 /* FIX_1_847759065 */ 2013 mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */ 2014 addiu t8, zero, 6270 /* FIX_0_765366865 */ 2015 mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */ 2016 addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */ 2017 subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */ 2018 sll t4, t4, 13 /* tmp0 = 2019 (wsptr[0] + wsptr[4]) << CONST_BITS */ 2020 sll t2, t2, 13 /* tmp1 = 2021 (wsptr[0] - wsptr[4]) << CONST_BITS */ 2022 subu t1, t5, t1 /* tmp2 = 2023 z1 + MULTIPLY(z3, -FIX_1_847759065) */ 2024 subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */ 2025 addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */ 2026 addu t5, t5, t0 /* tmp3 = 2027 z1 + MULTIPLY(z2, FIX_0_765366865) */ 2028 subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */ 2029 addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */ 2030 lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */ 2031 lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */ 2032 lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */ 2033 lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */ 2034 addu s0, t4, t6 /* z3 = tmp0 + tmp2 */ 2035 addiu t8, zero, 9633 /* FIX_1_175875602 */ 2036 addu s1, t5, t7 /* z4 = tmp1 + tmp3 */ 2037 addu s2, s0, s1 /* z3 + z4 */ 2038 mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */ 2039 addu s3, t4, t7 /* z1 = tmp0 + tmp3 */ 2040 addu t9, t5, t6 /* z2 = tmp1 + tmp2 */ 2041 addiu t8, zero, 16069 /* FIX_1_961570560 */ 2042 mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */ 2043 addiu t8, zero, 3196 /* FIX_0_390180644 */ 2044 mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */ 2045 addiu t8, zero, 2446 /* FIX_0_298631336 */ 2046 mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */ 2047 addiu t8, zero, 7373 /* FIX_0_899976223 */ 2048 mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */ 2049 addiu t8, zero, 16819 /* FIX_2_053119869 */ 2050 mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */ 2051 addiu t8, zero, 20995 /* FIX_2_562915447 */ 2052 mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */ 2053 addiu t8, zero, 25172 /* FIX_3_072711026 */ 2054 mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */ 2055 addiu t8, zero, 12299 /* FIX_1_501321110 */ 2056 mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */ 2057 subu s0, s2, s0 /* z3 += z5 */ 2058 subu s1, s2, s1 /* z4 += z5 */ 2059 addu t4, t4, s0 2060 subu t4, t4, s3 /* tmp0 */ 2061 addu t5, t5, s1 2062 subu t5, t5, t9 /* tmp1 */ 2063 addu t6, t6, s0 2064 subu t6, t6, t9 /* tmp2 */ 2065 addu t7, t7, s1 2066 subu t7, t7, s3 /* tmp3 */ 2067 addu s0, t0, t7 2068 subu t0, t0, t7 2069 addu t7, t2, t6 2070 subu t2, t2, t6 2071 addu t6, t3, t5 2072 subu t3, t3, t5 2073 addu t5, t1, t4 2074 subu t1, t1, t4 2075 shra_r.w s0, s0, 18 2076 shra_r.w t7, t7, 18 2077 shra_r.w t6, t6, 18 2078 shra_r.w t5, t5, 18 2079 shra_r.w t1, t1, 18 2080 shra_r.w t3, t3, 18 2081 shra_r.w t2, t2, 18 2082 shra_r.w t0, t0, 18 2083 andi s0, s0, 0x3ff 2084 andi t7, t7, 0x3ff 2085 andi t6, t6, 0x3ff 2086 andi t5, t5, 0x3ff 2087 andi t1, t1, 0x3ff 2088 andi t3, t3, 0x3ff 2089 andi t2, t2, 0x3ff 2090 andi t0, t0, 0x3ff 2091 lw s1, 0(a2) 2092 lbux s0, s0(a3) 2093 lbux t7, t7(a3) 2094 lbux t6, t6(a3) 2095 lbux t5, t5(a3) 2096 lbux t1, t1(a3) 2097 lbux t3, t3(a3) 2098 lbux t2, t2(a3) 2099 lbux t0, t0(a3) 2100 sb s0, 0(s1) 2101 sb t7, 1(s1) 2102 sb t6, 2(s1) 2103 sb t5, 3(s1) 2104 sb t1, 4(s1) 2105 sb t3, 5(s1) 2106 sb t2, 6(s1) 2107 sb t0, 7(s1) 21086: 2109 addiu v0, v0, 32 2110 bgtz v1, 4b 2111 addiu a2, a2, 4 2112 addiu sp, sp, 256 2113 2114 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2115 2116 j ra 2117 nop 2118 2119END(jsimd_idct_islow_dspr2) 2120 2121 2122/*****************************************************************************/ 2123LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2) 2124/* 2125 * a0 = inptr 2126 * a1 = quantptr 2127 * a2 = wsptr 2128 * a3 = mips_idct_ifast_coefs 2129 */ 2130 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2131 2132 addiu t9, a0, 16 /* end address */ 2133 or AT, a3, zero 2134 21350: 2136 lw s0, 0(a1) /* quantptr[DCTSIZE*0] */ 2137 lw t0, 0(a0) /* inptr[DCTSIZE*0] */ 2138 lw t1, 16(a0) /* inptr[DCTSIZE*1] */ 2139 muleq_s.w.phl v0, t0, s0 /* tmp0 ... */ 2140 lw t2, 32(a0) /* inptr[DCTSIZE*2] */ 2141 lw t3, 48(a0) /* inptr[DCTSIZE*3] */ 2142 lw t4, 64(a0) /* inptr[DCTSIZE*4] */ 2143 lw t5, 80(a0) /* inptr[DCTSIZE*5] */ 2144 muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */ 2145 lw t6, 96(a0) /* inptr[DCTSIZE*6] */ 2146 lw t7, 112(a0) /* inptr[DCTSIZE*7] */ 2147 or s4, t1, t2 2148 or s5, t3, t4 2149 bnez s4, 1f 2150 ins t0, v0, 16, 16 /* ... tmp0 */ 2151 bnez s5, 1f 2152 or s6, t5, t6 2153 or s6, s6, t7 2154 bnez s6, 1f 2155 sw t0, 0(a2) /* wsptr[DCTSIZE*0] */ 2156 sw t0, 16(a2) /* wsptr[DCTSIZE*1] */ 2157 sw t0, 32(a2) /* wsptr[DCTSIZE*2] */ 2158 sw t0, 48(a2) /* wsptr[DCTSIZE*3] */ 2159 sw t0, 64(a2) /* wsptr[DCTSIZE*4] */ 2160 sw t0, 80(a2) /* wsptr[DCTSIZE*5] */ 2161 sw t0, 96(a2) /* wsptr[DCTSIZE*6] */ 2162 sw t0, 112(a2) /* wsptr[DCTSIZE*7] */ 2163 addiu a0, a0, 4 2164 b 2f 2165 addiu a1, a1, 4 2166 21671: 2168 lw s1, 32(a1) /* quantptr[DCTSIZE*2] */ 2169 lw s2, 64(a1) /* quantptr[DCTSIZE*4] */ 2170 muleq_s.w.phl v0, t2, s1 /* tmp1 ... */ 2171 muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */ 2172 lw s0, 16(a1) /* quantptr[DCTSIZE*1] */ 2173 lw s1, 48(a1) /* quantptr[DCTSIZE*3] */ 2174 lw s3, 96(a1) /* quantptr[DCTSIZE*6] */ 2175 muleq_s.w.phl v1, t4, s2 /* tmp2 ... */ 2176 muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */ 2177 lw s2, 80(a1) /* quantptr[DCTSIZE*5] */ 2178 lw t8, 4(AT) /* FIX(1.414213562) */ 2179 ins t2, v0, 16, 16 /* ... tmp1 */ 2180 muleq_s.w.phl v0, t6, s3 /* tmp3 ... */ 2181 muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */ 2182 ins t4, v1, 16, 16 /* ... tmp2 */ 2183 addq.ph s4, t0, t4 /* tmp10 */ 2184 subq.ph s5, t0, t4 /* tmp11 */ 2185 ins t6, v0, 16, 16 /* ... tmp3 */ 2186 subq.ph s6, t2, t6 /* tmp12 ... */ 2187 addq.ph s7, t2, t6 /* tmp13 */ 2188 mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ 2189 addq.ph t0, s4, s7 /* tmp0 */ 2190 subq.ph t6, s4, s7 /* tmp3 */ 2191 muleq_s.w.phl v0, t1, s0 /* tmp4 ... */ 2192 muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */ 2193 shll_s.ph s6, s6, 1 /* x2 */ 2194 lw s3, 112(a1) /* quantptr[DCTSIZE*7] */ 2195 subq.ph s6, s6, s7 /* ... tmp12 */ 2196 muleq_s.w.phl v1, t7, s3 /* tmp7 ... */ 2197 muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */ 2198 ins t1, v0, 16, 16 /* ... tmp4 */ 2199 addq.ph t2, s5, s6 /* tmp1 */ 2200 subq.ph t4, s5, s6 /* tmp2 */ 2201 muleq_s.w.phl v0, t5, s2 /* tmp6 ... */ 2202 muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */ 2203 ins t7, v1, 16, 16 /* ... tmp7 */ 2204 addq.ph s5, t1, t7 /* z11 */ 2205 subq.ph s6, t1, t7 /* z12 */ 2206 muleq_s.w.phl v1, t3, s1 /* tmp5 ... */ 2207 muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */ 2208 ins t5, v0, 16, 16 /* ... tmp6 */ 2209 ins t3, v1, 16, 16 /* ... tmp5 */ 2210 addq.ph s7, t5, t3 /* z13 */ 2211 subq.ph v0, t5, t3 /* z10 */ 2212 addq.ph t7, s5, s7 /* tmp7 */ 2213 subq.ph s5, s5, s7 /* tmp11 ... */ 2214 addq.ph v1, v0, s6 /* z5 ... */ 2215 mulq_s.ph s5, s5, t8 /* ... tmp11 */ 2216 lw t8, 8(AT) /* FIX(1.847759065) */ 2217 lw s4, 0(AT) /* FIX(1.082392200) */ 2218 addq.ph s0, t0, t7 2219 subq.ph s1, t0, t7 2220 mulq_s.ph v1, v1, t8 /* ... z5 */ 2221 shll_s.ph s5, s5, 1 /* x2 */ 2222 lw t8, 12(AT) /* FIX(-2.613125930) */ 2223 sw s0, 0(a2) /* wsptr[DCTSIZE*0] */ 2224 shll_s.ph v0, v0, 1 /* x4 */ 2225 mulq_s.ph v0, v0, t8 /* tmp12 ... */ 2226 mulq_s.ph s4, s6, s4 /* tmp10 ... */ 2227 shll_s.ph v1, v1, 1 /* x2 */ 2228 addiu a0, a0, 4 2229 addiu a1, a1, 4 2230 sw s1, 112(a2) /* wsptr[DCTSIZE*7] */ 2231 shll_s.ph s6, v0, 1 /* x4 */ 2232 shll_s.ph s4, s4, 1 /* x2 */ 2233 addq.ph s6, s6, v1 /* ... tmp12 */ 2234 subq.ph t5, s6, t7 /* tmp6 */ 2235 subq.ph s4, s4, v1 /* ... tmp10 */ 2236 subq.ph t3, s5, t5 /* tmp5 */ 2237 addq.ph s2, t2, t5 2238 addq.ph t1, s4, t3 /* tmp4 */ 2239 subq.ph s3, t2, t5 2240 sw s2, 16(a2) /* wsptr[DCTSIZE*1] */ 2241 sw s3, 96(a2) /* wsptr[DCTSIZE*6] */ 2242 addq.ph v0, t4, t3 2243 subq.ph v1, t4, t3 2244 sw v0, 32(a2) /* wsptr[DCTSIZE*2] */ 2245 sw v1, 80(a2) /* wsptr[DCTSIZE*5] */ 2246 addq.ph v0, t6, t1 2247 subq.ph v1, t6, t1 2248 sw v0, 64(a2) /* wsptr[DCTSIZE*4] */ 2249 sw v1, 48(a2) /* wsptr[DCTSIZE*3] */ 2250 22512: 2252 bne a0, t9, 0b 2253 addiu a2, a2, 4 2254 2255 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 2256 2257 j ra 2258 nop 2259 2260END(jsimd_idct_ifast_cols_dspr2) 2261 2262 2263/*****************************************************************************/ 2264LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2) 2265/* 2266 * a0 = wsptr 2267 * a1 = output_buf 2268 * a2 = output_col 2269 * a3 = mips_idct_ifast_coefs 2270 */ 2271 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2272 2273 addiu t9, a0, 128 /* end address */ 2274 lui s8, 0x8080 2275 ori s8, s8, 0x8080 2276 22770: 2278 lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */ 2279 lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */ 2280 lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */ 2281 lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */ 2282 lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */ 2283 lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */ 2284 lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */ 2285 lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */ 2286 lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */ 2287 precrq.ph.w t1, s0, t0 /* B b */ 2288 ins t0, s0, 16, 16 /* A a */ 2289 bnez t1, 1f 2290 or s0, t2, s2 2291 bnez s0, 1f 2292 or s0, t4, s4 2293 bnez s0, 1f 2294 or s0, t6, s6 2295 bnez s0, 1f 2296 shll_s.ph s0, t0, 2 /* A a */ 2297 lw a3, 0(a1) 2298 lw AT, 4(a1) 2299 precrq.ph.w t0, s0, s0 /* A A */ 2300 ins s0, s0, 16, 16 /* a a */ 2301 addu a3, a3, a2 2302 addu AT, AT, a2 2303 precrq.qb.ph t0, t0, t0 /* A A A A */ 2304 precrq.qb.ph s0, s0, s0 /* a a a a */ 2305 addu.qb s0, s0, s8 2306 addu.qb t0, t0, s8 2307 sw s0, 0(a3) 2308 sw s0, 4(a3) 2309 sw t0, 0(AT) 2310 sw t0, 4(AT) 2311 addiu a0, a0, 32 2312 bne a0, t9, 0b 2313 addiu a1, a1, 8 2314 b 2f 2315 nop 2316 23171: 2318 precrq.ph.w t3, s2, t2 2319 ins t2, s2, 16, 16 2320 precrq.ph.w t5, s4, t4 2321 ins t4, s4, 16, 16 2322 precrq.ph.w t7, s6, t6 2323 ins t6, s6, 16, 16 2324 lw t8, 4(AT) /* FIX(1.414213562) */ 2325 addq.ph s4, t0, t4 /* tmp10 */ 2326 subq.ph s5, t0, t4 /* tmp11 */ 2327 subq.ph s6, t2, t6 /* tmp12 ... */ 2328 addq.ph s7, t2, t6 /* tmp13 */ 2329 mulq_s.ph s6, s6, t8 /* ... tmp12 ... */ 2330 addq.ph t0, s4, s7 /* tmp0 */ 2331 subq.ph t6, s4, s7 /* tmp3 */ 2332 shll_s.ph s6, s6, 1 /* x2 */ 2333 subq.ph s6, s6, s7 /* ... tmp12 */ 2334 addq.ph t2, s5, s6 /* tmp1 */ 2335 subq.ph t4, s5, s6 /* tmp2 */ 2336 addq.ph s5, t1, t7 /* z11 */ 2337 subq.ph s6, t1, t7 /* z12 */ 2338 addq.ph s7, t5, t3 /* z13 */ 2339 subq.ph v0, t5, t3 /* z10 */ 2340 addq.ph t7, s5, s7 /* tmp7 */ 2341 subq.ph s5, s5, s7 /* tmp11 ... */ 2342 addq.ph v1, v0, s6 /* z5 ... */ 2343 mulq_s.ph s5, s5, t8 /* ... tmp11 */ 2344 lw t8, 8(AT) /* FIX(1.847759065) */ 2345 lw s4, 0(AT) /* FIX(1.082392200) */ 2346 addq.ph s0, t0, t7 /* tmp0 + tmp7 */ 2347 subq.ph s7, t0, t7 /* tmp0 - tmp7 */ 2348 mulq_s.ph v1, v1, t8 /* ... z5 */ 2349 lw a3, 0(a1) 2350 lw t8, 12(AT) /* FIX(-2.613125930) */ 2351 shll_s.ph s5, s5, 1 /* x2 */ 2352 addu a3, a3, a2 2353 shll_s.ph v0, v0, 1 /* x4 */ 2354 mulq_s.ph v0, v0, t8 /* tmp12 ... */ 2355 mulq_s.ph s4, s6, s4 /* tmp10 ... */ 2356 shll_s.ph v1, v1, 1 /* x2 */ 2357 addiu a0, a0, 32 2358 addiu a1, a1, 8 2359 shll_s.ph s6, v0, 1 /* x4 */ 2360 shll_s.ph s4, s4, 1 /* x2 */ 2361 addq.ph s6, s6, v1 /* ... tmp12 */ 2362 shll_s.ph s0, s0, 2 2363 subq.ph t5, s6, t7 /* tmp6 */ 2364 subq.ph s4, s4, v1 /* ... tmp10 */ 2365 subq.ph t3, s5, t5 /* tmp5 */ 2366 shll_s.ph s7, s7, 2 2367 addq.ph t1, s4, t3 /* tmp4 */ 2368 addq.ph s1, t2, t5 /* tmp1 + tmp6 */ 2369 subq.ph s6, t2, t5 /* tmp1 - tmp6 */ 2370 addq.ph s2, t4, t3 /* tmp2 + tmp5 */ 2371 subq.ph s5, t4, t3 /* tmp2 - tmp5 */ 2372 addq.ph s4, t6, t1 /* tmp3 + tmp4 */ 2373 subq.ph s3, t6, t1 /* tmp3 - tmp4 */ 2374 shll_s.ph s1, s1, 2 2375 shll_s.ph s2, s2, 2 2376 shll_s.ph s3, s3, 2 2377 shll_s.ph s4, s4, 2 2378 shll_s.ph s5, s5, 2 2379 shll_s.ph s6, s6, 2 2380 precrq.ph.w t0, s1, s0 /* B A */ 2381 ins s0, s1, 16, 16 /* b a */ 2382 precrq.ph.w t2, s3, s2 /* D C */ 2383 ins s2, s3, 16, 16 /* d c */ 2384 precrq.ph.w t4, s5, s4 /* F E */ 2385 ins s4, s5, 16, 16 /* f e */ 2386 precrq.ph.w t6, s7, s6 /* H G */ 2387 ins s6, s7, 16, 16 /* h g */ 2388 precrq.qb.ph t0, t2, t0 /* D C B A */ 2389 precrq.qb.ph s0, s2, s0 /* d c b a */ 2390 precrq.qb.ph t4, t6, t4 /* H G F E */ 2391 precrq.qb.ph s4, s6, s4 /* h g f e */ 2392 addu.qb s0, s0, s8 2393 addu.qb s4, s4, s8 2394 sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */ 2395 sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */ 2396 lw a3, -4(a1) 2397 addu.qb t0, t0, s8 2398 addu a3, a3, a2 2399 addu.qb t4, t4, s8 2400 sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */ 2401 bne a0, t9, 0b 2402 sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */ 2403 24042: 2405 2406 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3 2407 2408 j ra 2409 nop 2410 2411END(jsimd_idct_ifast_rows_dspr2) 2412 2413 2414/*****************************************************************************/ 2415LEAF_DSPR2(jsimd_fdct_islow_dspr2) 2416/* 2417 * a0 = data 2418 */ 2419 SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2420 2421 lui t0, 6437 2422 ori t0, 2260 2423 lui t1, 9633 2424 ori t1, 11363 2425 lui t2, 0xd39e 2426 ori t2, 0xe6dc 2427 lui t3, 0xf72d 2428 ori t3, 9633 2429 lui t4, 2261 2430 ori t4, 9633 2431 lui t5, 0xd39e 2432 ori t5, 6437 2433 lui t6, 9633 2434 ori t6, 0xd39d 2435 lui t7, 0xe6dc 2436 ori t7, 2260 2437 lui t8, 4433 2438 ori t8, 10703 2439 lui t9, 0xd630 2440 ori t9, 4433 2441 li s8, 8 2442 move a1, a0 24431: 2444 lw s0, 0(a1) /* tmp0 = 1|0 */ 2445 lw s1, 4(a1) /* tmp1 = 3|2 */ 2446 lw s2, 8(a1) /* tmp2 = 5|4 */ 2447 lw s3, 12(a1) /* tmp3 = 7|6 */ 2448 packrl.ph s1, s1, s1 /* tmp1 = 2|3 */ 2449 packrl.ph s3, s3, s3 /* tmp3 = 6|7 */ 2450 subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */ 2451 subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */ 2452 mult $0, $0 /* ac0 = 0 */ 2453 dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */ 2454 dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */ 2455 mult $ac1, $0, $0 /* ac1 = 0 */ 2456 dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */ 2457 dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */ 2458 mult $ac2, $0, $0 /* ac2 = 0 */ 2459 dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */ 2460 dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */ 2461 mult $ac3, $0, $0 /* ac3 = 0 */ 2462 dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */ 2463 dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */ 2464 addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */ 2465 addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */ 2466 extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ 2467 extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ 2468 extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */ 2469 extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */ 2470 addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ 2471 subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ 2472 sh s0, 2(a1) 2473 sh s1, 6(a1) 2474 sh s2, 10(a1) 2475 sh s3, 14(a1) 2476 mult $0, $0 /* ac0 = 0 */ 2477 dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */ 2478 mult $ac1, $0, $0 /* ac1 = 0 */ 2479 dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */ 2480 sra s4, s5, 16 /* tmp4 = t11 */ 2481 addiu a1, a1, 16 2482 addiu s8, s8, -1 2483 extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */ 2484 extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */ 2485 addu s2, s5, s4 /* tmp2 = t10 + t11 */ 2486 subu s3, s5, s4 /* tmp3 = t10 - t11 */ 2487 sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */ 2488 sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */ 2489 sh s2, -16(a1) 2490 sh s3, -8(a1) 2491 sh s0, -12(a1) 2492 bgtz s8, 1b 2493 sh s1, -4(a1) 2494 li t0, 2260 2495 li t1, 11363 2496 li t2, 9633 2497 li t3, 6436 2498 li t4, 6437 2499 li t5, 2261 2500 li t6, 11362 2501 li t7, 2259 2502 li t8, 4433 2503 li t9, 10703 2504 li a1, 10704 2505 li s8, 8 2506 25072: 2508 lh a2, 0(a0) /* 0 */ 2509 lh a3, 16(a0) /* 8 */ 2510 lh v0, 32(a0) /* 16 */ 2511 lh v1, 48(a0) /* 24 */ 2512 lh s4, 64(a0) /* 32 */ 2513 lh s5, 80(a0) /* 40 */ 2514 lh s6, 96(a0) /* 48 */ 2515 lh s7, 112(a0) /* 56 */ 2516 addu s2, v0, s5 /* tmp2 = 16 + 40 */ 2517 subu s5, v0, s5 /* tmp5 = 16 - 40 */ 2518 addu s3, v1, s4 /* tmp3 = 24 + 32 */ 2519 subu s4, v1, s4 /* tmp4 = 24 - 32 */ 2520 addu s0, a2, s7 /* tmp0 = 0 + 56 */ 2521 subu s7, a2, s7 /* tmp7 = 0 - 56 */ 2522 addu s1, a3, s6 /* tmp1 = 8 + 48 */ 2523 subu s6, a3, s6 /* tmp6 = 8 - 48 */ 2524 addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */ 2525 subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */ 2526 addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */ 2527 subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */ 2528 mult s7, t1 /* ac0 = tmp7 * c1 */ 2529 madd s4, t0 /* ac0 += tmp4 * c0 */ 2530 madd s5, t4 /* ac0 += tmp5 * c4 */ 2531 madd s6, t2 /* ac0 += tmp6 * c2 */ 2532 mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */ 2533 msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */ 2534 msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */ 2535 msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */ 2536 mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */ 2537 madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */ 2538 madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */ 2539 msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */ 2540 mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */ 2541 msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */ 2542 madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */ 2543 msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */ 2544 extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */ 2545 extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */ 2546 extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */ 2547 extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */ 2548 addiu s8, s8, -1 2549 addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */ 2550 subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */ 2551 sh s0, 16(a0) 2552 sh s1, 48(a0) 2553 sh s2, 80(a0) 2554 sh s3, 112(a0) 2555 mult v0, t8 /* ac0 = tmp12 * c8 */ 2556 madd v1, t9 /* ac0 += tmp13 * c9 */ 2557 mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */ 2558 msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */ 2559 addiu a0, a0, 2 2560 extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */ 2561 extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */ 2562 shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */ 2563 shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */ 2564 sh s4, -2(a0) 2565 sh s5, 62(a0) 2566 sh s6, 30(a0) 2567 bgtz s8, 2b 2568 sh s7, 94(a0) 2569 2570 RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8 2571 2572 jr ra 2573 nop 2574 2575END(jsimd_fdct_islow_dspr2) 2576 2577 2578/**************************************************************************/ 2579LEAF_DSPR2(jsimd_fdct_ifast_dspr2) 2580/* 2581 * a0 = data 2582 */ 2583 .set at 2584 2585 SAVE_REGS_ON_STACK 8, s0, s1 2586 2587 li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) | 2588 (334 & 0xffff) */ 2589 li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) | 2590 (139 & 0xffff) */ 2591 li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) | 2592 (98 & 0xffff) */ 2593 li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) | 2594 (181 & 0xffff) */ 2595 2596 move v0, a0 2597 addiu v1, v0, 128 /* end address */ 2598 25990: 2600 lw t0, 0(v0) /* tmp0 = 1|0 */ 2601 lw t1, 4(v0) /* tmp1 = 3|2 */ 2602 lw t2, 8(v0) /* tmp2 = 5|4 */ 2603 lw t3, 12(v0) /* tmp3 = 7|6 */ 2604 packrl.ph t1, t1, t1 /* tmp1 = 2|3 */ 2605 packrl.ph t3, t3, t3 /* tmp3 = 6|7 */ 2606 subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */ 2607 subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */ 2608 addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */ 2609 addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */ 2610 addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */ 2611 subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */ 2612 sra t4, t8, 16 /* tmp4 = t11 */ 2613 mult $0, $0 /* ac0 = 0 */ 2614 dpa.w.ph $ac0, t9, s1 2615 mult $ac1, $0, $0 /* ac1 = 0 */ 2616 dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */ 2617 dpsx.w.ph $ac1, t5, a3 /* ac1 += t6*98 + t7*98 */ 2618 mult $ac2, $0, $0 /* ac2 = 0 */ 2619 dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */ 2620 mult $ac3, $0, $0 /* ac3 = 0 */ 2621 dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */ 2622 precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */ 2623 addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */ 2624 subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */ 2625 extr.w t4, $ac0, 8 2626 mult $0, $0 /* ac0 = 0 */ 2627 dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */ 2628 extr.w t0, $ac1, 8 /* t0 = z5 */ 2629 extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */ 2630 extr.w t7, $ac3, 8 /* t2 = MULTIPLY(tmp12, 334) */ 2631 extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */ 2632 add t6, t1, t0 /* t6 = z2 */ 2633 add t7, t7, t0 /* t7 = z4 */ 2634 subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */ 2635 addq.ph t8, t5, t8 /* t9 = z11 = tmp7 + z3 */ 2636 addq.ph t1, t0, t6 /* t1 = z13 + z2 */ 2637 subq.ph t6, t0, t6 /* t6 = z13 - z2 */ 2638 addq.ph t0, t8, t7 /* t0 = z11 + z4 */ 2639 subq.ph t7, t8, t7 /* t7 = z11 - z4 */ 2640 addq.ph t5, t4, t9 2641 subq.ph t4, t9, t4 2642 sh t2, 0(v0) 2643 sh t5, 4(v0) 2644 sh t3, 8(v0) 2645 sh t4, 12(v0) 2646 sh t1, 10(v0) 2647 sh t6, 6(v0) 2648 sh t0, 2(v0) 2649 sh t7, 14(v0) 2650 addiu v0, 16 2651 bne v1, v0, 0b 2652 nop 2653 move v0, a0 2654 addiu v1, v0, 16 2655 26561: 2657 lh t0, 0(v0) /* 0 */ 2658 lh t1, 16(v0) /* 8 */ 2659 lh t2, 32(v0) /* 16 */ 2660 lh t3, 48(v0) /* 24 */ 2661 lh t4, 64(v0) /* 32 */ 2662 lh t5, 80(v0) /* 40 */ 2663 lh t6, 96(v0) /* 48 */ 2664 lh t7, 112(v0) /* 56 */ 2665 add t8, t0, t7 /* t8 = tmp0 */ 2666 sub t7, t0, t7 /* t7 = tmp7 */ 2667 add t0, t1, t6 /* t0 = tmp1 */ 2668 sub t1, t1, t6 /* t1 = tmp6 */ 2669 add t6, t2, t5 /* t6 = tmp2 */ 2670 sub t5, t2, t5 /* t5 = tmp5 */ 2671 add t2, t3, t4 /* t2 = tmp3 */ 2672 sub t3, t3, t4 /* t3 = tmp4 */ 2673 add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */ 2674 sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */ 2675 sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */ 2676 ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */ 2677 add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */ 2678 mult $0, $0 /* ac0 = 0 */ 2679 dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */ 2680 add s0, t4, t2 /* t8 = tmp10+tmp11 */ 2681 sub t4, t4, t2 /* t4 = tmp10-tmp11 */ 2682 sh s0, 0(v0) 2683 sh t4, 64(v0) 2684 extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13, 2685 FIX_0_707106781) */ 2686 addq.ph t4, t8, t2 /* t9 = tmp13 + z1 */ 2687 subq.ph t8, t8, t2 /* t2 = tmp13 - z1 */ 2688 sh t4, 32(v0) 2689 sh t8, 96(v0) 2690 add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */ 2691 add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */ 2692 add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */ 2693 andi t4, a1, 0xffff 2694 mul s0, t1, t4 2695 sra s0, s0, 8 /* s0 = z4 = 2696 MULTIPLY(tmp12, FIX_1_306562965) */ 2697 ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */ 2698 mult $0, $0 /* ac0 = 0 */ 2699 mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */ 2700 extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12, 2701 FIX_0_382683433) */ 2702 add t2, t7, t8 /* t2 = tmp7 + z5 */ 2703 sub t7, t7, t8 /* t7 = tmp7 - z5 */ 2704 andi t4, a2, 0xffff 2705 mul t8, t3, t4 2706 sra t8, t8, 8 /* t8 = z2 = 2707 MULTIPLY(tmp10, FIX_0_541196100) */ 2708 andi t4, s1, 0xffff 2709 mul t6, t0, t4 2710 sra t6, t6, 8 /* t6 = z3 = 2711 MULTIPLY(tmp11, FIX_0_707106781) */ 2712 add t0, t6, t8 /* t0 = z3 + z2 */ 2713 sub t1, t6, t8 /* t1 = z3 - z2 */ 2714 add t3, t6, s0 /* t3 = z3 + z4 */ 2715 sub t4, t6, s0 /* t4 = z3 - z4 */ 2716 sub t5, t2, t1 /* t5 = dataptr[5] */ 2717 sub t6, t7, t0 /* t6 = dataptr[3] */ 2718 add t3, t2, t3 /* t3 = dataptr[1] */ 2719 add t4, t7, t4 /* t4 = dataptr[7] */ 2720 sh t5, 80(v0) 2721 sh t6, 48(v0) 2722 sh t3, 16(v0) 2723 sh t4, 112(v0) 2724 addiu v0, 2 2725 bne v0, v1, 1b 2726 nop 2727 2728 RESTORE_REGS_FROM_STACK 8, s0, s1 2729 2730 j ra 2731 nop 2732END(jsimd_fdct_ifast_dspr2) 2733 2734 2735/*****************************************************************************/ 2736LEAF_DSPR2(jsimd_quantize_dspr2) 2737/* 2738 * a0 = coef_block 2739 * a1 = divisors 2740 * a2 = workspace 2741 */ 2742 .set at 2743 2744 SAVE_REGS_ON_STACK 16, s0, s1, s2 2745 2746 addiu v0, a2, 124 /* v0 = workspace_end */ 2747 lh t0, 0(a2) 2748 lh t1, 0(a1) 2749 lh t2, 128(a1) 2750 sra t3, t0, 15 2751 sll t3, t3, 1 2752 addiu t3, t3, 1 2753 mul t0, t0, t3 2754 lh t4, 384(a1) 2755 lh t5, 130(a1) 2756 lh t6, 2(a2) 2757 lh t7, 2(a1) 2758 lh t8, 386(a1) 2759 27601: 2761 andi t1, 0xffff 2762 add t9, t0, t2 2763 andi t9, 0xffff 2764 mul v1, t9, t1 2765 sra s0, t6, 15 2766 sll s0, s0, 1 2767 addiu s0, s0, 1 2768 addiu t9, t4, 16 2769 srav v1, v1, t9 2770 mul v1, v1, t3 2771 mul t6, t6, s0 2772 andi t7, 0xffff 2773 addiu a2, a2, 4 2774 addiu a1, a1, 4 2775 add s1, t6, t5 2776 andi s1, 0xffff 2777 sh v1, 0(a0) 2778 2779 mul s2, s1, t7 2780 addiu s1, t8, 16 2781 srav s2, s2, s1 2782 mul s2, s2, s0 2783 lh t0, 0(a2) 2784 lh t1, 0(a1) 2785 sra t3, t0, 15 2786 sll t3, t3, 1 2787 addiu t3, t3, 1 2788 mul t0, t0, t3 2789 lh t2, 128(a1) 2790 lh t4, 384(a1) 2791 lh t5, 130(a1) 2792 lh t8, 386(a1) 2793 lh t6, 2(a2) 2794 lh t7, 2(a1) 2795 sh s2, 2(a0) 2796 lh t0, 0(a2) 2797 sra t3, t0, 15 2798 sll t3, t3, 1 2799 addiu t3, t3, 1 2800 mul t0, t0, t3 2801 bne a2, v0, 1b 2802 addiu a0, a0, 4 2803 2804 andi t1, 0xffff 2805 add t9, t0, t2 2806 andi t9, 0xffff 2807 mul v1, t9, t1 2808 sra s0, t6, 15 2809 sll s0, s0, 1 2810 addiu s0, s0, 1 2811 addiu t9, t4, 16 2812 srav v1, v1, t9 2813 mul v1, v1, t3 2814 mul t6, t6, s0 2815 andi t7, 0xffff 2816 sh v1, 0(a0) 2817 add s1, t6, t5 2818 andi s1, 0xffff 2819 mul s2, s1, t7 2820 addiu s1, t8, 16 2821 addiu a2, a2, 4 2822 addiu a1, a1, 4 2823 srav s2, s2, s1 2824 mul s2, s2, s0 2825 sh s2, 2(a0) 2826 2827 RESTORE_REGS_FROM_STACK 16, s0, s1, s2 2828 2829 j ra 2830 nop 2831 2832END(jsimd_quantize_dspr2) 2833 2834 2835#ifndef __mips_soft_float 2836 2837/*****************************************************************************/ 2838LEAF_DSPR2(jsimd_quantize_float_dspr2) 2839/* 2840 * a0 = coef_block 2841 * a1 = divisors 2842 * a2 = workspace 2843 */ 2844 .set at 2845 2846 li t1, 0x46800100 /* integer representation 16384.5 */ 2847 mtc1 t1, f0 2848 li t0, 63 28490: 2850 lwc1 f2, 0(a2) 2851 lwc1 f10, 0(a1) 2852 lwc1 f4, 4(a2) 2853 lwc1 f12, 4(a1) 2854 lwc1 f6, 8(a2) 2855 lwc1 f14, 8(a1) 2856 lwc1 f8, 12(a2) 2857 lwc1 f16, 12(a1) 2858 madd.s f2, f0, f2, f10 2859 madd.s f4, f0, f4, f12 2860 madd.s f6, f0, f6, f14 2861 madd.s f8, f0, f8, f16 2862 lwc1 f10, 16(a1) 2863 lwc1 f12, 20(a1) 2864 trunc.w.s f2, f2 2865 trunc.w.s f4, f4 2866 trunc.w.s f6, f6 2867 trunc.w.s f8, f8 2868 lwc1 f14, 24(a1) 2869 lwc1 f16, 28(a1) 2870 mfc1 t1, f2 2871 mfc1 t2, f4 2872 mfc1 t3, f6 2873 mfc1 t4, f8 2874 lwc1 f2, 16(a2) 2875 lwc1 f4, 20(a2) 2876 lwc1 f6, 24(a2) 2877 lwc1 f8, 28(a2) 2878 madd.s f2, f0, f2, f10 2879 madd.s f4, f0, f4, f12 2880 madd.s f6, f0, f6, f14 2881 madd.s f8, f0, f8, f16 2882 addiu t1, t1, -16384 2883 addiu t2, t2, -16384 2884 addiu t3, t3, -16384 2885 addiu t4, t4, -16384 2886 trunc.w.s f2, f2 2887 trunc.w.s f4, f4 2888 trunc.w.s f6, f6 2889 trunc.w.s f8, f8 2890 sh t1, 0(a0) 2891 sh t2, 2(a0) 2892 sh t3, 4(a0) 2893 sh t4, 6(a0) 2894 mfc1 t1, f2 2895 mfc1 t2, f4 2896 mfc1 t3, f6 2897 mfc1 t4, f8 2898 addiu t0, t0, -8 2899 addiu a2, a2, 32 2900 addiu a1, a1, 32 2901 addiu t1, t1, -16384 2902 addiu t2, t2, -16384 2903 addiu t3, t3, -16384 2904 addiu t4, t4, -16384 2905 sh t1, 8(a0) 2906 sh t2, 10(a0) 2907 sh t3, 12(a0) 2908 sh t4, 14(a0) 2909 bgez t0, 0b 2910 addiu a0, a0, 16 2911 2912 j ra 2913 nop 2914 2915END(jsimd_quantize_float_dspr2) 2916 2917#endif 2918 2919 2920/*****************************************************************************/ 2921LEAF_DSPR2(jsimd_idct_2x2_dspr2) 2922/* 2923 * a0 = compptr->dct_table 2924 * a1 = coef_block 2925 * a2 = output_buf 2926 * a3 = output_col 2927 */ 2928 .set at 2929 2930 SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5 2931 2932 addiu sp, sp, -40 2933 move v0, sp 2934 addiu s2, zero, 29692 2935 addiu s3, zero, -10426 2936 addiu s4, zero, 6967 2937 addiu s5, zero, -5906 2938 lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */ 2939 lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */ 2940 lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */ 2941 lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */ 2942 mul t4, t5, t0 2943 lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */ 2944 lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */ 2945 mul t6, t6, t1 2946 mul t5, t5, t0 2947 lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */ 2948 lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */ 2949 lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */ 2950 lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */ 2951 mul t7, t7, t2 2952 mult zero, zero 2953 mul t8, t8, t3 2954 li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */ 2955 li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */ 2956 ins t6, t5, 16, 16 /* t6 = t5|t6 */ 2957 sll t4, t4, 15 2958 dpa.w.ph $ac0, t6, s0 2959 lh t1, 2(a1) 2960 lh t6, 2(a0) 2961 ins t8, t7, 16, 16 /* t8 = t7|t8 */ 2962 dpa.w.ph $ac0, t8, s1 2963 mflo t0, $ac0 2964 mul t5, t6, t1 2965 lh t1, 18(a1) 2966 lh t6, 18(a0) 2967 lh t2, 50(a1) 2968 lh t7, 50(a0) 2969 mul t6, t6, t1 2970 subu t8, t4, t0 2971 mul t7, t7, t2 2972 addu t0, t4, t0 2973 shra_r.w t0, t0, 13 2974 lh t1, 82(a1) 2975 lh t2, 82(a0) 2976 lh t3, 114(a1) 2977 lh t4, 114(a0) 2978 shra_r.w t8, t8, 13 2979 mul t1, t1, t2 2980 mul t3, t3, t4 2981 sw t0, 0(v0) 2982 sw t8, 20(v0) 2983 sll t4, t5, 15 2984 ins t7, t6, 16, 16 2985 mult zero, zero 2986 dpa.w.ph $ac0, t7, s0 2987 ins t3, t1, 16, 16 2988 lh t1, 6(a1) 2989 lh t6, 6(a0) 2990 dpa.w.ph $ac0, t3, s1 2991 mflo t0, $ac0 2992 mul t5, t6, t1 2993 lh t1, 22(a1) 2994 lh t6, 22(a0) 2995 lh t2, 54(a1) 2996 lh t7, 54(a0) 2997 mul t6, t6, t1 2998 subu t8, t4, t0 2999 mul t7, t7, t2 3000 addu t0, t4, t0 3001 shra_r.w t0, t0, 13 3002 lh t1, 86(a1) 3003 lh t2, 86(a0) 3004 lh t3, 118(a1) 3005 lh t4, 118(a0) 3006 shra_r.w t8, t8, 13 3007 mul t1, t1, t2 3008 mul t3, t3, t4 3009 sw t0, 4(v0) 3010 sw t8, 24(v0) 3011 sll t4, t5, 15 3012 ins t7, t6, 16, 16 3013 mult zero, zero 3014 dpa.w.ph $ac0, t7, s0 3015 ins t3, t1, 16, 16 3016 lh t1, 10(a1) 3017 lh t6, 10(a0) 3018 dpa.w.ph $ac0, t3, s1 3019 mflo t0, $ac0 3020 mul t5, t6, t1 3021 lh t1, 26(a1) 3022 lh t6, 26(a0) 3023 lh t2, 58(a1) 3024 lh t7, 58(a0) 3025 mul t6, t6, t1 3026 subu t8, t4, t0 3027 mul t7, t7, t2 3028 addu t0, t4, t0 3029 shra_r.w t0, t0, 13 3030 lh t1, 90(a1) 3031 lh t2, 90(a0) 3032 lh t3, 122(a1) 3033 lh t4, 122(a0) 3034 shra_r.w t8, t8, 13 3035 mul t1, t1, t2 3036 mul t3, t3, t4 3037 sw t0, 8(v0) 3038 sw t8, 28(v0) 3039 sll t4, t5, 15 3040 ins t7, t6, 16, 16 3041 mult zero, zero 3042 dpa.w.ph $ac0, t7, s0 3043 ins t3, t1, 16, 16 3044 lh t1, 14(a1) 3045 lh t6, 14(a0) 3046 dpa.w.ph $ac0, t3, s1 3047 mflo t0, $ac0 3048 mul t5, t6, t1 3049 lh t1, 30(a1) 3050 lh t6, 30(a0) 3051 lh t2, 62(a1) 3052 lh t7, 62(a0) 3053 mul t6, t6, t1 3054 subu t8, t4, t0 3055 mul t7, t7, t2 3056 addu t0, t4, t0 3057 shra_r.w t0, t0, 13 3058 lh t1, 94(a1) 3059 lh t2, 94(a0) 3060 lh t3, 126(a1) 3061 lh t4, 126(a0) 3062 shra_r.w t8, t8, 13 3063 mul t1, t1, t2 3064 mul t3, t3, t4 3065 sw t0, 12(v0) 3066 sw t8, 32(v0) 3067 sll t4, t5, 15 3068 ins t7, t6, 16, 16 3069 mult zero, zero 3070 dpa.w.ph $ac0, t7, s0 3071 ins t3, t1, 16, 16 3072 dpa.w.ph $ac0, t3, s1 3073 mflo t0, $ac0 3074 lw t9, 0(a2) 3075 lw t3, 0(v0) 3076 lw t7, 4(v0) 3077 lw t1, 8(v0) 3078 addu t9, t9, a3 3079 sll t3, t3, 15 3080 subu t8, t4, t0 3081 addu t0, t4, t0 3082 shra_r.w t0, t0, 13 3083 shra_r.w t8, t8, 13 3084 sw t0, 16(v0) 3085 sw t8, 36(v0) 3086 lw t5, 12(v0) 3087 lw t6, 16(v0) 3088 mult t7, s2 3089 madd t1, s3 3090 madd t5, s4 3091 madd t6, s5 3092 lw t5, 24(v0) 3093 lw t7, 28(v0) 3094 mflo t0, $ac0 3095 lw t8, 32(v0) 3096 lw t2, 36(v0) 3097 mult $ac1, t5, s2 3098 madd $ac1, t7, s3 3099 madd $ac1, t8, s4 3100 madd $ac1, t2, s5 3101 addu t1, t3, t0 3102 subu t6, t3, t0 3103 shra_r.w t1, t1, 20 3104 shra_r.w t6, t6, 20 3105 mflo t4, $ac1 3106 shll_s.w t1, t1, 24 3107 shll_s.w t6, t6, 24 3108 sra t1, t1, 24 3109 sra t6, t6, 24 3110 addiu t1, t1, 128 3111 addiu t6, t6, 128 3112 lw t0, 20(v0) 3113 sb t1, 0(t9) 3114 sb t6, 1(t9) 3115 sll t0, t0, 15 3116 lw t9, 4(a2) 3117 addu t1, t0, t4 3118 subu t6, t0, t4 3119 addu t9, t9, a3 3120 shra_r.w t1, t1, 20 3121 shra_r.w t6, t6, 20 3122 shll_s.w t1, t1, 24 3123 shll_s.w t6, t6, 24 3124 sra t1, t1, 24 3125 sra t6, t6, 24 3126 addiu t1, t1, 128 3127 addiu t6, t6, 128 3128 sb t1, 0(t9) 3129 sb t6, 1(t9) 3130 addiu sp, sp, 40 3131 3132 RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5 3133 3134 j ra 3135 nop 3136 3137END(jsimd_idct_2x2_dspr2) 3138 3139 3140/*****************************************************************************/ 3141LEAF_DSPR2(jsimd_idct_4x4_dspr2) 3142/* 3143 * a0 = compptr->dct_table 3144 * a1 = coef_block 3145 * a2 = output_buf 3146 * a3 = output_col 3147 * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes) 3148 */ 3149 .set at 3150 3151 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3152 3153 lw v1, 48(sp) 3154 move t0, a1 3155 move t1, v1 3156 li t9, 4 3157 li s0, 0x2e75f93e 3158 li s1, 0x21f9ba79 3159 li s2, 0xecc2efb0 3160 li s3, 0x52031ccd 3161 31620: 3163 lh s6, 32(t0) /* inptr[DCTSIZE*2] */ 3164 lh t6, 32(a0) /* quantptr[DCTSIZE*2] */ 3165 lh s7, 96(t0) /* inptr[DCTSIZE*6] */ 3166 lh t7, 96(a0) /* quantptr[DCTSIZE*6] */ 3167 mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * 3168 quantptr[DCTSIZE*2]) */ 3169 lh s4, 0(t0) /* inptr[DCTSIZE*0] */ 3170 mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * 3171 quantptr[DCTSIZE*6]) */ 3172 lh s5, 0(a0) /* quantptr[0] */ 3173 li s6, 15137 3174 li s7, 6270 3175 mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ 3176 mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * 3177 quantptr[DCTSIZE*2]) */ 3178 lh t5, 112(t0) /* inptr[DCTSIZE*7] */ 3179 mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * 3180 quantptr[DCTSIZE*6]) */ 3181 lh s4, 112(a0) /* quantptr[DCTSIZE*7] */ 3182 lh v0, 80(t0) /* inptr[DCTSIZE*5] */ 3183 lh s5, 80(a0) /* quantptr[DCTSIZE*5] */ 3184 lh s6, 48(a0) /* quantptr[DCTSIZE*3] */ 3185 sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ 3186 lh s7, 16(a0) /* quantptr[DCTSIZE*1] */ 3187 lh t8, 16(t0) /* inptr[DCTSIZE*1] */ 3188 subu t6, t6, t7 /* tmp2 = 3189 MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ 3190 lh t7, 48(t0) /* inptr[DCTSIZE*3] */ 3191 mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * 3192 quantptr[DCTSIZE*7]) */ 3193 mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] * 3194 quantptr[DCTSIZE*5]) */ 3195 mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * 3196 quantptr[DCTSIZE*3]) */ 3197 mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * 3198 quantptr[DCTSIZE*1]) */ 3199 addu t3, t2, t6 /* tmp10 = tmp0 + z2 */ 3200 subu t4, t2, t6 /* tmp10 = tmp0 - z2 */ 3201 mult $ac0, zero, zero 3202 mult $ac1, zero, zero 3203 ins t5, v0, 16, 16 3204 ins t7, t8, 16, 16 3205 addiu t9, t9, -1 3206 dpa.w.ph $ac0, t5, s0 3207 dpa.w.ph $ac0, t7, s1 3208 dpa.w.ph $ac1, t5, s2 3209 dpa.w.ph $ac1, t7, s3 3210 mflo s4, $ac0 3211 mflo s5, $ac1 3212 addiu a0, a0, 2 3213 addiu t1, t1, 4 3214 addiu t0, t0, 2 3215 addu t6, t4, s4 3216 subu t5, t4, s4 3217 addu s6, t3, s5 3218 subu s7, t3, s5 3219 shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */ 3220 shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */ 3221 shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ 3222 shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ 3223 sw t6, 28(t1) 3224 sw t5, 60(t1) 3225 sw s6, -4(t1) 3226 bgtz t9, 0b 3227 sw s7, 92(t1) 3228 /* second loop three pass */ 3229 li t9, 3 32301: 3231 lh s6, 34(t0) /* inptr[DCTSIZE*2] */ 3232 lh t6, 34(a0) /* quantptr[DCTSIZE*2] */ 3233 lh s7, 98(t0) /* inptr[DCTSIZE*6] */ 3234 lh t7, 98(a0) /* quantptr[DCTSIZE*6] */ 3235 mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] * 3236 quantptr[DCTSIZE*2]) */ 3237 lh s4, 2(t0) /* inptr[DCTSIZE*0] */ 3238 mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * 3239 quantptr[DCTSIZE*6]) */ 3240 lh s5, 2(a0) /* quantptr[DCTSIZE*0] */ 3241 li s6, 15137 3242 li s7, 6270 3243 mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */ 3244 mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] * 3245 quantptr[DCTSIZE*2]) */ 3246 lh t5, 114(t0) /* inptr[DCTSIZE*7] */ 3247 mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] * 3248 quantptr[DCTSIZE*6]) */ 3249 lh s4, 114(a0) /* quantptr[DCTSIZE*7] */ 3250 lh s5, 82(a0) /* quantptr[DCTSIZE*5] */ 3251 lh t6, 82(t0) /* inptr[DCTSIZE*5] */ 3252 sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */ 3253 lh s6, 50(a0) /* quantptr[DCTSIZE*3] */ 3254 lh t8, 18(t0) /* inptr[DCTSIZE*1] */ 3255 subu v0, v0, t7 /* tmp2 = 3256 MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */ 3257 lh t7, 50(t0) /* inptr[DCTSIZE*3] */ 3258 lh s7, 18(a0) /* quantptr[DCTSIZE*1] */ 3259 mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] * 3260 quantptr[DCTSIZE*7]) */ 3261 mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] * 3262 quantptr[DCTSIZE*5]) */ 3263 mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] * 3264 quantptr[DCTSIZE*3]) */ 3265 mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] * 3266 quantptr[DCTSIZE*1]) */ 3267 addu t3, t2, v0 /* tmp10 = tmp0 + z2 */ 3268 subu t4, t2, v0 /* tmp10 = tmp0 - z2 */ 3269 mult $ac0, zero, zero 3270 mult $ac1, zero, zero 3271 ins t5, t6, 16, 16 3272 ins t7, t8, 16, 16 3273 dpa.w.ph $ac0, t5, s0 3274 dpa.w.ph $ac0, t7, s1 3275 dpa.w.ph $ac1, t5, s2 3276 dpa.w.ph $ac1, t7, s3 3277 mflo t5, $ac0 3278 mflo t6, $ac1 3279 addiu t9, t9, -1 3280 addiu t0, t0, 2 3281 addiu a0, a0, 2 3282 addiu t1, t1, 4 3283 addu s5, t4, t5 3284 subu s4, t4, t5 3285 addu s6, t3, t6 3286 subu s7, t3, t6 3287 shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */ 3288 shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */ 3289 shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */ 3290 shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */ 3291 sw s5, 32(t1) 3292 sw s4, 64(t1) 3293 sw s6, 0(t1) 3294 bgtz t9, 1b 3295 sw s7, 96(t1) 3296 move t1, v1 3297 li s4, 15137 3298 lw s6, 8(t1) /* wsptr[2] */ 3299 li s5, 6270 3300 lw s7, 24(t1) /* wsptr[6] */ 3301 mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], 3302 FIX_1_847759065) */ 3303 lw t2, 0(t1) /* wsptr[0] */ 3304 mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], 3305 -FIX_0_765366865) */ 3306 lh t5, 28(t1) /* wsptr[7] */ 3307 lh t6, 20(t1) /* wsptr[5] */ 3308 lh t7, 12(t1) /* wsptr[3] */ 3309 lh t8, 4(t1) /* wsptr[1] */ 3310 ins t5, t6, 16, 16 3311 ins t7, t8, 16, 16 3312 mult $ac0, zero, zero 3313 dpa.w.ph $ac0, t5, s0 3314 dpa.w.ph $ac0, t7, s1 3315 mult $ac1, zero, zero 3316 dpa.w.ph $ac1, t5, s2 3317 dpa.w.ph $ac1, t7, s3 3318 sll t2, t2, 14 /* tmp0 = 3319 ((JLONG)wsptr[0]) << (CONST_BITS+1) */ 3320 mflo s6, $ac0 3321 /* MULTIPLY(wsptr[2], FIX_1_847759065) + 3322 MULTIPLY(wsptr[6], -FIX_0_765366865) */ 3323 subu s4, s4, s5 3324 addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ 3325 mflo s7, $ac1 3326 subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ 3327 addu t7, t4, s6 3328 subu t8, t4, s6 3329 addu t5, t3, s7 3330 subu t6, t3, s7 3331 shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ 3332 shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ 3333 shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ 3334 shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ 3335 sll s4, t9, 2 3336 lw v0, 0(a2) /* output_buf[ctr] */ 3337 shll_s.w t5, t5, 24 3338 shll_s.w t6, t6, 24 3339 shll_s.w t7, t7, 24 3340 shll_s.w t8, t8, 24 3341 sra t5, t5, 24 3342 sra t6, t6, 24 3343 sra t7, t7, 24 3344 sra t8, t8, 24 3345 addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ 3346 addiu t5, t5, 128 3347 addiu t6, t6, 128 3348 addiu t7, t7, 128 3349 addiu t8, t8, 128 3350 sb t5, 0(v0) 3351 sb t7, 1(v0) 3352 sb t8, 2(v0) 3353 sb t6, 3(v0) 3354 /* 2 */ 3355 li s4, 15137 3356 lw s6, 40(t1) /* wsptr[2] */ 3357 li s5, 6270 3358 lw s7, 56(t1) /* wsptr[6] */ 3359 mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], 3360 FIX_1_847759065) */ 3361 lw t2, 32(t1) /* wsptr[0] */ 3362 mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], 3363 -FIX_0_765366865) */ 3364 lh t5, 60(t1) /* wsptr[7] */ 3365 lh t6, 52(t1) /* wsptr[5] */ 3366 lh t7, 44(t1) /* wsptr[3] */ 3367 lh t8, 36(t1) /* wsptr[1] */ 3368 ins t5, t6, 16, 16 3369 ins t7, t8, 16, 16 3370 mult $ac0, zero, zero 3371 dpa.w.ph $ac0, t5, s0 3372 dpa.w.ph $ac0, t7, s1 3373 mult $ac1, zero, zero 3374 dpa.w.ph $ac1, t5, s2 3375 dpa.w.ph $ac1, t7, s3 3376 sll t2, t2, 14 /* tmp0 = 3377 ((JLONG)wsptr[0]) << (CONST_BITS+1) */ 3378 mflo s6, $ac0 3379 /* MULTIPLY(wsptr[2], FIX_1_847759065) + 3380 MULTIPLY(wsptr[6], -FIX_0_765366865) */ 3381 subu s4, s4, s5 3382 addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ 3383 mflo s7, $ac1 3384 subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ 3385 addu t7, t4, s6 3386 subu t8, t4, s6 3387 addu t5, t3, s7 3388 subu t6, t3, s7 3389 shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 3390 CONST_BITS-PASS1_BITS+1) */ 3391 shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 3392 CONST_BITS-PASS1_BITS+1) */ 3393 shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 3394 CONST_BITS-PASS1_BITS+1) */ 3395 shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 3396 CONST_BITS-PASS1_BITS+1) */ 3397 sll s4, t9, 2 3398 lw v0, 4(a2) /* output_buf[ctr] */ 3399 shll_s.w t5, t5, 24 3400 shll_s.w t6, t6, 24 3401 shll_s.w t7, t7, 24 3402 shll_s.w t8, t8, 24 3403 sra t5, t5, 24 3404 sra t6, t6, 24 3405 sra t7, t7, 24 3406 sra t8, t8, 24 3407 addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ 3408 addiu t5, t5, 128 3409 addiu t6, t6, 128 3410 addiu t7, t7, 128 3411 addiu t8, t8, 128 3412 sb t5, 0(v0) 3413 sb t7, 1(v0) 3414 sb t8, 2(v0) 3415 sb t6, 3(v0) 3416 /* 3 */ 3417 li s4, 15137 3418 lw s6, 72(t1) /* wsptr[2] */ 3419 li s5, 6270 3420 lw s7, 88(t1) /* wsptr[6] */ 3421 mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], 3422 FIX_1_847759065) */ 3423 lw t2, 64(t1) /* wsptr[0] */ 3424 mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], 3425 -FIX_0_765366865) */ 3426 lh t5, 92(t1) /* wsptr[7] */ 3427 lh t6, 84(t1) /* wsptr[5] */ 3428 lh t7, 76(t1) /* wsptr[3] */ 3429 lh t8, 68(t1) /* wsptr[1] */ 3430 ins t5, t6, 16, 16 3431 ins t7, t8, 16, 16 3432 mult $ac0, zero, zero 3433 dpa.w.ph $ac0, t5, s0 3434 dpa.w.ph $ac0, t7, s1 3435 mult $ac1, zero, zero 3436 dpa.w.ph $ac1, t5, s2 3437 dpa.w.ph $ac1, t7, s3 3438 sll t2, t2, 14 /* tmp0 = 3439 ((JLONG)wsptr[0]) << (CONST_BITS+1) */ 3440 mflo s6, $ac0 3441 /* MULTIPLY(wsptr[2], FIX_1_847759065) + 3442 MULTIPLY(wsptr[6], -FIX_0_765366865) */ 3443 subu s4, s4, s5 3444 addu t3, t2, s4 /* tmp10 = tmp0 + z2 */ 3445 mflo s7, $ac1 3446 subu t4, t2, s4 /* tmp10 = tmp0 - z2 */ 3447 addu t7, t4, s6 3448 subu t8, t4, s6 3449 addu t5, t3, s7 3450 subu t6, t3, s7 3451 shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ 3452 shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ 3453 shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ 3454 shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ 3455 sll s4, t9, 2 3456 lw v0, 8(a2) /* output_buf[ctr] */ 3457 shll_s.w t5, t5, 24 3458 shll_s.w t6, t6, 24 3459 shll_s.w t7, t7, 24 3460 shll_s.w t8, t8, 24 3461 sra t5, t5, 24 3462 sra t6, t6, 24 3463 sra t7, t7, 24 3464 sra t8, t8, 24 3465 addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ 3466 addiu t5, t5, 128 3467 addiu t6, t6, 128 3468 addiu t7, t7, 128 3469 addiu t8, t8, 128 3470 sb t5, 0(v0) 3471 sb t7, 1(v0) 3472 sb t8, 2(v0) 3473 sb t6, 3(v0) 3474 li s4, 15137 3475 lw s6, 104(t1) /* wsptr[2] */ 3476 li s5, 6270 3477 lw s7, 120(t1) /* wsptr[6] */ 3478 mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2], 3479 FIX_1_847759065) */ 3480 lw t2, 96(t1) /* wsptr[0] */ 3481 mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6], 3482 -FIX_0_765366865) */ 3483 lh t5, 124(t1) /* wsptr[7] */ 3484 lh t6, 116(t1) /* wsptr[5] */ 3485 lh t7, 108(t1) /* wsptr[3] */ 3486 lh t8, 100(t1) /* wsptr[1] */ 3487 ins t5, t6, 16, 16 3488 ins t7, t8, 16, 16 3489 mult $ac0, zero, zero 3490 dpa.w.ph $ac0, t5, s0 3491 dpa.w.ph $ac0, t7, s1 3492 mult $ac1, zero, zero 3493 dpa.w.ph $ac1, t5, s2 3494 dpa.w.ph $ac1, t7, s3 3495 sll t2, t2, 14 /* tmp0 = 3496 ((JLONG)wsptr[0]) << (CONST_BITS+1) */ 3497 mflo s6, $ac0 3498 /* MULTIPLY(wsptr[2], FIX_1_847759065) + 3499 MULTIPLY(wsptr[6], -FIX_0_765366865) */ 3500 subu s4, s4, s5 3501 addu t3, t2, s4 /* tmp10 = tmp0 + z2; */ 3502 mflo s7, $ac1 3503 subu t4, t2, s4 /* tmp10 = tmp0 - z2; */ 3504 addu t7, t4, s6 3505 subu t8, t4, s6 3506 addu t5, t3, s7 3507 subu t6, t3, s7 3508 shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */ 3509 shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */ 3510 shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */ 3511 shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */ 3512 sll s4, t9, 2 3513 lw v0, 12(a2) /* output_buf[ctr] */ 3514 shll_s.w t5, t5, 24 3515 shll_s.w t6, t6, 24 3516 shll_s.w t7, t7, 24 3517 shll_s.w t8, t8, 24 3518 sra t5, t5, 24 3519 sra t6, t6, 24 3520 sra t7, t7, 24 3521 sra t8, t8, 24 3522 addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */ 3523 addiu t5, t5, 128 3524 addiu t6, t6, 128 3525 addiu t7, t7, 128 3526 addiu t8, t8, 128 3527 sb t5, 0(v0) 3528 sb t7, 1(v0) 3529 sb t8, 2(v0) 3530 sb t6, 3(v0) 3531 3532 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3533 3534 j ra 3535 nop 3536END(jsimd_idct_4x4_dspr2) 3537 3538 3539/*****************************************************************************/ 3540LEAF_DSPR2(jsimd_idct_6x6_dspr2) 3541/* 3542 * a0 = compptr->dct_table 3543 * a1 = coef_block 3544 * a2 = output_buf 3545 * a3 = output_col 3546 */ 3547 .set at 3548 3549 SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3550 3551 addiu sp, sp, -144 3552 move v0, sp 3553 addiu v1, v0, 24 3554 addiu t9, zero, 5793 3555 addiu s0, zero, 10033 3556 addiu s1, zero, 2998 3557 35581: 3559 lh s2, 0(a0) /* q0 = quantptr[ 0] */ 3560 lh s3, 32(a0) /* q1 = quantptr[16] */ 3561 lh s4, 64(a0) /* q2 = quantptr[32] */ 3562 lh t2, 64(a1) /* tmp2 = inptr[32] */ 3563 lh t1, 32(a1) /* tmp1 = inptr[16] */ 3564 lh t0, 0(a1) /* tmp0 = inptr[ 0] */ 3565 mul t2, t2, s4 /* tmp2 = tmp2 * q2 */ 3566 mul t1, t1, s3 /* tmp1 = tmp1 * q1 */ 3567 mul t0, t0, s2 /* tmp0 = tmp0 * q0 */ 3568 lh t6, 16(a1) /* z1 = inptr[ 8] */ 3569 lh t8, 80(a1) /* z3 = inptr[40] */ 3570 lh t7, 48(a1) /* z2 = inptr[24] */ 3571 lh s2, 16(a0) /* q0 = quantptr[ 8] */ 3572 lh s4, 80(a0) /* q2 = quantptr[40] */ 3573 lh s3, 48(a0) /* q1 = quantptr[24] */ 3574 mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */ 3575 mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */ 3576 sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ 3577 mul t6, t6, s2 /* z1 = z1 * q0 */ 3578 mul t8, t8, s4 /* z3 = z3 * q2 */ 3579 mul t7, t7, s3 /* z2 = z2 * q1 */ 3580 addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */ 3581 sll t2, t2, 1 /* tmp2 = tmp2 << 2 */ 3582 subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */ 3583 subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */ 3584 addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */ 3585 addu t1, t6, t8 /* tmp1 = z1 + z3 */ 3586 mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */ 3587 shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ 3588 subu t2, t6, t8 /* tmp2 = z1 - z3 */ 3589 subu t2, t2, t7 /* tmp2 = tmp2 - z2 */ 3590 sll t2, t2, 2 /* tmp2 = tmp2 << 2 */ 3591 addu t0, t6, t7 /* tmp0 = z1 + z2 */ 3592 sll t0, t0, 13 /* tmp0 = tmp0 << 13 */ 3593 subu s2, t8, t7 /* q0 = z3 - z2 */ 3594 sll s2, s2, 13 /* q0 = q0 << 13 */ 3595 addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */ 3596 addu t1, s2, t1 /* tmp1 = q0 + tmp1 */ 3597 addu s2, t4, t2 /* q0 = tmp11 + tmp2 */ 3598 subu s3, t4, t2 /* q1 = tmp11 - tmp2 */ 3599 addu t6, t3, t0 /* z1 = tmp10 + tmp0 */ 3600 subu t7, t3, t0 /* z2 = tmp10 - tmp0 */ 3601 addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */ 3602 subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */ 3603 shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */ 3604 shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */ 3605 shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */ 3606 shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */ 3607 sw s2, 24(v0) 3608 sw s3, 96(v0) 3609 sw t6, 0(v0) 3610 sw t7, 120(v0) 3611 sw t4, 48(v0) 3612 sw t5, 72(v0) 3613 addiu v0, v0, 4 3614 addiu a1, a1, 2 3615 bne v0, v1, 1b 3616 addiu a0, a0, 2 3617 3618 /* Pass 2: process 6 rows from work array, store into output array. */ 3619 move v0, sp 3620 addiu v1, v0, 144 3621 36222: 3623 lw t0, 0(v0) 3624 lw t2, 16(v0) 3625 lw s5, 0(a2) 3626 addiu t0, t0, 16 3627 sll t0, t0, 13 3628 mul t3, t2, t9 3629 lw t6, 4(v0) 3630 lw t8, 20(v0) 3631 lw t7, 12(v0) 3632 addu s5, s5, a3 3633 addu s6, t6, t8 3634 mul s6, s6, s1 3635 addu t1, t0, t3 3636 subu t4, t0, t3 3637 subu t4, t4, t3 3638 lw t3, 8(v0) 3639 mul t0, t3, s0 3640 addu s7, t6, t7 3641 sll s7, s7, 13 3642 addu s7, s6, s7 3643 subu t2, t8, t7 3644 sll t2, t2, 13 3645 addu t2, s6, t2 3646 subu s6, t6, t7 3647 subu s6, s6, t8 3648 sll s6, s6, 13 3649 addu t3, t1, t0 3650 subu t5, t1, t0 3651 addu t6, t3, s7 3652 subu t3, t3, s7 3653 addu t7, t4, s6 3654 subu t4, t4, s6 3655 addu t8, t5, t2 3656 subu t5, t5, t2 3657 shll_s.w t6, t6, 6 3658 shll_s.w t3, t3, 6 3659 shll_s.w t7, t7, 6 3660 shll_s.w t4, t4, 6 3661 shll_s.w t8, t8, 6 3662 shll_s.w t5, t5, 6 3663 sra t6, t6, 24 3664 addiu t6, t6, 128 3665 sra t3, t3, 24 3666 addiu t3, t3, 128 3667 sb t6, 0(s5) 3668 sra t7, t7, 24 3669 addiu t7, t7, 128 3670 sb t3, 5(s5) 3671 sra t4, t4, 24 3672 addiu t4, t4, 128 3673 sb t7, 1(s5) 3674 sra t8, t8, 24 3675 addiu t8, t8, 128 3676 sb t4, 4(s5) 3677 addiu v0, v0, 24 3678 sra t5, t5, 24 3679 addiu t5, t5, 128 3680 sb t8, 2(s5) 3681 addiu a2, a2, 4 3682 bne v0, v1, 2b 3683 sb t5, 3(s5) 3684 3685 addiu sp, sp, 144 3686 3687 RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 3688 3689 j ra 3690 nop 3691 3692END(jsimd_idct_6x6_dspr2) 3693 3694 3695/*****************************************************************************/ 3696LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2) 3697/* 3698 * a0 = compptr->dct_table 3699 * a1 = coef_block 3700 * a2 = workspace 3701 */ 3702 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3703 3704 li a3, 8 3705 37061: 3707 /* odd part */ 3708 lh t0, 48(a1) 3709 lh t1, 48(a0) 3710 lh t2, 16(a1) 3711 lh t3, 16(a0) 3712 lh t4, 80(a1) 3713 lh t5, 80(a0) 3714 lh t6, 112(a1) 3715 lh t7, 112(a0) 3716 mul t0, t0, t1 /* z2 */ 3717 mul t1, t2, t3 /* z1 */ 3718 mul t2, t4, t5 /* z3 */ 3719 mul t3, t6, t7 /* z4 */ 3720 li t4, 10703 /* FIX(1.306562965) */ 3721 li t5, 4433 /* FIX_0_541196100 */ 3722 li t6, 7053 /* FIX(0.860918669) */ 3723 mul t4, t0, t4 /* tmp11 */ 3724 mul t5, t0, t5 /* -tmp14 */ 3725 addu t7, t1, t2 /* tmp10 */ 3726 addu t8, t7, t3 /* tmp10 + z4 */ 3727 mul t6, t6, t8 /* tmp15 */ 3728 li t8, 2139 /* FIX(0.261052384) */ 3729 mul t8, t7, t8 /* MULTIPLY(tmp10, FIX(0.261052384)) */ 3730 li t7, 2295 /* FIX(0.280143716) */ 3731 mul t7, t1, t7 /* MULTIPLY(z1, FIX(0.280143716)) */ 3732 addu t9, t2, t3 /* z3 + z4 */ 3733 li s0, 8565 /* FIX(1.045510580) */ 3734 mul t9, t9, s0 /* -tmp13 */ 3735 li s0, 12112 /* FIX(1.478575242) */ 3736 mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242) */ 3737 li s1, 12998 /* FIX(1.586706681) */ 3738 mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ 3739 li s2, 5540 /* FIX(0.676326758) */ 3740 mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ 3741 li s3, 16244 /* FIX(1.982889723) */ 3742 mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ 3743 subu t1, t1, t3 /* z1-=z4 */ 3744 subu t0, t0, t2 /* z2-=z3 */ 3745 addu t2, t0, t1 /* z1+z2 */ 3746 li t3, 4433 /* FIX_0_541196100 */ 3747 mul t2, t2, t3 /* z3 */ 3748 li t3, 6270 /* FIX_0_765366865 */ 3749 mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ 3750 li t3, 15137 /* FIX_0_765366865 */ 3751 mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ 3752 addu t8, t6, t8 /* tmp12 */ 3753 addu t3, t8, t4 /* tmp12 + tmp11 */ 3754 addu t3, t3, t7 /* tmp10 */ 3755 subu t8, t8, t9 /* tmp12 + tmp13 */ 3756 addu s0, t5, s0 3757 subu t8, t8, s0 /* tmp12 */ 3758 subu t9, t6, t9 3759 subu s1, s1, t4 3760 addu t9, t9, s1 /* tmp13 */ 3761 subu t6, t6, t5 3762 subu t6, t6, s2 3763 subu t6, t6, s3 /* tmp15 */ 3764 /* even part start */ 3765 lh t4, 64(a1) 3766 lh t5, 64(a0) 3767 lh t7, 32(a1) 3768 lh s0, 32(a0) 3769 lh s1, 0(a1) 3770 lh s2, 0(a0) 3771 lh s3, 96(a1) 3772 lh v0, 96(a0) 3773 mul t4, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*4], 3774 quantptr[DCTSIZE*4]) */ 3775 mul t5, t7, s0 /* DEQUANTIZE(inptr[DCTSIZE*2], 3776 quantptr[DCTSIZE*2]) */ 3777 mul t7, s1, s2 /* DEQUANTIZE(inptr[DCTSIZE*0], 3778 quantptr[DCTSIZE*0]) */ 3779 mul s0, s3, v0 /* DEQUANTIZE(inptr[DCTSIZE*6], 3780 quantptr[DCTSIZE*6]) */ 3781 /* odd part end */ 3782 addu t1, t2, t1 /* tmp11 */ 3783 subu t0, t2, t0 /* tmp14 */ 3784 /* update counter and pointers */ 3785 addiu a3, a3, -1 3786 addiu a0, a0, 2 3787 addiu a1, a1, 2 3788 /* even part rest */ 3789 li s1, 10033 3790 li s2, 11190 3791 mul t4, t4, s1 /* z4 */ 3792 mul s1, t5, s2 /* z4 */ 3793 sll t5, t5, 13 /* z1 */ 3794 sll t7, t7, 13 3795 addiu t7, t7, 1024 /* z3 */ 3796 sll s0, s0, 13 /* z2 */ 3797 addu s2, t7, t4 /* tmp10 */ 3798 subu t4, t7, t4 /* tmp11 */ 3799 subu s3, t5, s0 /* tmp12 */ 3800 addu t2, t7, s3 /* tmp21 */ 3801 subu s3, t7, s3 /* tmp24 */ 3802 addu t7, s1, s0 /* tmp12 */ 3803 addu v0, s2, t7 /* tmp20 */ 3804 subu s2, s2, t7 /* tmp25 */ 3805 subu s1, s1, t5 /* z4 - z1 */ 3806 subu s1, s1, s0 /* tmp12 */ 3807 addu s0, t4, s1 /* tmp22 */ 3808 subu t4, t4, s1 /* tmp23 */ 3809 /* final output stage */ 3810 addu t5, v0, t3 3811 subu v0, v0, t3 3812 addu t3, t2, t1 3813 subu t2, t2, t1 3814 addu t1, s0, t8 3815 subu s0, s0, t8 3816 addu t8, t4, t9 3817 subu t4, t4, t9 3818 addu t9, s3, t0 3819 subu s3, s3, t0 3820 addu t0, s2, t6 3821 subu s2, s2, t6 3822 sra t5, t5, 11 3823 sra t3, t3, 11 3824 sra t1, t1, 11 3825 sra t8, t8, 11 3826 sra t9, t9, 11 3827 sra t0, t0, 11 3828 sra s2, s2, 11 3829 sra s3, s3, 11 3830 sra t4, t4, 11 3831 sra s0, s0, 11 3832 sra t2, t2, 11 3833 sra v0, v0, 11 3834 sw t5, 0(a2) 3835 sw t3, 32(a2) 3836 sw t1, 64(a2) 3837 sw t8, 96(a2) 3838 sw t9, 128(a2) 3839 sw t0, 160(a2) 3840 sw s2, 192(a2) 3841 sw s3, 224(a2) 3842 sw t4, 256(a2) 3843 sw s0, 288(a2) 3844 sw t2, 320(a2) 3845 sw v0, 352(a2) 3846 bgtz a3, 1b 3847 addiu a2, a2, 4 3848 3849 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 3850 3851 j ra 3852 nop 3853 3854END(jsimd_idct_12x12_pass1_dspr2) 3855 3856 3857/*****************************************************************************/ 3858LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2) 3859/* 3860 * a0 = workspace 3861 * a1 = output 3862 */ 3863 SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 3864 3865 li a3, 12 3866 38671: 3868 /* Odd part */ 3869 lw t0, 12(a0) 3870 lw t1, 4(a0) 3871 lw t2, 20(a0) 3872 lw t3, 28(a0) 3873 li t4, 10703 /* FIX(1.306562965) */ 3874 li t5, 4433 /* FIX_0_541196100 */ 3875 mul t4, t0, t4 /* tmp11 */ 3876 mul t5, t0, t5 /* -tmp14 */ 3877 addu t6, t1, t2 /* tmp10 */ 3878 li t7, 2139 /* FIX(0.261052384) */ 3879 mul t7, t6, t7 /* MULTIPLY(tmp10, FIX(0.261052384)) */ 3880 addu t6, t6, t3 /* tmp10 + z4 */ 3881 li t8, 7053 /* FIX(0.860918669) */ 3882 mul t6, t6, t8 /* tmp15 */ 3883 li t8, 2295 /* FIX(0.280143716) */ 3884 mul t8, t1, t8 /* MULTIPLY(z1, FIX(0.280143716)) */ 3885 addu t9, t2, t3 /* z3 + z4 */ 3886 li s0, 8565 /* FIX(1.045510580) */ 3887 mul t9, t9, s0 /* -tmp13 */ 3888 li s0, 12112 /* FIX(1.478575242) */ 3889 mul s0, t2, s0 /* MULTIPLY(z3, FIX(1.478575242)) */ 3890 li s1, 12998 /* FIX(1.586706681) */ 3891 mul s1, t3, s1 /* MULTIPLY(z4, FIX(1.586706681)) */ 3892 li s2, 5540 /* FIX(0.676326758) */ 3893 mul s2, t1, s2 /* MULTIPLY(z1, FIX(0.676326758)) */ 3894 li s3, 16244 /* FIX(1.982889723) */ 3895 mul s3, t3, s3 /* MULTIPLY(z4, FIX(1.982889723)) */ 3896 subu t1, t1, t3 /* z1 -= z4 */ 3897 subu t0, t0, t2 /* z2 -= z3 */ 3898 addu t2, t1, t0 /* z1 + z2 */ 3899 li t3, 4433 /* FIX_0_541196100 */ 3900 mul t2, t2, t3 /* z3 */ 3901 li t3, 6270 /* FIX_0_765366865 */ 3902 mul t1, t1, t3 /* MULTIPLY(z1, FIX_0_765366865) */ 3903 li t3, 15137 /* FIX_1_847759065 */ 3904 mul t0, t0, t3 /* MULTIPLY(z2, FIX_1_847759065) */ 3905 addu t3, t6, t7 /* tmp12 */ 3906 addu t7, t3, t4 3907 addu t7, t7, t8 /* tmp10 */ 3908 subu t3, t3, t9 3909 subu t3, t3, t5 3910 subu t3, t3, s0 /* tmp12 */ 3911 subu t9, t6, t9 3912 subu t9, t9, t4 3913 addu t9, t9, s1 /* tmp13 */ 3914 subu t6, t6, t5 3915 subu t6, t6, s2 3916 subu t6, t6, s3 /* tmp15 */ 3917 addu t1, t2, t1 /* tmp11 */ 3918 subu t0, t2, t0 /* tmp14 */ 3919 /* even part */ 3920 lw t2, 16(a0) /* z4 */ 3921 lw t4, 8(a0) /* z1 */ 3922 lw t5, 0(a0) /* z3 */ 3923 lw t8, 24(a0) /* z2 */ 3924 li s0, 10033 /* FIX(1.224744871) */ 3925 li s1, 11190 /* FIX(1.366025404) */ 3926 mul t2, t2, s0 /* z4 */ 3927 mul s0, t4, s1 /* z4 */ 3928 addiu t5, t5, 0x10 3929 sll t5, t5, 13 /* z3 */ 3930 sll t4, t4, 13 /* z1 */ 3931 sll t8, t8, 13 /* z2 */ 3932 subu s1, t4, t8 /* tmp12 */ 3933 addu s2, t5, t2 /* tmp10 */ 3934 subu t2, t5, t2 /* tmp11 */ 3935 addu s3, t5, s1 /* tmp21 */ 3936 subu s1, t5, s1 /* tmp24 */ 3937 addu t5, s0, t8 /* tmp12 */ 3938 addu v0, s2, t5 /* tmp20 */ 3939 subu t5, s2, t5 /* tmp25 */ 3940 subu t4, s0, t4 3941 subu t4, t4, t8 /* tmp12 */ 3942 addu t8, t2, t4 /* tmp22 */ 3943 subu t2, t2, t4 /* tmp23 */ 3944 /* increment counter and pointers */ 3945 addiu a3, a3, -1 3946 addiu a0, a0, 32 3947 /* Final stage */ 3948 addu t4, v0, t7 3949 subu v0, v0, t7 3950 addu t7, s3, t1 3951 subu s3, s3, t1 3952 addu t1, t8, t3 3953 subu t8, t8, t3 3954 addu t3, t2, t9 3955 subu t2, t2, t9 3956 addu t9, s1, t0 3957 subu s1, s1, t0 3958 addu t0, t5, t6 3959 subu t5, t5, t6 3960 sll t4, t4, 4 3961 sll t7, t7, 4 3962 sll t1, t1, 4 3963 sll t3, t3, 4 3964 sll t9, t9, 4 3965 sll t0, t0, 4 3966 sll t5, t5, 4 3967 sll s1, s1, 4 3968 sll t2, t2, 4 3969 sll t8, t8, 4 3970 sll s3, s3, 4 3971 sll v0, v0, 4 3972 shll_s.w t4, t4, 2 3973 shll_s.w t7, t7, 2 3974 shll_s.w t1, t1, 2 3975 shll_s.w t3, t3, 2 3976 shll_s.w t9, t9, 2 3977 shll_s.w t0, t0, 2 3978 shll_s.w t5, t5, 2 3979 shll_s.w s1, s1, 2 3980 shll_s.w t2, t2, 2 3981 shll_s.w t8, t8, 2 3982 shll_s.w s3, s3, 2 3983 shll_s.w v0, v0, 2 3984 srl t4, t4, 24 3985 srl t7, t7, 24 3986 srl t1, t1, 24 3987 srl t3, t3, 24 3988 srl t9, t9, 24 3989 srl t0, t0, 24 3990 srl t5, t5, 24 3991 srl s1, s1, 24 3992 srl t2, t2, 24 3993 srl t8, t8, 24 3994 srl s3, s3, 24 3995 srl v0, v0, 24 3996 lw t6, 0(a1) 3997 addiu t4, t4, 0x80 3998 addiu t7, t7, 0x80 3999 addiu t1, t1, 0x80 4000 addiu t3, t3, 0x80 4001 addiu t9, t9, 0x80 4002 addiu t0, t0, 0x80 4003 addiu t5, t5, 0x80 4004 addiu s1, s1, 0x80 4005 addiu t2, t2, 0x80 4006 addiu t8, t8, 0x80 4007 addiu s3, s3, 0x80 4008 addiu v0, v0, 0x80 4009 sb t4, 0(t6) 4010 sb t7, 1(t6) 4011 sb t1, 2(t6) 4012 sb t3, 3(t6) 4013 sb t9, 4(t6) 4014 sb t0, 5(t6) 4015 sb t5, 6(t6) 4016 sb s1, 7(t6) 4017 sb t2, 8(t6) 4018 sb t8, 9(t6) 4019 sb s3, 10(t6) 4020 sb v0, 11(t6) 4021 bgtz a3, 1b 4022 addiu a1, a1, 4 4023 4024 RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 4025 4026 jr ra 4027 nop 4028 4029END(jsimd_idct_12x12_pass2_dspr2) 4030 4031 4032/*****************************************************************************/ 4033LEAF_DSPR2(jsimd_convsamp_dspr2) 4034/* 4035 * a0 = sample_data 4036 * a1 = start_col 4037 * a2 = workspace 4038 */ 4039 lw t0, 0(a0) 4040 li t7, 0xff80ff80 4041 addu t0, t0, a1 4042 ulw t1, 0(t0) 4043 ulw t2, 4(t0) 4044 preceu.ph.qbr t3, t1 4045 preceu.ph.qbl t4, t1 4046 lw t0, 4(a0) 4047 preceu.ph.qbr t5, t2 4048 preceu.ph.qbl t6, t2 4049 addu t0, t0, a1 4050 addu.ph t3, t3, t7 4051 addu.ph t4, t4, t7 4052 ulw t1, 0(t0) 4053 ulw t2, 4(t0) 4054 addu.ph t5, t5, t7 4055 addu.ph t6, t6, t7 4056 usw t3, 0(a2) 4057 usw t4, 4(a2) 4058 preceu.ph.qbr t3, t1 4059 preceu.ph.qbl t4, t1 4060 usw t5, 8(a2) 4061 usw t6, 12(a2) 4062 4063 lw t0, 8(a0) 4064 preceu.ph.qbr t5, t2 4065 preceu.ph.qbl t6, t2 4066 addu t0, t0, a1 4067 addu.ph t3, t3, t7 4068 addu.ph t4, t4, t7 4069 ulw t1, 0(t0) 4070 ulw t2, 4(t0) 4071 addu.ph t5, t5, t7 4072 addu.ph t6, t6, t7 4073 usw t3, 16(a2) 4074 usw t4, 20(a2) 4075 preceu.ph.qbr t3, t1 4076 preceu.ph.qbl t4, t1 4077 usw t5, 24(a2) 4078 usw t6, 28(a2) 4079 4080 lw t0, 12(a0) 4081 preceu.ph.qbr t5, t2 4082 preceu.ph.qbl t6, t2 4083 addu t0, t0, a1 4084 addu.ph t3, t3, t7 4085 addu.ph t4, t4, t7 4086 ulw t1, 0(t0) 4087 ulw t2, 4(t0) 4088 addu.ph t5, t5, t7 4089 addu.ph t6, t6, t7 4090 usw t3, 32(a2) 4091 usw t4, 36(a2) 4092 preceu.ph.qbr t3, t1 4093 preceu.ph.qbl t4, t1 4094 usw t5, 40(a2) 4095 usw t6, 44(a2) 4096 4097 lw t0, 16(a0) 4098 preceu.ph.qbr t5, t2 4099 preceu.ph.qbl t6, t2 4100 addu t0, t0, a1 4101 addu.ph t3, t3, t7 4102 addu.ph t4, t4, t7 4103 ulw t1, 0(t0) 4104 ulw t2, 4(t0) 4105 addu.ph t5, t5, t7 4106 addu.ph t6, t6, t7 4107 usw t3, 48(a2) 4108 usw t4, 52(a2) 4109 preceu.ph.qbr t3, t1 4110 preceu.ph.qbl t4, t1 4111 usw t5, 56(a2) 4112 usw t6, 60(a2) 4113 4114 lw t0, 20(a0) 4115 preceu.ph.qbr t5, t2 4116 preceu.ph.qbl t6, t2 4117 addu t0, t0, a1 4118 addu.ph t3, t3, t7 4119 addu.ph t4, t4, t7 4120 ulw t1, 0(t0) 4121 ulw t2, 4(t0) 4122 addu.ph t5, t5, t7 4123 addu.ph t6, t6, t7 4124 usw t3, 64(a2) 4125 usw t4, 68(a2) 4126 preceu.ph.qbr t3, t1 4127 preceu.ph.qbl t4, t1 4128 usw t5, 72(a2) 4129 usw t6, 76(a2) 4130 4131 lw t0, 24(a0) 4132 preceu.ph.qbr t5, t2 4133 preceu.ph.qbl t6, t2 4134 addu t0, t0, a1 4135 addu.ph t3, t3, t7 4136 addu.ph t4, t4, t7 4137 ulw t1, 0(t0) 4138 ulw t2, 4(t0) 4139 addu.ph t5, t5, t7 4140 addu.ph t6, t6, t7 4141 usw t3, 80(a2) 4142 usw t4, 84(a2) 4143 preceu.ph.qbr t3, t1 4144 preceu.ph.qbl t4, t1 4145 usw t5, 88(a2) 4146 usw t6, 92(a2) 4147 4148 lw t0, 28(a0) 4149 preceu.ph.qbr t5, t2 4150 preceu.ph.qbl t6, t2 4151 addu t0, t0, a1 4152 addu.ph t3, t3, t7 4153 addu.ph t4, t4, t7 4154 ulw t1, 0(t0) 4155 ulw t2, 4(t0) 4156 addu.ph t5, t5, t7 4157 addu.ph t6, t6, t7 4158 usw t3, 96(a2) 4159 usw t4, 100(a2) 4160 preceu.ph.qbr t3, t1 4161 preceu.ph.qbl t4, t1 4162 usw t5, 104(a2) 4163 usw t6, 108(a2) 4164 preceu.ph.qbr t5, t2 4165 preceu.ph.qbl t6, t2 4166 addu.ph t3, t3, t7 4167 addu.ph t4, t4, t7 4168 addu.ph t5, t5, t7 4169 addu.ph t6, t6, t7 4170 usw t3, 112(a2) 4171 usw t4, 116(a2) 4172 usw t5, 120(a2) 4173 usw t6, 124(a2) 4174 4175 j ra 4176 nop 4177 4178END(jsimd_convsamp_dspr2) 4179 4180 4181#ifndef __mips_soft_float 4182 4183/*****************************************************************************/ 4184LEAF_DSPR2(jsimd_convsamp_float_dspr2) 4185/* 4186 * a0 = sample_data 4187 * a1 = start_col 4188 * a2 = workspace 4189 */ 4190 .set at 4191 4192 lw t0, 0(a0) 4193 addu t0, t0, a1 4194 lbu t1, 0(t0) 4195 lbu t2, 1(t0) 4196 lbu t3, 2(t0) 4197 lbu t4, 3(t0) 4198 lbu t5, 4(t0) 4199 lbu t6, 5(t0) 4200 lbu t7, 6(t0) 4201 lbu t8, 7(t0) 4202 addiu t1, t1, -128 4203 addiu t2, t2, -128 4204 addiu t3, t3, -128 4205 addiu t4, t4, -128 4206 addiu t5, t5, -128 4207 addiu t6, t6, -128 4208 addiu t7, t7, -128 4209 addiu t8, t8, -128 4210 mtc1 t1, f2 4211 mtc1 t2, f4 4212 mtc1 t3, f6 4213 mtc1 t4, f8 4214 mtc1 t5, f10 4215 mtc1 t6, f12 4216 mtc1 t7, f14 4217 mtc1 t8, f16 4218 cvt.s.w f2, f2 4219 cvt.s.w f4, f4 4220 cvt.s.w f6, f6 4221 cvt.s.w f8, f8 4222 cvt.s.w f10, f10 4223 cvt.s.w f12, f12 4224 cvt.s.w f14, f14 4225 cvt.s.w f16, f16 4226 lw t0, 4(a0) 4227 swc1 f2, 0(a2) 4228 swc1 f4, 4(a2) 4229 swc1 f6, 8(a2) 4230 addu t0, t0, a1 4231 swc1 f8, 12(a2) 4232 swc1 f10, 16(a2) 4233 swc1 f12, 20(a2) 4234 swc1 f14, 24(a2) 4235 swc1 f16, 28(a2) 4236 /* elemr 1 */ 4237 lbu t1, 0(t0) 4238 lbu t2, 1(t0) 4239 lbu t3, 2(t0) 4240 lbu t4, 3(t0) 4241 lbu t5, 4(t0) 4242 lbu t6, 5(t0) 4243 lbu t7, 6(t0) 4244 lbu t8, 7(t0) 4245 addiu t1, t1, -128 4246 addiu t2, t2, -128 4247 addiu t3, t3, -128 4248 addiu t4, t4, -128 4249 addiu t5, t5, -128 4250 addiu t6, t6, -128 4251 addiu t7, t7, -128 4252 addiu t8, t8, -128 4253 mtc1 t1, f2 4254 mtc1 t2, f4 4255 mtc1 t3, f6 4256 mtc1 t4, f8 4257 mtc1 t5, f10 4258 mtc1 t6, f12 4259 mtc1 t7, f14 4260 mtc1 t8, f16 4261 cvt.s.w f2, f2 4262 cvt.s.w f4, f4 4263 cvt.s.w f6, f6 4264 cvt.s.w f8, f8 4265 cvt.s.w f10, f10 4266 cvt.s.w f12, f12 4267 cvt.s.w f14, f14 4268 cvt.s.w f16, f16 4269 lw t0, 8(a0) 4270 swc1 f2, 32(a2) 4271 swc1 f4, 36(a2) 4272 swc1 f6, 40(a2) 4273 addu t0, t0, a1 4274 swc1 f8, 44(a2) 4275 swc1 f10, 48(a2) 4276 swc1 f12, 52(a2) 4277 swc1 f14, 56(a2) 4278 swc1 f16, 60(a2) 4279 /* elemr 2 */ 4280 lbu t1, 0(t0) 4281 lbu t2, 1(t0) 4282 lbu t3, 2(t0) 4283 lbu t4, 3(t0) 4284 lbu t5, 4(t0) 4285 lbu t6, 5(t0) 4286 lbu t7, 6(t0) 4287 lbu t8, 7(t0) 4288 addiu t1, t1, -128 4289 addiu t2, t2, -128 4290 addiu t3, t3, -128 4291 addiu t4, t4, -128 4292 addiu t5, t5, -128 4293 addiu t6, t6, -128 4294 addiu t7, t7, -128 4295 addiu t8, t8, -128 4296 mtc1 t1, f2 4297 mtc1 t2, f4 4298 mtc1 t3, f6 4299 mtc1 t4, f8 4300 mtc1 t5, f10 4301 mtc1 t6, f12 4302 mtc1 t7, f14 4303 mtc1 t8, f16 4304 cvt.s.w f2, f2 4305 cvt.s.w f4, f4 4306 cvt.s.w f6, f6 4307 cvt.s.w f8, f8 4308 cvt.s.w f10, f10 4309 cvt.s.w f12, f12 4310 cvt.s.w f14, f14 4311 cvt.s.w f16, f16 4312 lw t0, 12(a0) 4313 swc1 f2, 64(a2) 4314 swc1 f4, 68(a2) 4315 swc1 f6, 72(a2) 4316 addu t0, t0, a1 4317 swc1 f8, 76(a2) 4318 swc1 f10, 80(a2) 4319 swc1 f12, 84(a2) 4320 swc1 f14, 88(a2) 4321 swc1 f16, 92(a2) 4322 /* elemr 3 */ 4323 lbu t1, 0(t0) 4324 lbu t2, 1(t0) 4325 lbu t3, 2(t0) 4326 lbu t4, 3(t0) 4327 lbu t5, 4(t0) 4328 lbu t6, 5(t0) 4329 lbu t7, 6(t0) 4330 lbu t8, 7(t0) 4331 addiu t1, t1, -128 4332 addiu t2, t2, -128 4333 addiu t3, t3, -128 4334 addiu t4, t4, -128 4335 addiu t5, t5, -128 4336 addiu t6, t6, -128 4337 addiu t7, t7, -128 4338 addiu t8, t8, -128 4339 mtc1 t1, f2 4340 mtc1 t2, f4 4341 mtc1 t3, f6 4342 mtc1 t4, f8 4343 mtc1 t5, f10 4344 mtc1 t6, f12 4345 mtc1 t7, f14 4346 mtc1 t8, f16 4347 cvt.s.w f2, f2 4348 cvt.s.w f4, f4 4349 cvt.s.w f6, f6 4350 cvt.s.w f8, f8 4351 cvt.s.w f10, f10 4352 cvt.s.w f12, f12 4353 cvt.s.w f14, f14 4354 cvt.s.w f16, f16 4355 lw t0, 16(a0) 4356 swc1 f2, 96(a2) 4357 swc1 f4, 100(a2) 4358 swc1 f6, 104(a2) 4359 addu t0, t0, a1 4360 swc1 f8, 108(a2) 4361 swc1 f10, 112(a2) 4362 swc1 f12, 116(a2) 4363 swc1 f14, 120(a2) 4364 swc1 f16, 124(a2) 4365 /* elemr 4 */ 4366 lbu t1, 0(t0) 4367 lbu t2, 1(t0) 4368 lbu t3, 2(t0) 4369 lbu t4, 3(t0) 4370 lbu t5, 4(t0) 4371 lbu t6, 5(t0) 4372 lbu t7, 6(t0) 4373 lbu t8, 7(t0) 4374 addiu t1, t1, -128 4375 addiu t2, t2, -128 4376 addiu t3, t3, -128 4377 addiu t4, t4, -128 4378 addiu t5, t5, -128 4379 addiu t6, t6, -128 4380 addiu t7, t7, -128 4381 addiu t8, t8, -128 4382 mtc1 t1, f2 4383 mtc1 t2, f4 4384 mtc1 t3, f6 4385 mtc1 t4, f8 4386 mtc1 t5, f10 4387 mtc1 t6, f12 4388 mtc1 t7, f14 4389 mtc1 t8, f16 4390 cvt.s.w f2, f2 4391 cvt.s.w f4, f4 4392 cvt.s.w f6, f6 4393 cvt.s.w f8, f8 4394 cvt.s.w f10, f10 4395 cvt.s.w f12, f12 4396 cvt.s.w f14, f14 4397 cvt.s.w f16, f16 4398 lw t0, 20(a0) 4399 swc1 f2, 128(a2) 4400 swc1 f4, 132(a2) 4401 swc1 f6, 136(a2) 4402 addu t0, t0, a1 4403 swc1 f8, 140(a2) 4404 swc1 f10, 144(a2) 4405 swc1 f12, 148(a2) 4406 swc1 f14, 152(a2) 4407 swc1 f16, 156(a2) 4408 /* elemr 5 */ 4409 lbu t1, 0(t0) 4410 lbu t2, 1(t0) 4411 lbu t3, 2(t0) 4412 lbu t4, 3(t0) 4413 lbu t5, 4(t0) 4414 lbu t6, 5(t0) 4415 lbu t7, 6(t0) 4416 lbu t8, 7(t0) 4417 addiu t1, t1, -128 4418 addiu t2, t2, -128 4419 addiu t3, t3, -128 4420 addiu t4, t4, -128 4421 addiu t5, t5, -128 4422 addiu t6, t6, -128 4423 addiu t7, t7, -128 4424 addiu t8, t8, -128 4425 mtc1 t1, f2 4426 mtc1 t2, f4 4427 mtc1 t3, f6 4428 mtc1 t4, f8 4429 mtc1 t5, f10 4430 mtc1 t6, f12 4431 mtc1 t7, f14 4432 mtc1 t8, f16 4433 cvt.s.w f2, f2 4434 cvt.s.w f4, f4 4435 cvt.s.w f6, f6 4436 cvt.s.w f8, f8 4437 cvt.s.w f10, f10 4438 cvt.s.w f12, f12 4439 cvt.s.w f14, f14 4440 cvt.s.w f16, f16 4441 lw t0, 24(a0) 4442 swc1 f2, 160(a2) 4443 swc1 f4, 164(a2) 4444 swc1 f6, 168(a2) 4445 addu t0, t0, a1 4446 swc1 f8, 172(a2) 4447 swc1 f10, 176(a2) 4448 swc1 f12, 180(a2) 4449 swc1 f14, 184(a2) 4450 swc1 f16, 188(a2) 4451 /* elemr 6 */ 4452 lbu t1, 0(t0) 4453 lbu t2, 1(t0) 4454 lbu t3, 2(t0) 4455 lbu t4, 3(t0) 4456 lbu t5, 4(t0) 4457 lbu t6, 5(t0) 4458 lbu t7, 6(t0) 4459 lbu t8, 7(t0) 4460 addiu t1, t1, -128 4461 addiu t2, t2, -128 4462 addiu t3, t3, -128 4463 addiu t4, t4, -128 4464 addiu t5, t5, -128 4465 addiu t6, t6, -128 4466 addiu t7, t7, -128 4467 addiu t8, t8, -128 4468 mtc1 t1, f2 4469 mtc1 t2, f4 4470 mtc1 t3, f6 4471 mtc1 t4, f8 4472 mtc1 t5, f10 4473 mtc1 t6, f12 4474 mtc1 t7, f14 4475 mtc1 t8, f16 4476 cvt.s.w f2, f2 4477 cvt.s.w f4, f4 4478 cvt.s.w f6, f6 4479 cvt.s.w f8, f8 4480 cvt.s.w f10, f10 4481 cvt.s.w f12, f12 4482 cvt.s.w f14, f14 4483 cvt.s.w f16, f16 4484 lw t0, 28(a0) 4485 swc1 f2, 192(a2) 4486 swc1 f4, 196(a2) 4487 swc1 f6, 200(a2) 4488 addu t0, t0, a1 4489 swc1 f8, 204(a2) 4490 swc1 f10, 208(a2) 4491 swc1 f12, 212(a2) 4492 swc1 f14, 216(a2) 4493 swc1 f16, 220(a2) 4494 /* elemr 7 */ 4495 lbu t1, 0(t0) 4496 lbu t2, 1(t0) 4497 lbu t3, 2(t0) 4498 lbu t4, 3(t0) 4499 lbu t5, 4(t0) 4500 lbu t6, 5(t0) 4501 lbu t7, 6(t0) 4502 lbu t8, 7(t0) 4503 addiu t1, t1, -128 4504 addiu t2, t2, -128 4505 addiu t3, t3, -128 4506 addiu t4, t4, -128 4507 addiu t5, t5, -128 4508 addiu t6, t6, -128 4509 addiu t7, t7, -128 4510 addiu t8, t8, -128 4511 mtc1 t1, f2 4512 mtc1 t2, f4 4513 mtc1 t3, f6 4514 mtc1 t4, f8 4515 mtc1 t5, f10 4516 mtc1 t6, f12 4517 mtc1 t7, f14 4518 mtc1 t8, f16 4519 cvt.s.w f2, f2 4520 cvt.s.w f4, f4 4521 cvt.s.w f6, f6 4522 cvt.s.w f8, f8 4523 cvt.s.w f10, f10 4524 cvt.s.w f12, f12 4525 cvt.s.w f14, f14 4526 cvt.s.w f16, f16 4527 swc1 f2, 224(a2) 4528 swc1 f4, 228(a2) 4529 swc1 f6, 232(a2) 4530 swc1 f8, 236(a2) 4531 swc1 f10, 240(a2) 4532 swc1 f12, 244(a2) 4533 swc1 f14, 248(a2) 4534 swc1 f16, 252(a2) 4535 4536 j ra 4537 nop 4538 4539END(jsimd_convsamp_float_dspr2) 4540 4541#endif 4542 4543/*****************************************************************************/ 4544