#
# Copyright (C) 2011 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# IDCT implementation using the MIPS DSP ASE (little endian version)
#
# See MIPS Technologies Inc documents:
# "JPEG Decoder Optimization for MIPS32(R) Cores" MD00483
#
# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP
# Application Specific Extension to the MIPS32(R) Architecture" MD00374
#
# Reading notes for this file:
#   * The code is assembled with ".set noreorder", so the instruction that
#     follows every branch/jump is its delay slot and executes whether or
#     not the branch is taken.  Delay-slot instructions are indented by two
#     extra spaces below.
#   * ".set noat" is required because $at is used as an ordinary register
#     (it holds the mips_idct_coefs pointer).
#   * Every 32-bit data word holds two 16-bit values that are processed in
#     parallel by the ".ph" (paired halfword) DSP instructions, so each
#     loop iteration handles two columns (column pass) or two rows (row
#     pass) at once.

        .set    noreorder
        .set    nomacro
        .set    noat

# This table has been moved to mips_jidctfst.c to avoid having to mess
# with the global pointer to make this code PIC.
#       .rdata
#
# mips_idct_coefs:
#       # Constant table of scaled IDCT coefficients.
#
#       .word 0x45464546        # FIX( 1.082392200 / 2) =  17734 = 0x4546
#       .word 0x5A825A82        # FIX( 1.414213562 / 2) =  23170 = 0x5A82
#       .word 0x76427642        # FIX( 1.847759065 / 2) =  30274 = 0x7642
#       .word 0xAC61AC61        # FIX(-2.613125930 / 4) = -21407 = 0xAC61
#
# As documented above, the constants are stored divided by 2 (or by 4 for
# the last one).  That is why every mulq_rs.ph that uses one of them is
# followed by a "shll_s.ph ..., 1  # x2" (or "..., 2  # x4") on its result.

        .text

        .global mips_idct_columns
        .ent    mips_idct_columns

# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr,
#                        DCTELEM * wsptr, const int * mips_idct_coefs);
#
# Column pass: dequantizes the input coefficients (inptr[] * quantptr[])
# and writes the 1-D IDCT of each column to wsptr.  Two adjacent columns
# are processed per iteration; the loop stops once inptr has advanced by
# 16 bytes.  Rows are 16 bytes apart in inptr, quantptr and wsptr.
#
# Shortcut: when the raw input words for rows 1..7 of the current column
# pair are all zero, the dequantized row-0 value is copied to all eight
# rows of the workspace and the full butterfly is skipped.
#
# Stack: 32 bytes, used only to preserve $s0-$s7.

mips_idct_columns:

# $a0      - inptr     (advanced by 4 bytes per iteration)
# $a1      - quantptr  (advanced by 4 bytes per iteration)
# $a2      - wsptr     (advanced by 4 bytes per iteration)
# $a3, $at - mips_idct_coefs ($a3 on entry, copied to $at)
# $t0:7    - simd data
# $t8      - coefficients, temp
# $t9      - loop end address
# $s0:3    - simd quantization factors (later reused for output sums)
# $s4:7    - temp results
# $v0:1    - temp results

        addiu           $sp, $sp, -32           # reserve stack space for s0-s7

        sw              $s0, 28($sp)
        sw              $s1, 24($sp)
        sw              $s2, 20($sp)
        sw              $s3, 16($sp)
        sw              $s4, 12($sp)
        sw              $s5, 8($sp)
        sw              $s6, 4($sp)
        sw              $s7, 0($sp)

        addiu           $t9, $a0, 16            # end address

        #lui    $at, %hi(mips_idct_coefs)
        #ori    $at, %lo(mips_idct_coefs)
        # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it
        or              $at, $a3, $zero

loop_columns:

        lw              $s0, 0($a1)             # quantptr[DCTSIZE*0]

        lw              $t0, 0($a0)             # inptr[DCTSIZE*0]
        lw              $t1, 16($a0)            # inptr[DCTSIZE*1]

        muleq_s.w.phl   $v0, $t0, $s0           # tmp0 ...

        lw              $t2, 32($a0)            # inptr[DCTSIZE*2]
        lw              $t3, 48($a0)            # inptr[DCTSIZE*3]
        lw              $t4, 64($a0)            # inptr[DCTSIZE*4]
        lw              $t5, 80($a0)            # inptr[DCTSIZE*5]

        muleq_s.w.phr   $t0, $t0, $s0           # ... tmp0 ...

        lw              $t6, 96($a0)            # inptr[DCTSIZE*6]
        lw              $t7, 112($a0)           # inptr[DCTSIZE*7]

        # Zero test on the raw (not yet dequantized) words of rows 1..7.
        or              $s4, $t1, $t2
        or              $s5, $t3, $t4

        bnez            $s4, full_column
          ins           $t0, $v0, 16, 16        # ... tmp0   (delay slot)

        bnez            $s5, full_column
          or            $s6, $t5, $t6           # (delay slot)
        or              $s6, $s6, $t7
        bnez            $s6, full_column

          # Delay slot: this store also executes when the branch is taken;
          # full_column stores its own result to wsptr[DCTSIZE*0] later.
          sw            $t0, 0($a2)             # wsptr[DCTSIZE*0]
        sw              $t0, 16($a2)            # wsptr[DCTSIZE*1]
        sw              $t0, 32($a2)            # wsptr[DCTSIZE*2]
        sw              $t0, 48($a2)            # wsptr[DCTSIZE*3]
        sw              $t0, 64($a2)            # wsptr[DCTSIZE*4]
        sw              $t0, 80($a2)            # wsptr[DCTSIZE*5]
        sw              $t0, 96($a2)            # wsptr[DCTSIZE*6]
        sw              $t0, 112($a2)           # wsptr[DCTSIZE*7]

        addiu           $a0, $a0, 4

        b               continue_columns
          addiu         $a1, $a1, 4             # (delay slot)


full_column:

        # Even part: rows 0, 2, 4, 6.

        lw              $s1, 32($a1)            # quantptr[DCTSIZE*2]
        lw              $s2, 64($a1)            # quantptr[DCTSIZE*4]

        muleq_s.w.phl   $v0, $t2, $s1           # tmp1 ...
        muleq_s.w.phr   $t2, $t2, $s1           # ... tmp1 ...

        lw              $s0, 16($a1)            # quantptr[DCTSIZE*1]
        lw              $s1, 48($a1)            # quantptr[DCTSIZE*3]
        lw              $s3, 96($a1)            # quantptr[DCTSIZE*6]

        muleq_s.w.phl   $v1, $t4, $s2           # tmp2 ...
        muleq_s.w.phr   $t4, $t4, $s2           # ... tmp2 ...

        lw              $s2, 80($a1)            # quantptr[DCTSIZE*5]
        lw              $t8, 4($at)             # FIX(1.414213562 / 2)
        ins             $t2, $v0, 16, 16        # ... tmp1

        muleq_s.w.phl   $v0, $t6, $s3           # tmp3 ...
        muleq_s.w.phr   $t6, $t6, $s3           # ... tmp3 ...

        ins             $t4, $v1, 16, 16        # ... tmp2

        addq.ph         $s4, $t0, $t4           # tmp10
        subq.ph         $s5, $t0, $t4           # tmp11

        ins             $t6, $v0, 16, 16        # ... tmp3

        subq.ph         $s6, $t2, $t6           # tmp12 ...
        addq.ph         $s7, $t2, $t6           # tmp13

        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...

        addq.ph         $t0, $s4, $s7           # tmp0
        subq.ph         $t6, $s4, $s7           # tmp3

################

        # Odd part: rows 1, 3, 5, 7 (interleaved with the tail of the even part).

        muleq_s.w.phl   $v0, $t1, $s0           # tmp4 ...
        muleq_s.w.phr   $t1, $t1, $s0           # ... tmp4 ...

        shll_s.ph       $s6, $s6, 1             # x2

        lw              $s3, 112($a1)           # quantptr[DCTSIZE*7]

        subq.ph         $s6, $s6, $s7           # ... tmp12

        muleq_s.w.phl   $v1, $t7, $s3           # tmp7 ...
        muleq_s.w.phr   $t7, $t7, $s3           # ... tmp7 ...

        ins             $t1, $v0, 16, 16        # ... tmp4

        addq.ph         $t2, $s5, $s6           # tmp1
        subq.ph         $t4, $s5, $s6           # tmp2

        muleq_s.w.phl   $v0, $t5, $s2           # tmp6 ...
        muleq_s.w.phr   $t5, $t5, $s2           # ... tmp6 ...

        ins             $t7, $v1, 16, 16        # ... tmp7

        addq.ph         $s5, $t1, $t7           # z11
        subq.ph         $s6, $t1, $t7           # z12

        muleq_s.w.phl   $v1, $t3, $s1           # tmp5 ...
        muleq_s.w.phr   $t3, $t3, $s1           # ... tmp5 ...

        ins             $t5, $v0, 16, 16        # ... tmp6

# stalls

        ins             $t3, $v1, 16, 16        # ... tmp5


        addq.ph         $s7, $t5, $t3           # z13
        subq.ph         $v0, $t5, $t3           # z10

        addq.ph         $t7, $s5, $s7           # tmp7
        subq.ph         $s5, $s5, $s7           # tmp11 ...

        addq.ph         $v1, $v0, $s6           # z5 ...

        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11

        lw              $t8, 8($at)             # FIX(1.847759065 / 2)
        lw              $s4, 0($at)             # FIX(1.082392200 / 2)

        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
        subq.ph         $s1, $t0, $t7           # tmp0 - tmp7

        mulq_rs.ph      $v1, $v1, $t8           # ... z5

        shll_s.ph       $s5, $s5, 1             # x2

        lw              $t8, 12($at)            # FIX(-2.613125930 / 4)
        sw              $s0, 0($a2)             # wsptr[DCTSIZE*0]

        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...

        shll_s.ph       $v1, $v1, 1             # x2

        # Pointer bumps are scheduled here; $a2 is bumped in the loop
        # branch delay slot, so the wsptr offsets below are still valid.
        addiu           $a0, $a0, 4
        addiu           $a1, $a1, 4

        sw              $s1, 112($a2)           # wsptr[DCTSIZE*7]

        shll_s.ph       $s6, $v0, 2             # x4
        shll_s.ph       $s4, $s4, 1             # x2
        addq.ph         $s6, $s6, $v1           # ... tmp12

        subq.ph         $t5, $s6, $t7           # tmp6
        subq.ph         $s4, $s4, $v1           # ... tmp10
        subq.ph         $t3, $s5, $t5           # tmp5
        addq.ph         $s2, $t2, $t5           # tmp1 + tmp6
        addq.ph         $t1, $s4, $t3           # tmp4
        subq.ph         $s3, $t2, $t5           # tmp1 - tmp6

        sw              $s2, 16($a2)            # wsptr[DCTSIZE*1]
        sw              $s3, 96($a2)            # wsptr[DCTSIZE*6]

        addq.ph         $v0, $t4, $t3           # tmp2 + tmp5
        subq.ph         $v1, $t4, $t3           # tmp2 - tmp5

        sw              $v0, 32($a2)            # wsptr[DCTSIZE*2]
        sw              $v1, 80($a2)            # wsptr[DCTSIZE*5]

        addq.ph         $v0, $t6, $t1           # tmp3 + tmp4
        subq.ph         $v1, $t6, $t1           # tmp3 - tmp4

        sw              $v0, 64($a2)            # wsptr[DCTSIZE*4]
        sw              $v1, 48($a2)            # wsptr[DCTSIZE*3]

continue_columns:

        bne             $a0, $t9, loop_columns
          addiu         $a2, $a2, 4             # (delay slot) next column pair in wsptr


        lw              $s0, 28($sp)
        lw              $s1, 24($sp)
        lw              $s2, 20($sp)
        lw              $s3, 16($sp)
        lw              $s4, 12($sp)
        lw              $s5, 8($sp)
        lw              $s6, 4($sp)
        lw              $s7, 0($sp)

        jr              $ra
          addiu         $sp, $sp, 32            # (delay slot) release stack frame


        .end    mips_idct_columns


##################################################################


        .global mips_idct_rows
        .ent    mips_idct_rows

# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf,
#                     JDIMENSION output_col, const int * mips_idct_coefs);
#
# Row pass: reads the workspace two rows at a time (rows are 16 bytes
# apart, 32 bytes consumed per iteration, 128 bytes in total), computes
# the 1-D IDCT of each row and stores 8 output bytes per row at
# output_buf[row] + output_col.
#
# Final scaling: each 16-bit result is shifted left by SHIFT with
# saturation (shll_s.ph), its high byte is kept (precrq.qb.ph) and 0x80 is
# added to every byte (addu.qb with $s8).
#
# Shortcut: when columns 1..7 of both rows are zero, the column-0 value of
# each row is replicated across that row's eight output bytes.
#
# Stack: 48 bytes.  0..32($sp) hold $s8..$s0, 36($sp) holds the incoming
# $a3 (mips_idct_coefs); the remaining bytes are not used.

mips_idct_rows:

# $a0      - wsptr       (advanced by 32 bytes per iteration)
# $a1      - output_buf  (advanced by 8 bytes, i.e. two row pointers, per iteration)
# $a2      - output_col
# $a3      - mips_idct_coefs on entry (spilled to 36($sp)); afterwards outptr
# $at      - mips_idct_coefs, reloaded from the stack at the top of every
#            iteration because the DC-only path reuses $at as a second outptr
# $t0:7    - simd data
# $t8      - coefficients, temp
# $t9      - loop end address
# $s0:7    - temp results / packed output samples
# $s8      - const 0x80808080
# $v0:1    - temp results

SHIFT = 2

        addiu           $sp, $sp, -48           # reserve stack space for s0-s8 and $a3

        # save $a3 (mips_idct_coefs) because it might get clobbered below
        sw              $a3, 36($sp)

        sw              $s0, 32($sp)
        sw              $s1, 28($sp)
        sw              $s2, 24($sp)
        sw              $s3, 20($sp)
        sw              $s4, 16($sp)
        sw              $s5, 12($sp)
        sw              $s6, 8($sp)
        sw              $s7, 4($sp)
        sw              $s8, 0($sp)

        addiu           $t9, $a0, 128           # end address

        lui             $s8, 0x8080
        ori             $s8, $s8, 0x8080        # $s8 = 0x80808080

loop_rows:

        lw              $at, 36($sp)            # restore saved $a3 (mips_idct_coefs)

        # Lower-case letters: first row of the pair; upper-case: second row.
        lw              $t0, 0+0($a0)           # wsptr[DCTSIZE*0+0/1]  b a
        lw              $s0, 16+0($a0)          # wsptr[DCTSIZE*1+0/1]  B A
        lw              $t2, 0+4($a0)           # wsptr[DCTSIZE*0+2/3]  d c
        lw              $s2, 16+4($a0)          # wsptr[DCTSIZE*1+2/3]  D C
        lw              $t4, 0+8($a0)           # wsptr[DCTSIZE*0+4/5]  f e
        lw              $s4, 16+8($a0)          # wsptr[DCTSIZE*1+4/5]  F E
        lw              $t6, 0+12($a0)          # wsptr[DCTSIZE*0+6/7]  h g
        lw              $s6, 16+12($a0)         # wsptr[DCTSIZE*1+6/7]  H G

        # Transpose the first word pair so each register holds one column
        # of both rows.
        precrq.ph.w     $t1, $s0, $t0           # B b
        ins             $t0, $s0, 16, 16        # A a

        # Zero test on columns 1..7 of both rows.  $s0 is free to use as
        # scratch: full_row recomputes it before reading it.
        bnez            $t1, full_row
          or            $s0, $t2, $s2           # (delay slot)
        bnez            $s0, full_row
          or            $s0, $t4, $s4           # (delay slot)
        bnez            $s0, full_row
          or            $s0, $t6, $s6           # (delay slot)
        bnez            $s0, full_row

          # Delay slot: also executes when the branch is taken.
          shll_s.ph     $s0, $t0, SHIFT         # A a

        lw              $a3, 0($a1)             # row pointer for the first row
        lw              $at, 4($a1)             # row pointer for the second row (clobbers coefs ptr)

        precrq.ph.w     $t0, $s0, $s0           # A A
        ins             $s0, $s0, 16, 16        # a a

        addu            $a3, $a3, $a2
        addu            $at, $at, $a2

        precrq.qb.ph    $t0, $t0, $t0           # A A A A
        precrq.qb.ph    $s0, $s0, $s0           # a a a a


        addu.qb         $s0, $s0, $s8           # + 0x80 per byte
        addu.qb         $t0, $t0, $s8           # + 0x80 per byte


        sw              $s0, 0($a3)
        sw              $s0, 4($a3)

        sw              $t0, 0($at)
        sw              $t0, 4($at)


        addiu           $a0, $a0, 32

        bne             $a0, $t9, loop_rows
          addiu         $a1, $a1, 8             # (delay slot)

        b               exit_rows
          nop                                   # (delay slot)


full_row:

        # Finish the transpose for the remaining three word pairs.
        precrq.ph.w     $t3, $s2, $t2
        ins             $t2, $s2, 16, 16

        precrq.ph.w     $t5, $s4, $t4
        ins             $t4, $s4, 16, 16

        precrq.ph.w     $t7, $s6, $t6
        ins             $t6, $s6, 16, 16


        # Even part.

        lw              $t8, 4($at)             # FIX(1.414213562 / 2)

        addq.ph         $s4, $t0, $t4           # tmp10
        subq.ph         $s5, $t0, $t4           # tmp11

        subq.ph         $s6, $t2, $t6           # tmp12 ...
        addq.ph         $s7, $t2, $t6           # tmp13

        mulq_rs.ph      $s6, $s6, $t8           # ... tmp12 ...

        addq.ph         $t0, $s4, $s7           # tmp0
        subq.ph         $t6, $s4, $s7           # tmp3

        shll_s.ph       $s6, $s6, 1             # x2

        subq.ph         $s6, $s6, $s7           # ... tmp12

        addq.ph         $t2, $s5, $s6           # tmp1
        subq.ph         $t4, $s5, $s6           # tmp2

################

        # Odd part.

        addq.ph         $s5, $t1, $t7           # z11
        subq.ph         $s6, $t1, $t7           # z12

        addq.ph         $s7, $t5, $t3           # z13
        subq.ph         $v0, $t5, $t3           # z10

        addq.ph         $t7, $s5, $s7           # tmp7
        subq.ph         $s5, $s5, $s7           # tmp11 ...

        addq.ph         $v1, $v0, $s6           # z5 ...

        mulq_rs.ph      $s5, $s5, $t8           # ... tmp11

        lw              $t8, 8($at)             # FIX(1.847759065 / 2)
        lw              $s4, 0($at)             # FIX(1.082392200 / 2)

        addq.ph         $s0, $t0, $t7           # tmp0 + tmp7
        subq.ph         $s7, $t0, $t7           # tmp0 - tmp7

        mulq_rs.ph      $v1, $v1, $t8           # ... z5

        lw              $a3, 0($a1)             # row pointer for the first row
        lw              $t8, 12($at)            # FIX(-2.613125930 / 4)

        shll_s.ph       $s5, $s5, 1             # x2

        addu            $a3, $a3, $a2           # outptr = output_buf[row] + output_col

        mulq_rs.ph      $v0, $v0, $t8           # tmp12 ...
        mulq_rs.ph      $s4, $s6, $s4           # tmp10 ...

        shll_s.ph       $v1, $v1, 1             # x2

        # Pointer bumps are scheduled early; the second row pointer is
        # therefore fetched from -4($a1) further down.
        addiu           $a0, $a0, 32
        addiu           $a1, $a1, 8


        shll_s.ph       $s6, $v0, 2             # x4
        shll_s.ph       $s4, $s4, 1             # x2
        addq.ph         $s6, $s6, $v1           # ... tmp12

        shll_s.ph       $s0, $s0, SHIFT

        subq.ph         $t5, $s6, $t7           # tmp6
        subq.ph         $s4, $s4, $v1           # ... tmp10
        subq.ph         $t3, $s5, $t5           # tmp5

        shll_s.ph       $s7, $s7, SHIFT

        addq.ph         $t1, $s4, $t3           # tmp4


        addq.ph         $s1, $t2, $t5           # tmp1 + tmp6
        subq.ph         $s6, $t2, $t5           # tmp1 - tmp6

        addq.ph         $s2, $t4, $t3           # tmp2 + tmp5
        subq.ph         $s5, $t4, $t3           # tmp2 - tmp5

        addq.ph         $s4, $t6, $t1           # tmp3 + tmp4
        subq.ph         $s3, $t6, $t1           # tmp3 - tmp4


        shll_s.ph       $s1, $s1, SHIFT
        shll_s.ph       $s2, $s2, SHIFT
        shll_s.ph       $s3, $s3, SHIFT
        shll_s.ph       $s4, $s4, SHIFT
        shll_s.ph       $s5, $s5, SHIFT
        shll_s.ph       $s6, $s6, SHIFT


        # Transpose back to row order, then pack the high byte of each
        # halfword into one output word per four samples.
        precrq.ph.w     $t0, $s1, $s0           # B A
        ins             $s0, $s1, 16, 16        # b a

        precrq.ph.w     $t2, $s3, $s2           # D C
        ins             $s2, $s3, 16, 16        # d c

        precrq.ph.w     $t4, $s5, $s4           # F E
        ins             $s4, $s5, 16, 16        # f e

        precrq.ph.w     $t6, $s7, $s6           # H G
        ins             $s6, $s7, 16, 16        # h g

        precrq.qb.ph    $t0, $t2, $t0           # D C B A
        precrq.qb.ph    $s0, $s2, $s0           # d c b a

        precrq.qb.ph    $t4, $t6, $t4           # H G F E
        precrq.qb.ph    $s4, $s6, $s4           # h g f e


        addu.qb         $s0, $s0, $s8           # + 0x80 per byte
        addu.qb         $s4, $s4, $s8           # + 0x80 per byte


        sw              $s0, 0($a3)             # outptr[0/1/2/3]       d c b a
        sw              $s4, 4($a3)             # outptr[4/5/6/7]       h g f e

        lw              $a3, -4($a1)            # row pointer for the second row ($a1 already bumped by 8)

        addu.qb         $t0, $t0, $s8           # + 0x80 per byte

        addu            $a3, $a3, $a2

        addu.qb         $t4, $t4, $s8           # + 0x80 per byte


        sw              $t0, 0($a3)             # outptr[0/1/2/3]       D C B A

        bne             $a0, $t9, loop_rows
          sw            $t4, 4($a3)             # outptr[4/5/6/7]       H G F E   (delay slot)


exit_rows:

        lw              $s0, 32($sp)
        lw              $s1, 28($sp)
        lw              $s2, 24($sp)
        lw              $s3, 20($sp)
        lw              $s4, 16($sp)
        lw              $s5, 12($sp)
        lw              $s6, 8($sp)
        lw              $s7, 4($sp)
        lw              $s8, 0($sp)

        jr              $ra
          addiu         $sp, $sp, 48            # (delay slot) release stack frame


        .end    mips_idct_rows