/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#define W1  22725   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2  21407   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3  19266   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4  16383   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W5  12873   /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6  8867    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7  4520    /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))

function idct_row_armv5te
        str    lr, [sp, #-4]!

        ldrd   v1, v2, [a1, #8]
        ldrd   a3, a4, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
        orrs   v1, v1, v2
        itt    eq
        cmpeq  v1, a4
        cmpeq  v1, a3, lsr #16
        beq    row_dc_only

        mov    v1, #(1<<(ROW_SHIFT-1))
        mov    ip, #16384
        sub    ip, ip, #1                /* ip = W4 */
        smlabb v1, ip, a3, v1            /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr    ip, =W26                  /* ip = W2 | (W6 << 16) */
        smultb a2, ip, a4
        smulbb lr, ip, a4
        add    v2, v1, a2
        sub    v3, v1, a2
        sub    v4, v1, lr
        add    v1, v1, lr

        ldr    ip, =W13                  /* ip = W1 | (W3 << 16) */
        ldr    lr, =W57                  /* lr = W5 | (W7 << 16) */
        smulbt v5, ip, a3
        smultt v6, lr, a4
        smlatt v5, ip, a4, v5
        smultt a2, ip, a3
        smulbt v7, lr, a3
        sub    v6, v6, a2
        smulbt a2, ip, a4
        smultt fp, lr, a3
        sub    v7, v7, a2
        smulbt a2, lr, a4
        ldrd   a3, a4, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
        sub    fp, fp, a2

        orrs   a2, a3, a4
        beq    1f

        smlabt v5, lr, a3, v5
        smlabt v6, ip, a3, v6
        smlatt v5, lr, a4, v5
        smlabt v6, lr, a4, v6
        smlatt v7, lr, a3, v7
        smlatt fp, ip, a3, fp
        smulbt a2, ip, a4
        smlatt v7, ip, a4, v7
        sub    fp, fp, a2

        ldr    ip, =W26                  /* ip = W2 | (W6 << 16) */
        mov    a2, #16384
        sub    a2, a2, #1                /* a2 =  W4 */
        smulbb a2, a2, a3                /* a2 =  W4*row[4] */
        smultb lr, ip, a4                /* lr =  W6*row[6] */
        add    v1, v1, a2                /* v1 += W4*row[4] */
        add    v1, v1, lr                /* v1 += W6*row[6] */
        add    v4, v4, a2                /* v4 += W4*row[4] */
        sub    v4, v4, lr                /* v4 -= W6*row[6] */
        smulbb lr, ip, a4                /* lr =  W2*row[6] */
        sub    v2, v2, a2                /* v2 -= W4*row[4] */
        sub    v2, v2, lr                /* v2 -= W2*row[6] */
        sub    v3, v3, a2                /* v3 -= W4*row[4] */
        add    v3, v3, lr                /* v3 += W2*row[6] */

1:      add    a2, v1, v5
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v2, v6
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v3, v7
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        add    a2, v4, fp
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1]
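
        /* First half of the row (row[0..3]) is stored above; row[4..7] are
           packed and stored the same way below: two (x >> ROW_SHIFT) results
           per 32-bit word, the bic clearing the sign bits that lsr leaves in
           bits 16-20 so the halfword added in at bit 16 is not corrupted. */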

        sub    a2, v4, fp
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v3, v7
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v2, v6
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        sub    a2, v1, v5
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4

row_dc_only:
        orr    a3, a3, a3, lsl #16
        bic    a3, a3, #0xe000
        mov    a3, a3, lsl #3
        mov    a4, a3
        strd   a3, a4, [a1]
        strd   a3, a4, [a1, #8]

        ldr    pc, [sp], #4
endfunc

        .macro idct_col
        ldr    a4, [a1]                  /* a4 = col[1:0] */
        mov    ip, #16384
        sub    ip, ip, #1                /* ip = W4 */
        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add    v2, v1, a4, asr #16
        rsb    v2, v2, v2, lsl #14
        mov    a4, a4, lsl #16
        add    v1, v1, a4, asr #16
        ldr    a4, [a1, #(16*4)]
        rsb    v1, v1, v1, lsl #14

        smulbb lr, ip, a4
        smulbt a3, ip, a4
        sub    v3, v1, lr
        sub    v5, v1, lr
        add    v7, v1, lr
        add    v1, v1, lr
        sub    v4, v2, a3
        sub    v6, v2, a3
        add    fp, v2, a3
        ldr    ip, =W26
        ldr    a4, [a1, #(16*2)]
        add    v2, v2, a3

        smulbb lr, ip, a4
        smultb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        add    v3, v3, a3
        sub    v5, v5, a3
        smulbt lr, ip, a4
        smultt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        add    v4, v4, a3
        ldr    a4, [a1, #(16*6)]
        sub    v6, v6, a3

        smultb lr, ip, a4
        smulbb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        sub    v3, v3, a3
        add    v5, v5, a3
        smultt lr, ip, a4
        smulbt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        sub    v4, v4, a3
        add    v6, v6, a3

        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        ldr    ip, =W13
        ldr    a4, [a1, #(16*1)]
        ldr    lr, =W57
        smulbb v1, ip, a4
        smultb v3, ip, a4
        smulbb v5, lr, a4
        smultb v7, lr, a4
        smulbt v2, ip, a4
        smultt v4, ip, a4
        smulbt v6, lr, a4
        smultt fp, lr, a4
        rsb    v4, v4, #0
        ldr    a4, [a1, #(16*3)]
        rsb    v3, v3, #0

        smlatb v1, ip, a4, v1
        smlatb v3, lr, a4, v3
        smulbb a3, ip, a4
        smulbb a2, lr, a4
        sub    v5, v5, a3
        sub    v7, v7, a2
        smlatt v2, ip, a4, v2
        smlatt v4, lr, a4, v4
        smulbt a3, ip, a4
        smulbt a2, lr, a4
        sub    v6, v6, a3
        ldr    a4, [a1, #(16*5)]
        sub    fp, fp, a2

        smlabb v1, lr, a4, v1
        smlabb v3, ip, a4, v3
        smlatb v5, lr, a4, v5
        smlatb v7, ip, a4, v7
        smlabt v2, lr, a4, v2
        smlabt v4, ip, a4, v4
        smlatt v6, lr, a4, v6
        ldr    a3, [a1, #(16*7)]
        smlatt fp, ip, a4, fp

        smlatb v1, lr, a3, v1
        smlabb v3, lr, a3, v3
        smlatb v5, ip, a3, v5
        smulbb a4, ip, a3
        smlatt v2, lr, a3, v2
        sub    v7, v7, a4
        smlabt v4, lr, a3, v4
        smulbt a4, ip, a3
        smlatt v6, ip, a3, v6
        sub    fp, fp, a4
        .endm

function idct_col_armv5te
        str    lr, [sp, #-4]!
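
        /* idct_col computes the even half (input lines 0, 2, 4, 6) of two
           columns into v1-fp, pushes those sums, then reuses v1-fp for the
           odd half (lines 1, 3, 5, 7).  Below, each even pair popped from
           the stack is combined with one odd term and scaled by COL_SHIFT. */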

        idct_col

        ldmfd  sp!, {a3, a4}
        adds   a2, a3, v1
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v2
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1]
        subs   a3, a3, v1
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v2
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*7)]

        subs   a2, a3, v3
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    ip, a4, v4
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*1)]
        adds   a3, a3, v3
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    a4, a4, v4
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*6)]

        adds   a2, a3, v5
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v6
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*2)]
        subs   a3, a3, v5
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v6
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*5)]

        adds   a2, a3, v7
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, fp
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*3)]
        subs   a3, a3, v7
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, fp
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        str    a2, [a1, #(16*4)]

        ldr    pc, [sp], #4
endfunc

.macro clip   dst, src:vararg
        movs   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

.macro aclip  dst, src:vararg
        adds   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

function idct_col_put_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        ldr    lr, [sp, #32]
        add    a2, a3, v1
        clip   a2, a2, asr #20
        add    ip, a4, v2
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        sub    a3, a3, v1
        clip   a3, a3, asr #20
        sub    a4, a4, v2
        clip   a4, a4, asr #20
        ldr    v1, [sp, #28]
        strh   a2, [v1]
        add    a2, v1, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        rsb    v2, lr, lr, lsl #3
        ldmfd  sp!, {a3, a4}
        strh_pre a2, v2, v1

        sub    a2, a3, v3
        clip   a2, a2, asr #20
        sub    ip, a4, v4
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        add    a3, a3, v3
        clip   a2, a3, asr #20
        add    a4, a4, v4
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v5
        clip   a2, a2, asr #20
        add    ip, a4, v6
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        sub    a3, a3, v5
        clip   a2, a3, asr #20
        sub    a4, a4, v6
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v7
        clip   a2, a2, asr #20
        add    ip, a4, fp
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh   a2, [v1, lr]
        sub    a3, a3, v7
        clip   a2, a3, asr #20
        sub    a4, a4, fp
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr

        ldr    pc, [sp], #4
endfunc

function idct_col_add_armv5te
        str    lr, [sp, #-4]!
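
        /* Same column pass as idct_col_armv5te, but each result is added to
           the current destination pixel (fetched with ldrh) and clamped to
           0..255 by the aclip macro before the byte pair is stored back. */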

        idct_col

        ldr    lr, [sp, #36]

        ldmfd  sp!, {a3, a4}
        ldrh   ip, [lr]
        add    a2, a3, v1
        sub    a3, a3, v1
        and    v1, ip, #255
        aclip  a2, v1, a2, asr #20
        add    v1, a4, v2
        mov    v1, v1, asr #20
        aclip  v1, v1, ip, lsr #8
        orr    a2, a2, v1, lsl #8
        ldr    v1, [sp, #32]
        sub    a4, a4, v2
        rsb    v2, v1, v1, lsl #3
        ldrh_pre ip, v2, lr
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        add    a2, lr, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        sub    a2, a3, v3
        add    a3, a3, v3
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        sub    v3, a4, v4
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        add    a4, a4, v4
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v5
        sub    a3, a3, v5
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, v6
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, v6
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v7
        sub    a3, a3, v7
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, fp
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, fp
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldr    pc, [sp], #4
endfunc

function ff_simple_idct_armv5te, export=1
        /* a1 = block of 8x8 int16_t coefficients, transformed in place */
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te

        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_add_armv5te, export=1
        /* a1 = dest, a2 = line_size, a3 = block; IDCT result is added to dest */
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_put_armv5te, export=1
        /* a1 = dest, a2 = line_size, a3 = block; clipped IDCT result overwrites dest */
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc