/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>

    .text
    .align

    .global jpeg_idct_ifast
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr cinfo,
//                  jpeg_component_info * compptr,
//                  short* coef_block,
//                  unsigned char* output_buf,
//                  int output_col)
//
// Syntax: 32-bit ARM, GNU as, preprocessed by cpp.
//
// Two-pass 8x8 inverse DCT using 8-bit fixed-point constants:
//   pass 1 (VLoop*): walks the 8 columns of coef_block, multiplies each
//                    coefficient by its quantization-table entry and writes
//                    32-bit results into the stack workspace;
//   pass 2 (HLoop*): walks the 8 workspace rows, descales by 5 bits, adds
//                    128, clamps to 0..255 and stores 8 bytes per row.
//
// In:    r0      = cinfo   (only read at off_DECOMPRESS_range_limit_base)
//        r1      = compptr (only read at off_COMPINFO_quanttable)
//        r2      = coef_block, 64 signed halfwords
//        r3      = output_buf; used as an array of row pointers: one word
//                  is loaded from it per row and output_col is added to it
//        [sp,#0] = output_col (fifth argument, on the caller stack)
// Out:   none
// Saves: r4-r12, lr are pushed on entry and restored at Exit.
//
// Stack frame (local_SIZE bytes below the saved registers):
//   [sp, #0..#12]   tmp0..tmp3 spill slots
//   [sp, #16]       range table pointer
//   [sp, #20]       output_col
//   [sp, #24]       running output_buf pointer
//   [sp, #28]       unused
//   [sp, #32.. ]    workspace, 8*8 words

#define  local_TMP0123       sp
#define  local_TMP0          [sp, #0]
#define  local_TMP1          [sp, #4]
#define  local_TMP2          [sp, #8]
#define  local_TMP3          [sp, #12]
#define  local_RANGE_TABLE   [sp, #16]
#define  local_OUTPUT_COL    [sp, #20]
#define  local_OUTPUT_BUF    [sp, #24]
#define  local_UNUSED        [sp, #28]
#define  off_WORKSPACE       32
#define  local_WORKSPACE     [sp, #off_WORKSPACE]
#define  local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offsets into the structures passed in r0 and r1.
// NOTE(review): these must match the C structure layout of the libjpeg
// build this file is linked into - confirm whenever those structs change.
#define  off_DECOMPRESS_range_limit_base  324
#define  off_COMPINFO_quanttable          80

#define  DCTSIZE   8
// Byte offset of row x: VY for halfword input, QY for 32-bit tables.
#define  VY(x)   ((x)*DCTSIZE*2)
#define  QY(x)   ((x)*DCTSIZE*4)

// Byte offset of column x: VX for halfword input, QX for 32-bit tables.
#define  VX(x)   ((x)*2)
#define  QX(x)   ((x)*4)

// Multipliers scaled by 256 (products below are shifted right by 8).
// The code materializes these values with mov/add/mla instead of using
// the macros directly.
#define  FIX_1_414213562    #362
#define  FIX_1_082392200    #277
#define  FIX_1_847759065    #473
#define  FIX_2_613125930    #669

#define  RANGE_MASK   1023



jpeg_idct_ifast:
    PLD     (r2, #0)
    // Ten words are pushed (40 bytes), so the fifth argument that the
    // caller left at [sp, #0] is now found at [sp, #4*10].
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    ldr     r4, [sp, #4*10]
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128
    str     r5, local_RANGE_TABLE
    mov     fp, r2                                     // fp = coef_block
    add     ip, sp, #off_WORKSPACE                     // ip = workspace

// ---------------------------------------------------------------------
// Pass 1: one iteration per column.
//   fp  = current input column  (advances 2 bytes per iteration)
//   r10 = current quant column  (advances 4 bytes per iteration)
//   ip  = current output column (advances 4 bytes per iteration)
// ---------------------------------------------------------------------
VLoopTail:
    ldrsh   r0, [fp, #VY(0)]
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]

    // Z stays set only if r1..r7 are all zero: each conditional ORR runs
    // only while the previous test produced zero, and updates the flags.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero

VLoopHead:
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    ldr     r9, [r10, #QY(4)]
    ldr     r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r4, r9, r4
    smlabb  r0, r8, r0, r4
#else
    mul     r4, r9, r4
    mul     r0, r8, r0
    add     r0, r4
#endif
    ldr     r9, [r10, #QY(6)]
    ldr     r8, [r10, #QY(2)]
    sub     r4, r0, r4, lsl #1          // tmp11 = tmp10 - 2*tmp2
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r6, r9, r6
    smlabb  r2, r8, r2, r6
#else
    mul     r6, r9, r6
    mul     r2, r8, r2
    add     r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub     r6, r2, r6, lsl #1          // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3
    mov     r8, #360
    add     r8, r8, #2
    mul     r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // Spill tmp0..tmp3; they are reloaded into r0..r3 after the odd part.
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure to not use r0,r4,r6,r8 soon after stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    rsb     r2, r0, r5, lsl #1          // z10 = 2*tmp6 - z13
    rsb     r6, r4, r1, lsl #1          // z12 = 2*tmp4 - z11

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*(K-1) + x, i.e. x*K, with K-1 encodable as an
    // ARM immediate.
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}    // r0..r3 = tmp0..tmp3

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1

    str     r0, [ip, #QY(0)]
    str     r1, [ip, #QY(1)]
    str     r2, [ip, #QY(2)]
    str     r3, [ip, #QY(3)]
    str     r4, [ip, #QY(4)]
    str     r5, [ip, #QY(5)]
    str     r6, [ip, #QY(6)]
    str     r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add     fp, fp, #2
    add     r10, r10, #4
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)     // end of first workspace row
    cmp     ip, r0
    bne     VLoopTail



// ---------------------------------------------------------------------
// Pass 2: one iteration per workspace row.
//   ip = current workspace row (advanced 32 bytes by the ldmia below)
//   fp = destination address for the 8 output bytes of this row
// ---------------------------------------------------------------------
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
    // NOTE(review): r10 is not read again in the code visible here; the
    // clamping below uses usat or compare/mask sequences instead of a
    // table lookup. Confirm before relying on or removing this load.
    ldr     r10, local_RANGE_TABLE

HLoopTail:
    // output = *output_buf++ + output_col
    ldr     r0, local_OUTPUT_BUF
    ldr     r1, local_OUTPUT_COL
    ldr     r2, [r0], #4
    str     r0, local_OUTPUT_BUF
    add     fp, r2, r1

    PLD     (ip, #32)
    ldmia   ip!, {r0-r7}                // r0..r7 = wsptr[0..7]

    // Same all-zero test as in pass 1, on wsptr[1..7].
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add     r0, r5, r3
    sub     r2, r5, r3
    add     r4, r1, r7
    sub     r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    sub     r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}    // r0..r3 = tmp0..tmp3

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // Implemented here as (value asr 5) + 128 followed by a clamp to
    // 0..255, not as a table lookup. After this sequence register rN
    // holds outptr[N].

    mov     r8, #128
    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r0, r8, r0, asr #5
    add     r7, r8, r7, asr #5
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r1, r8, r1, asr #5
    add     r6, r8, r6, asr #5
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    add     r2, r8, r2, asr #5
    add     r5, r8, r5, asr #5
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1
    add     r3, r8, r3, asr #5
    add     r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
    usat    r1, #8, r1
    usat    r2, #8, r2
    usat    r3, #8, r3
    usat    r4, #8, r4
    usat    r5, #8, r5
    usat    r6, #8, r6
    usat    r7, #8, r7
#else
    // Manual clamp: an unsigned compare above 255 catches both overflow
    // and negative values; mvn of the sign mask yields 0 for negative
    // and all-ones for positive, which the and reduces to 255.
    // r3 and r7 skip the and on purpose: they are merged with lsl #24
    // below, which shifts every bit above the low byte out of the word.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, #255
    cmp     r7, #255
    mvnhi   r7, r7, asr #31
    cmp     r1, #255
    mvnhi   r1, r1, asr #31
    andhi   r1, #255
    cmp     r6, #255
    mvnhi   r6, r6, asr #31
    andhi   r6, #255
    cmp     r2, #255
    mvnhi   r2, r2, asr #31
    andhi   r2, #255
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    andhi   r5, #255
    cmp     r3, #255
    mvnhi   r3, r3, asr #31
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    andhi   r4, #255
#endif

    // r3 r2 r1 r0
    orr     r0, r0, r1, lsl #8
    orr     r0, r0, r2, lsl #16
    orr     r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr     r1, r4, r5, lsl #8
    orr     r1, r1, r6, lsl #16
    orr     r1, r1, r7, lsl #24
    stmia   fp, {r0, r1}                // store the 8 output bytes

    add     r0, sp, #(off_WORKSPACE + 8*8*4)   // end of workspace
    cmp     ip, r0
    bne     HLoopTail

Exit:
    add     sp, sp, #local_SIZE
    ldmia   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx      lr


// Pass 1 shortcut: inputs 1..7 of this column are all zero, so every
// workspace entry of the column is in[0] * quant[0].
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr     r1, [r10, #QY(0)]
    add     fp, fp, #2
    add     r10, r10, #4
    mul     r0, r1, r0
    str     r0, [ip, #QY(0)]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    beq     HLoopStart
    b       VLoopTail

// Pass 2 shortcut: wsptr[1..7] are all zero, so all 8 output bytes of
// the row equal clamp((wsptr[0] asr 5) + 128).
HLoopTailZero:
    mov     r0, r0, asr #5
    add     r0, #128

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
#else
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    // Replicate the byte into all four lanes of r0, then into r1.
    orr     r0, r0, lsl #8
    orr     r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 64*4)
    cmp     ip, r0
    beq     Exit
    b       HLoopTail

    .endfunc