/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * Fixed-point transform weights, Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5.
 * Each one is used as a 16-bit half of a packed 32-bit operand (see the
 * pair definitions further down).
 */
#define W1  22725   /* i = 1 */
#define W2  21407   /* i = 2 */
#define W3  19266   /* i = 3 */
#define W4  16383   /* i = 4; NOTE(review): the formula gives 16384, the value
                       here is one less - presumably deliberate, confirm */
#define W5  12873   /* i = 5 */
#define W6  8867    /* i = 6 */
#define W7  4520    /* i = 7 */

/* Bit counts used for the rounding bias and the final arithmetic right
   shift of the row pass and of the column pass respectively. */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Weight pairs for the dual 16x16 multiply instructions: the first-named
 * weight sits in bits 0-15, the second in bits 16-31.
 * W42n is the negated W42 pair; -W4 is masked to 16 bits so that its sign
 * bits do not spill into the upper half.  The grouping is spelled out with
 * parentheses so the value does not depend on the relative precedence of
 * & and |.
 */
#define W13  (W1 | (W3 << 16))
#define W26  (W2 | (W6 << 16))
#define W42  (W4 | (W2 << 16))
#define W42n ((-W4 & 0xffff) | (-W2 << 16))
#define W46  (W4 | (W6 << 16))
#define W57  (W5 | (W7 << 16))

/*
Compute partial IDCT of single row.
Inputs:
  shift = bit count; used here only to build the rounding bias
          1 << (shift-1) that seeds A0..A3 (the original note calls it
          the "left-shift amount")
  r0    = source address; row[7,5] is read from r0+12 and row[6,4]
          from r0+4
  r2    = row[2,0]  <= 2 cycles
  r3    = row[3,1]
  ip    = W42       <= 2 cycles
  row[a,b] denotes one 32-bit word holding coefficient b in bits 0-15
  and coefficient a in bits 16-31.
  NOTE(review): the "<= 2 cycles" marks are carried over from the
  original comment; presumably a scheduling hint - confirm.
Output in registers r4--r11:
  r4--r7 = A0..A3 (rounding bias included)
  r8     = B0,  r9 = -B1,  r10 = B2,  r11 = B3
Also overwritten: r1, r2, r3, ip, lr.
The instruction order is kept as found; do not reorder casually.
*/
        .macro idct_row shift
        ldr    lr, =W46               /* lr  = W4 | (W6 << 16) */
        mov    r1, #(1<<(\shift-1))   /* r1  = rounding bias */
        smlad  r4, r2, ip, r1         /* r4  = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  r7, r2, ip, r1         /* r7  = A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, =W13               /* ip  = W1 | (W3 << 16) */
        ldr    r10,=W57               /* r10 = W5 | (W7 << 16) */
        smlad  r5, r2, lr, r1         /* r5  = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  r6, r2, lr, r1         /* r6  = A2 = W4*row[0] - W6*row[2] + bias */

        smuad  r8, r3, ip             /* r8  =  B0 = W1*row[1] + W3*row[3] */
        smusdx r11,r3, r10            /* r11 =  B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [r0, #12]          /* lr  = row[7,5] */
        pkhtb  r2, ip, r10,asr #16    /* r2  = W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16    /* r1  = W1 | (W5 << 16) */
        smusdx r9, r2, r3             /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smlad  r8, lr, r10,r8         /* B0  +=      W5*row[5] + W7*row[7] */
        smusdx r10,r3, r1             /* r10 =  B2 = W5*row[1] - W1*row[3] */

        ldr    r3, =W42n              /* r3  = -W4 | (-W2 << 16) */
        smlad  r10,lr, r2, r10        /* B2  +=  W7*row[5] + W3*row[7] */
        ldr    r2, [r0, #4]           /* r2  =   row[6,4] */
        smlsdx r11,lr, ip, r11        /* B3  +=  W3*row[5] - W1*row[7] */
        ldr    ip, =W46               /* ip  =   W4 | (W6 << 16) */
        smlad  r9, lr, r1, r9         /* -B1 +=  W1*row[5] + W5*row[7] */

        smlad  r5, r2, r3, r5         /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  r6, r2, r3, r6         /* A2 += -W4*row[4] + W2*row[6] */
        smlad  r4, r2, ip, r4         /* A0 +=  W4*row[4] + W6*row[6] */
        smlsd  r7, r2, ip, r7         /* A3 +=  W4*row[4] - W6*row[6] */
        .endm

/*
Compute partial IDCT of half row.
Only row[0..3] contribute; nothing is read from memory.
Inputs:
  shift = bit count; used here only to build the rounding bias
          1 << (shift-1) (the original note calls it the
          "left-shift amount")
  r2    = row[2,0]
  r3    = row[3,1]
  ip    = W42
  row[a,b] denotes one 32-bit word holding coefficient b in bits 0-15
  and coefficient a in bits 16-31.
Output in registers r4--r11:
  r4--r7 = A0..A3 (rounding bias included)
  r8     = B0,  r9 = -B1,  r10 = B2,  r11 = B3
Also overwritten: r1, r2, ip, lr.
*/
        .macro idct_row4 shift
        ldr    lr, =W46               /* lr  = W4 | (W6 << 16) */
        ldr    r10,=W57               /* r10 = W5 | (W7 << 16) */
        mov    r1, #(1<<(\shift-1))   /* r1  = rounding bias */
        smlad  r4, r2, ip, r1         /* r4  = A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  r7, r2, ip, r1         /* r7  = A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, =W13               /* ip  = W1 | (W3 << 16) */
        smlad  r5, r2, lr, r1         /* r5  = A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  r6, r2, lr, r1         /* r6  = A2 = W4*row[0] - W6*row[2] + bias */
        smusdx r11,r3, r10            /* r11 =  B3 = W7*row[1] - W5*row[3] */
        smuad  r8, r3, ip             /* r8  =  B0 = W1*row[1] + W3*row[3] */
        pkhtb  r2, ip, r10,asr #16    /* r2  = W7 | (W3 << 16) */
        pkhbt  r1, ip, r10,lsl #16    /* r1  = W1 | (W5 << 16) */
        smusdx r9, r2, r3             /* r9  = -B1 = W7*row[3] - W3*row[1] */
        smusdx r10,r3, r1             /* r10 =  B2 = W5*row[1] - W1*row[3] */
        .endm

/*
Compute final part of IDCT single row without shift.
Input in registers r4--r11 (r4--r7 = A0..A3, r8 = B0, r9 = -B1,
r10 = B2, r11 = B3, as left by idct_row / idct_row4).
Output in registers ip, r4--r6, lr, r8--r10; listed by output index
0..7 they are ip, r4, r5, r6, r10, r9, r8, lr.
r7 and r11 are read but not written.
*/
        .macro idct_finish
        add    ip, r4, r8             /* ip  = A0 + B0  (output 0) */
        sub    lr, r4, r8             /* lr  = A0 - B0  (output 7) */
        sub    r4, r5, r9             /* r4  = A1 + B1  (output 1); r9 holds -B1 */
        add    r8, r5, r9             /* r8  = A1 - B1  (output 6) */
        add    r5, r6, r10            /* r5  = A2 + B2  (output 2) */
        sub    r9, r6, r10            /* r9  = A2 - B2  (output 5) */
        add    r6, r7, r11            /* r6  = A3 + B3  (output 3) */
        sub    r10,r7, r11            /* r10 = A3 - B3  (output 4) */
        .endm

/*
Compute final part of IDCT single row.
Inputs:
  shift  = right-shift amount (arithmetic shift applied to every sum
           and difference)
  r4--r7 = A0..A3,  r8 = B0,  r9 = -B1,  r10 = B2,  r11 = B3
Input/output in registers r4--r11; listed by output index 0..7 the
results are in r4, r5, r6, r7, r11, r10, r9, r8.
Also overwritten: r2, r3.
*/
        .macro idct_finish_shift shift
        add    r3, r4, r8             /* r3 = A0 + B0 */
        sub    r2, r4, r8             /* r2 = A0 - B0 */
        mov    r4, r3, asr #\shift    /* output 0 */
        mov    r8, r2, asr #\shift    /* output 7 */

        sub    r3, r5, r9             /* r3 = A1 + B1 (r9 holds -B1) */
        add    r2, r5, r9             /* r2 = A1 - B1 */
        mov    r5, r3, asr #\shift    /* output 1 */
        mov    r9, r2, asr #\shift    /* output 6 */

        add    r3, r6, r10            /* r3 = A2 + B2 */
        sub    r2, r6, r10            /* r2 = A2 - B2 */
        mov    r6, r3, asr #\shift    /* output 2 */
        mov    r10,r2, asr #\shift    /* output 5 */

        add    r3, r7, r11            /* r3 = A3 + B3 */
        sub    r2, r7, r11            /* r2 = A3 - B3 */
        mov    r7, r3, asr #\shift    /* output 3 */
        mov    r11,r2, asr #\shift    /* output 4 */
        .endm

/*
Compute final part of IDCT single row, saturating results at 8 bits.
Inputs:
  shift  = right-shift amount, applied before the saturation
  r4--r7 = A0..A3,  r8 = B0,  r9 = -B1,  r10 = B2,  r11 = B3
Input/output in registers r4--r11; each result is saturated to the
unsigned 8-bit range by usat.  Listed by output index 0..7 the results
are in r4, r5, r6, r7, r11, r10, r9, r8.
Also overwritten: r3, ip.
*/
        .macro idct_finish_shift_sat shift
        add    r3, r4, r8             /* r3 = A0 + B0 */
        sub    ip, r4, r8             /* ip = A0 - B0 */
        usat   r4, #8, r3, asr #\shift
        usat   r8, #8, ip, asr #\shift

        sub    r3, r5, r9             /* r3 = A1 + B1 (r9 holds -B1) */
        add    ip, r5, r9             /* ip = A1 - B1 */
        usat   r5, #8, r3, asr #\shift
        usat   r9, #8, ip, asr #\shift

        add    r3, r6, r10            /* r3 = A2 + B2 */
        sub    ip, r6, r10            /* ip = A2 - B2 */
        usat   r6, #8, r3, asr #\shift
        usat   r10,#8, ip, asr #\shift

        add    r3, r7, r11            /* r3 = A3 + B3 */
        sub    ip, r7, r11            /* ip = A3 - B3 */
        usat   r7, #8, r3, asr #\shift
        usat   r11,#8, ip, asr #\shift
        .endm

/*
Compute IDCT of single row, storing as column.
Inputs:
  r0 = source
  r1 = dest
The four source words are taken as row[2,0] at r0, row[6,4] at r0+4,
row[3,1] at r0+8 and row[7,5] at r0+12 (row[a,b]: coefficient b in
bits 0-15, coefficient a in bits 16-31).
Three paths:
  - all of row[1..7] zero: row[0] << 3 is stored to all eight outputs;
  - row[4..7] zero: idct_row4;
  - otherwise: idct_row.
Outputs are stored as halfwords at these byte offsets from r1, by
output index 0..7: 0, 16*2, 16*4, 16*6, 16*1, 16*3, 16*5, 16*7.
r0 is not written and r1 is the same on return as on entry.
r2, r3, ip (and r4--r11 on the non-shortcut paths) are overwritten.
*/
function idct_row_armv6
        push   {lr}

        ldr    lr, [r0, #12]          /* lr = row[7,5] */
        ldr    ip, [r0, #4]           /* ip = row[6,4] */
        ldr    r3, [r0, #8]           /* r3 = row[3,1] */
        ldr    r2, [r0]               /* r2 = row[2,0] */
        orrs   lr, lr, ip             /* lr = row[7,5] | row[6,4]; Z set if row[4..7] all zero */
        itt    eq                     /* makes both compares below conditional in Thumb-2 */
        cmpeq  lr, r3                 /* lr is 0 here: also row[3,1] == 0 ? */
        cmpeq  lr, r2, lsr #16        /* ... and row[2] == 0 ? */
        beq    1f                     /* only row[0] can be non-zero */
        push   {r1}                   /* the row macros overwrite r1 */
        ldr    ip, =W42               /* ip = W4 | (W2 << 16) */
        cmp    lr, #0
        beq    2f                     /* row[4..7] zero: half-row variant */

        idct_row   ROW_SHIFT
        b      3f

2:      idct_row4  ROW_SHIFT

3:      pop    {r1}
        idct_finish_shift ROW_SHIFT

        strh   r4, [r1]               /* output 0 */
        strh   r5, [r1, #(16*2)]      /* output 1 */
        strh   r6, [r1, #(16*4)]      /* output 2 */
        strh   r7, [r1, #(16*6)]      /* output 3 */
        strh   r11,[r1, #(16*1)]      /* output 4 */
        strh   r10,[r1, #(16*3)]      /* output 5 */
        strh   r9, [r1, #(16*5)]      /* output 6 */
        strh   r8, [r1, #(16*7)]      /* output 7 */

        pop    {pc}

        /* Shortcut: the upper half of r2 is zero here, so r2 = row[0]
           as an unsigned halfword; strh keeps the low 16 bits. */
1:      mov    r2, r2, lsl #3
        strh   r2, [r1]
        strh   r2, [r1, #(16*2)]
        strh   r2, [r1, #(16*4)]
        strh   r2, [r1, #(16*6)]
        strh   r2, [r1, #(16*1)]
        strh   r2, [r1, #(16*3)]
        strh   r2, [r1, #(16*5)]
        strh   r2, [r1, #(16*7)]
        pop    {pc}
endfunc

/*
Compute IDCT of single column, read as row.
Inputs:
  r0 = source
  r1 = dest
Always takes the full idct_row path with COL_SHIFT.  The eight results
are stored as halfwords at byte offsets 0, 16, ..., 16*7 from r1, in
output order.
r0 is not written and r1 is the same on return as on entry.
r2--r11 and ip are overwritten.
*/
function idct_col_armv6
        push   {r1, lr}               /* idct_row overwrites r1 */

        ldr    r2, [r0]               /* r2 = row[2,0] */
        ldr    ip, =W42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]           /* r3 = row[3,1] */
        idct_row   COL_SHIFT
        pop    {r1}
        idct_finish_shift COL_SHIFT

        strh   r4, [r1]               /* output 0 */
        strh   r5, [r1, #(16*1)]      /* output 1 */
        strh   r6, [r1, #(16*2)]      /* output 2 */
        strh   r7, [r1, #(16*3)]      /* output 3 */
        strh   r11,[r1, #(16*4)]      /* output 4 */
        strh   r10,[r1, #(16*5)]      /* output 5 */
        strh   r9, [r1, #(16*6)]      /* output 6 */
        strh   r8, [r1, #(16*7)]      /* output 7 */

        pop    {pc}
endfunc

/*
Compute IDCT of single column, read as row, store saturated 8-bit.
Inputs:
  r0 = source
  r1 = dest
  r2 = line size
The eight results are saturated to unsigned 8 bits and written one per
line, in output order.
NOTE(review): strb_post comes from the included asm.S; judging by the
closing "sub r1, r1, r2, lsl #3" it stores one byte at [r1] and then
advances r1 by r2 - confirm against asm.S.
r0 is not written; r1 and r2 are the same on return as on entry.
r3--r11 and ip are overwritten.
*/
function idct_col_put_armv6
        push   {r1, r2, lr}           /* idct_row overwrites r1 and r2 */

        ldr    r2, [r0]               /* r2 = row[2,0] */
        ldr    ip, =W42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]           /* r3 = row[3,1] */
        idct_row   COL_SHIFT
        pop    {r1, r2}
        idct_finish_shift_sat COL_SHIFT

        strb_post r4, r1, r2          /* output 0 */
        strb_post r5, r1, r2          /* output 1 */
        strb_post r6, r1, r2          /* output 2 */
        strb_post r7, r1, r2          /* output 3 */
        strb_post r11,r1, r2          /* output 4 */
        strb_post r10,r1, r2          /* output 5 */
        strb_post r9, r1, r2          /* output 6 */
        strb_post r8, r1, r2          /* output 7 */

        sub    r1, r1, r2, lsl #3     /* back to the first line */

        pop    {pc}
endfunc

/*
Compute IDCT of single column, read as row, add/store saturated 8-bit.
Inputs:
  r0 = source
  r1 = dest
  r2 = line size
Each result is shifted right by COL_SHIFT, added to the byte already
present in dest, saturated to unsigned 8 bits and written back.
In the comments below dst[k] is the byte k lines below the address
passed in r1.
NOTE(review): strb_post comes from the included asm.S; judging by the
closing "sub r1, r1, r2, lsl #3" it stores one byte at [r1] and then
advances r1 by r2 - confirm against asm.S.
Every dst byte is loaded exactly once; loads are issued ahead of their
use, using both the [r1, r2] and the [r1, r2, lsl #2] addressing forms.
r0 is not written; r1 and r2 are the same on return as on entry.
r3--r11 and ip are overwritten.
*/
function idct_col_add_armv6
        push   {r1, r2, lr}           /* idct_row overwrites r1 and r2 */

        ldr    r2, [r0]               /* r2 = row[2,0] */
        ldr    ip, =W42               /* ip = W4 | (W2 << 16) */
        ldr    r3, [r0, #8]           /* r3 = row[3,1] */
        idct_row   COL_SHIFT
        pop    {r1, r2}
        idct_finish                   /* outputs 0..7 in ip, r4, r5, r6, r10, r9, r8, lr */

        ldrb   r3, [r1]               /* r3  = dst[0] */
        ldrb   r7, [r1, r2]           /* r7  = dst[1] */
        add    ip, r3, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    r4, r7, r4, asr #COL_SHIFT
        strb_post ip, r1, r2          /* dst[0] written; r1 now at line 1 */
        ldrb   ip, [r1, r2]           /* ip  = dst[2] */
        usat   r4, #8, r4
        ldrb   r11,[r1, r2, lsl #2]   /* r11 = dst[5] */
        add    r5, ip, r5, asr #COL_SHIFT
        usat   r5, #8, r5
        strb_post r4, r1, r2          /* dst[1] written; r1 now at line 2 */
        ldrb   r3, [r1, r2]           /* r3  = dst[3] */
        ldrb   ip, [r1, r2, lsl #2]   /* ip  = dst[6] */
        strb_post r5, r1, r2          /* dst[2] written; r1 now at line 3 */
        ldrb   r7, [r1, r2]           /* r7  = dst[4] */
        ldrb   r4, [r1, r2, lsl #2]   /* r4  = dst[7] */
        add    r6, r3, r6, asr #COL_SHIFT
        usat   r6, #8, r6
        add    r10,r7, r10,asr #COL_SHIFT
        usat   r10,#8, r10
        add    r9, r11,r9, asr #COL_SHIFT
        usat   r9, #8, r9
        add    r8, ip, r8, asr #COL_SHIFT
        usat   r8, #8, r8
        add    lr, r4, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb_post r6, r1, r2          /* dst[3] */
        strb_post r10,r1, r2          /* dst[4] */
        strb_post r9, r1, r2          /* dst[5] */
        strb_post r8, r1, r2          /* dst[6] */
        strb_post lr, r1, r2          /* dst[7] */

        sub    r1, r1, r2, lsl #3     /* back to the first line */

        pop    {pc}
endfunc

/*
Compute 8 IDCT row transforms.
Inputs:
  func  = IDCT row->col function; called eight times with bl, and
          expected to leave r0 and r1 as it found them
  width = width of columns in bytes (added to r1 between calls)
  r0    = source, r1 = dest for the first call
Source rows are visited at byte offsets 0, 32, 64, 96, 16, 48, 80, 112
from the initial r0, while r1 simply advances by width per call.
On exit r0 is back at its initial value; r1 is left 7*width past its
initial value.
*/
        .macro idct_rows func width
        bl     \func                  /* source offset 0 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 32 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 64 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 96 */
        sub    r0, r0, #(16*5)        /* step back to offset 16 */
        add    r1, r1, #\width
        bl     \func                  /* source offset 16 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 48 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 80 */
        add    r0, r0, #(16*2)
        add    r1, r1, #\width
        bl     \func                  /* source offset 112 */

        sub    r0, r0, #(16*7)        /* r0 back to where it started */
        .endm

/* void ff_simple_idct_armv6(int16_t *data); */
/*
In-place transform: a row pass from data into a 128-byte scratch area on
the stack, then a column pass from the scratch area back into data.
*/
function ff_simple_idct_armv6, export=1
        push   {r4-r11, lr}
        sub    sp, sp, #128           /* scratch area */

        mov    r1, sp                 /* row pass: data -> scratch */
        idct_rows idct_row_armv6, 2
        mov    r1, r0                 /* column pass: scratch -> data */
        mov    r0, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128
        pop    {r4-r11, pc}
endfunc

/* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
/*
Row pass from data into a 128-byte scratch area on the stack, then a
column pass that adds the results to dest with 8-bit saturation.
dest (r0) and line_size (r1) are parked on the stack by the entry push
and reloaded from just above the scratch area before the column pass.
*/
function ff_simple_idct_add_armv6, export=1
        push   {r0, r1, r4-r11, lr}
        sub    sp, sp, #128           /* scratch area */

        mov    r0, r2                 /* row pass: data -> scratch */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                 /* column pass: scratch -> dest */
        ldr    r1, [sp, #128]         /* r1 = dest */
        ldr    r2, [sp, #(128+4)]     /* r2 = line_size */
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)       /* drop scratch area and the saved r0, r1 */
        pop    {r4-r11, pc}
endfunc

/* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
/*
Row pass from data into a 128-byte scratch area on the stack, then a
column pass that stores the results to dest with 8-bit saturation.
dest (r0) and line_size (r1) are parked on the stack by the entry push
and reloaded from just above the scratch area before the column pass.
*/
function ff_simple_idct_put_armv6, export=1
        push   {r0, r1, r4-r11, lr}
        sub    sp, sp, #128           /* scratch area */

        mov    r0, r2                 /* row pass: data -> scratch */
        mov    r1, sp
        idct_rows idct_row_armv6, 2
        mov    r0, sp                 /* column pass: scratch -> dest */
        ldr    r1, [sp, #128]         /* r1 = dest */
        ldr    r2, [sp, #(128+4)]     /* r2 = line_size */
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)       /* drop scratch area and the saved r0, r1 */
        pop    {r4-r11, pc}
endfunc