/*
 * Copyright (C) 2002 Frederic 'dilb' Boulay
 *
 * Author: Frederic Boulay <dilb@handhelds.org>
 *
 * The function defined in this file is derived from the simple_idct function
 * from the libavcodec library part of the FFmpeg project.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/* useful constants for the algorithm */
#define W1 22725
#define W2 21407
#define W3 19266
#define W4 16383
#define W5 12873
#define W6 8867
#define W7 4520
#define MASK_MSHW 0xFFFF0000

#define ROW_SHIFT 11
#define ROW_SHIFT2MSHW (16-11)
#define COL_SHIFT 20
#define ROW_SHIFTED_1 1024   /* 1 << (ROW_SHIFT-1) */
#define COL_SHIFTED_1 524288 /* 1 << (COL_SHIFT-1) */
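
/* For reference (our reading of the simple_idct code this file is derived
 * from, not restated here): W1..W7 are the usual fixed-point cosine table,
 * approximately
 *     Wk = round(sqrt(2) * cos(k*M_PI/16) * (1 << 14))
 * with W4 kept at 16383, one below 2^14.  ROW_SHIFTED_1 and COL_SHIFTED_1
 * are the rounding terms added before the final arithmetic shift of the
 * row pass and the column pass respectively. */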


function ff_simple_idct_arm, export=1
        @@ void ff_simple_idct_arm(int16_t *block)
        @@ save stack for the registers needed (take all of them):
        @@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains the pointer to block,
        @@ so it must not be overwritten unless it is saved!
        @@ R12 is another scratch register, so it need not be saved either
        @@ save all other registers
        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
        @@ at this point, R0=block, other registers are free.
        add r14, r0, #112        @ R14=&block[8*7]; better to start from the last row and decrease the value until row=0, i.e. R14=block.
        @@ add 2 temporary variables in the stack: R0 and R14
        sub sp, sp, #8           @ allow 2 local variables
        str r0, [sp, #0]         @ save block in sp[0]
        @@ stack status
        @@ sp+4  free
        @@ sp+0  R0 (block)


        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free


__row_loop:
        @@ read the row and check if it is null, almost null, or not. According to StrongARM specs it is
        @@ not necessary to optimize the ldr accesses (i.e. to split the 32-bit loads into two 16-bit ones);
        @@ at least it leaves more usable registers :)
        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32-bit pointer)
        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
        ldr r3, [r14, #8]        @ R3=ROWr32[2]
        ldr r4, [r14, #12]       @ R4=ROWr32[3]
        @@ check if the words are null: if all of them are null, proceed with the next row (branch __end_row_loop);
        @@ if ROWr16[0] is the only one not null, proceed with this special case (branch __almost_empty_row);
        @@ else follow the complete algorithm.
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
        orr r5, r4, r3           @ R5=R4 | R3
        orr r5, r5, r2           @ R5=R4 | R3 | R2
        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
        beq __end_row_loop
        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
        beq __almost_empty_row

@@ __b_evaluation:
        @@ at this point, R0=block (temp), R1 (free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3

        @@ MUL16(b0, W1, row[1]);
        @@ MUL16(b1, W3, row[1]);
        @@ MUL16(b2, W5, row[1]);
        @@ MUL16(b3, W7, row[1]);
        @@ MAC16(b0, W3, row[3]);
        @@ MAC16(b1, -W7, row[3]);
        @@ MAC16(b2, -W1, row[3]);
        @@ MAC16(b3, -W5, row[3]);
        ldr r8, =W1              @ R8=W1
        mov r2, r2, asr #16      @ R2=ROWr16[3]
        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to allow saving 1 cycle)
        ldr r9, =W3              @ R9=W3
        ldr r10, =W5             @ R10=W5
        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to allow saving 1 cycle)
        ldr r11, =W7             @ R11=W7
        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to allow saving 1 cycle)
        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to allow saving 1 cycle)
        teq r2, #0               @ if null, avoid muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to allow saving 1 cycle)
        rsbne r2, r2, #0         @ R2=-ROWr16[3]
        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to allow saving 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to allow saving 1 cycle)
        it ne
        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to allow saving 1 cycle)

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
        beq __end_b_evaluation

        @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, row[5]);
        @@ MAC16(b2, W7, row[5]);
        @@ MAC16(b3, W3, row[5]);
        @@ MAC16(b1, -W1, row[5]);
        @@ MAC16(b0, W7, row[7]);
        @@ MAC16(b2, W3, row[7]);
        @@ MAC16(b3, -W1, row[7]);
        @@ MAC16(b1, -W5, row[7]);
        mov r3, r3, asr #16      @ R3=ROWr16[5]
        teq r3, #0               @ if null, avoid muls
        it ne
        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
        mov r4, r4, asr #16      @ R4=ROWr16[7]
        itttt ne
        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
        rsbne r3, r3, #0         @ R3=-ROWr16[5]
        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
        @@ R3 is free now
        teq r4, #0               @ if null, avoid muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
        rsbne r4, r4, #0         @ R4=-ROWr16[7]
        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
        it ne
        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
        @@ R4 is free now
__end_b_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
        @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
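        @@ The teq/it*/mlane groups above are the predicated form of the
        @@ following C shape (a sketch, not the exact libavcodec source):
        @@     if (row[3]) {
        @@         b0 += W3 * row[3];
        @@         b1 -= W7 * row[3];
        @@         b2 -= W1 * row[3];
        @@         b3 -= W5 * row[3];
        @@     }
        @@ Whole multiply-accumulate groups are skipped when the coefficient
        @@ is zero (the common case for quantized blocks), and the rsb negates
        @@ the coefficient once so plain mla can handle the subtractions.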

@@ __a_evaluation:
        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
        @@ a1 = a0 + W6 * row[2];
        @@ a2 = a0 - W6 * row[2];
        @@ a3 = a0 - W2 * row[2];
        @@ a0 = a0 + W2 * row[2];
        ldr r9, =W4              @ R9=W4
        mul r6, r9, r6           @ R6=W4*ROWr16[0]
        ldr r10, =W6             @ R10=W6
        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)

        mul r11, r10, r4         @ R11=W6*ROWr16[2]
        ldr r8, =W2              @ R8=W2
        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
        @@ if (temp != 0) {}
        teq r2, #0
        beq __end_bef_a_evaluation

        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)


        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]


        @@ a0 += W4*row[4]
        @@ a1 -= W4*row[4]
        @@ a2 -= W4*row[4]
        @@ a3 += W4*row[4]
        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
        teq r11, #0              @ if null, avoid muls
        it ne
        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
        @@ R9 is free now
        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
        itttt ne
        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
        @@ W6 alone is no longer useful, save W2*ROWr16[6] in its register instead
        teq r9, #0               @ if null, avoid muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
        @@ a0 += W6*row[6];
        @@ a3 -= W6*row[6];
        @@ a1 -= W2*row[6];
        @@ a2 += W2*row[6];
        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
        itt ne
        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)

__end_a_evaluation:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        @@ put two 16-bit half-words in a 32-bit word
        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
        ldr r10, =MASK_MSHW      @ R10=0xFFFF0000
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
        mvn r11, r10             @ R11=NOT R10=0x0000FFFF
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
        orr r8, r8, r9
        str r8, [r14, #0]

        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
        orr r8, r8, r9
        str r8, [r14, #4]

        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
        orr r8, r8, r9
        str r8, [r14, #8]

        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
        orr r8, r8, r9
        str r8, [r14, #12]

        bal __end_row_loop
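        @@ Equivalent C for one of the packed stores above (little-endian;
        @@ a sketch, not the exact libavcodec source):
        @@     ((uint32_t*)row)[0] = (uint16_t)((a0 + b0) >> ROW_SHIFT)
        @@                         | ((uint32_t)((a1 + b1) >> ROW_SHIFT) << 16);
        @@ The lsl #ROW_SHIFT2MSHW form merges the shift with the move to the
        @@ upper half-word: shifting left by 16-ROW_SHIFT leaves bits [31:16]
        @@ holding the 16-bit result, so only a mask and an orr remain.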

__almost_empty_row:
        @@ the row was empty, except ROWr16[0]; now, management of this special case
        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
        @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
        @@ R8=0xFFFF (temp), R9-R11 free
        mov r8, #0x10000         @ R8=0x10000 (2 steps needed to build 0xFFFF!); this avoids a literal-pool ldr and its load delay.
        sub r8, r8, #1           @ R8 is now ready (0xFFFF).
        @@ since W4 is about 2^14 and ROW_SHIFT is 11, (W4*x + rounding) >> ROW_SHIFT is simply x << 3
        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)=(ROWr16[0]<<3) & 0xFFFF
        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5

__end_row_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[8*n] to block; when block is reached, the loop is finished.
        sub r14, r14, #16
        bne __row_loop



        @@ at this point, R0=block, R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        add r14, r0, #14         @ R14=&block[7]; better to start from the last col and decrease the value until col=0, i.e. R14=block.
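        @@ In the column pass the block is walked one 16-bit column at a time:
        @@ element k of the current column lives at [r14, #16*k], since
        @@ consecutive rows of the 8x8 block are 8 int16 = 16 bytes apart.
        @@ In C terms (a sketch, with x the column index of this iteration):
        @@     col[k] == ((int16_t *)block)[8*k + x]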
__col_loop:

@@ __b_evaluation2:
        @@ at this point, R0=block (temp), R1-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        @@ proceed with b0-b3 first, followed by a0-a3
        @@ MUL16(b0, W1, col[8x1]);
        @@ MUL16(b1, W3, col[8x1]);
        @@ MUL16(b2, W5, col[8x1]);
        @@ MUL16(b3, W7, col[8x1]);
        @@ MAC16(b0, W3, col[8x3]);
        @@ MAC16(b1, -W7, col[8x3]);
        @@ MAC16(b2, -W1, col[8x3]);
        @@ MAC16(b3, -W5, col[8x3]);
        ldr r8, =W1              @ R8=W1
        ldrsh r7, [r14, #16]     @ R7=COLr16[1]
        mul r0, r8, r7           @ R0=W1*COLr16[1]=b0 (COLr16[1] must be the second arg, to allow saving 1 cycle)
        ldr r9, =W3              @ R9=W3
        ldr r10, =W5             @ R10=W5
        mul r1, r9, r7           @ R1=W3*COLr16[1]=b1 (COLr16[1] must be the second arg, to allow saving 1 cycle)
        ldr r11, =W7             @ R11=W7
        mul r5, r10, r7          @ R5=W5*COLr16[1]=b2 (COLr16[1] must be the second arg, to allow saving 1 cycle)
        ldrsh r2, [r14, #48]     @ R2=COLr16[3]
        mul r7, r11, r7          @ R7=W7*COLr16[1]=b3 (COLr16[1] must be the second arg, to allow saving 1 cycle)
        teq r2, #0               @ if 0, then avoid muls
        itttt ne
        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[3]=b0 (COLr16[3] must be the second arg, to allow saving 1 cycle)
        rsbne r2, r2, #0         @ R2=-COLr16[3]
        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[3]=b1 (COLr16[3] must be the second arg, to allow saving 1 cycle)
        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[3]=b2 (COLr16[3] must be the second arg, to allow saving 1 cycle)
        it ne
        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[3]=b3 (COLr16[3] must be the second arg, to allow saving 1 cycle)

        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
        @@ R12=__const_ptr_, R14=&block[n]
        @@ MAC16(b0, W5, col[5x8]);
        @@ MAC16(b2, W7, col[5x8]);
        @@ MAC16(b3, W3, col[5x8]);
        @@ MAC16(b1, -W1, col[5x8]);
        @@ MAC16(b0, W7, col[7x8]);
        @@ MAC16(b2, W3, col[7x8]);
        @@ MAC16(b3, -W1, col[7x8]);
        @@ MAC16(b1, -W5, col[7x8]);
        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
        teq r3, #0               @ if 0, then avoid muls
        itttt ne
        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
        rsbne r3, r3, #0         @ R3=-COLr16[5x8]
        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
        it ne
        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
        @@ R3 is free now
        teq r4, #0               @ if 0, then avoid muls
        itttt ne
        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
        rsbne r4, r4, #0         @ R4=-COLr16[7x8]
        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
        it ne
        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
        @@ R4 is free now
@@ __end_b_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
        @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]

@@ __a_evaluation2:
        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
        @@ a1 = a0 + W6 * col[8x2];
        @@ a2 = a0 - W6 * col[8x2];
        @@ a3 = a0 - W2 * col[8x2];
        @@ a0 = a0 + W2 * col[8x2];
        ldrsh r6, [r14, #0]      @ R6=COLr16[0]
        ldr r9, =W4              @ R9=W4
        mul r6, r9, r6           @ R6=W4*COLr16[0]
        ldr r10, =W6             @ R10=W6
        ldrsh r4, [r14, #32]     @ R4=COLr16[2] (a3 not defined yet)
        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0] + 1<<(COL_SHIFT-1) (a0)
        mul r11, r10, r4         @ R11=W6*COLr16[2]
        ldr r8, =W2              @ R8=W2
        add r2, r6, r11          @ R2=a0+W6*COLr16[2] (a1)
        sub r3, r6, r11          @ R3=a0-W6*COLr16[2] (a2)
        mul r11, r8, r4          @ R11=W2*COLr16[2]
        sub r4, r6, r11          @ R4=a0-W2*COLr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*COLr16[2] (a0)

        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ a0 += W4*col[8x4]
        @@ a1 -= W4*col[8x4]
        @@ a2 -= W4*col[8x4]
        @@ a3 += W4*col[8x4]
        ldrsh r11, [r14, #64]    @ R11=COLr16[4]
        teq r11, #0              @ if null, avoid muls
        itttt ne
        mulne r11, r9, r11       @ R11=W4*COLr16[4]
        @@ R9 is free now
        addne r6, r6, r11        @ R6+=W4*COLr16[4] (a0)
        subne r2, r2, r11        @ R2-=W4*COLr16[4] (a1)
        subne r3, r3, r11        @ R3-=W4*COLr16[4] (a2)
        ldrsh r9, [r14, #96]     @ R9=COLr16[6]
        it ne
        addne r4, r4, r11        @ R4+=W4*COLr16[4] (a3)
        @@ W6 alone is no longer useful, save W2*COLr16[6] in its register instead
        teq r9, #0               @ if null, avoid muls
        itttt ne
        mulne r11, r10, r9       @ R11=W6*COLr16[6]
        addne r6, r6, r11        @ R6+=W6*COLr16[6] (a0)
        mulne r10, r8, r9        @ R10=W2*COLr16[6]
        @@ a0 += W6*col[8x6];
        @@ a3 -= W6*col[8x6];
        @@ a1 -= W2*col[8x6];
        @@ a2 += W2*col[8x6];
        subne r4, r4, r11        @ R4-=W6*COLr16[6] (a3)
        itt ne
        subne r2, r2, r10        @ R2-=W2*COLr16[6] (a1)
        addne r3, r3, r10        @ R3+=W2*COLr16[6] (a2)
@@ __end_a_evaluation2:
        @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
        @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
        @@ R12=__const_ptr_, R14=&block[n]
        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
        @@@@@ no optimization here @@@@@
        add r8, r6, r0           @ R8=a0+b0
        add r9, r2, r1           @ R9=a1+b1
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #0]
        strh r9, [r14, #16]
        add r8, r3, r5           @ R8=a2+b2
        add r9, r4, r7           @ R9=a3+b3
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #32]
        strh r9, [r14, #48]
        sub r8, r4, r7           @ R8=a3-b3
        sub r9, r3, r5           @ R9=a2-b2
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #64]
        strh r9, [r14, #80]
        sub r8, r2, r1           @ R8=a1-b1
        sub r9, r6, r0           @ R9=a0-b0
        mov r8, r8, asr #COL_SHIFT
        mov r9, r9, asr #COL_SHIFT
        strh r8, [r14, #96]
        strh r9, [r14, #112]
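        @@ Unlike the row pass, these eight results cannot be pair-packed into
        @@ 32-bit stores: consecutive column outputs are 16 bytes apart in
        @@ memory, hence the plain asr + strh per element.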

@@ __end_col_loop:
        @@ at this point, R0-R11 (free)
        @@ R12=__const_ptr_, R14=&block[n]
        ldr r0, [sp, #0]         @ R0=block
        teq r0, r14              @ compare the current &block[n] to block; when block is reached, the loop is finished.
        sub r14, r14, #2
        bne __col_loop




@@ __end_simple_idct_arm:
        @@ restore registers to previous status!
        add sp, sp, #8           @@ the local variables!
        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.



@@ kind of sub-function, kept out of line so as not to burden the common case.
__end_bef_a_evaluation:
        @@ reached when ROWr32[2] | ROWr32[3] == 0, i.e. row[4..7] are all null:
        @@ the W4*row[4] and W2/W6*row[6] terms vanish, so only the row[2]
        @@ contributions to a1/a3/a0 remain before rejoining __end_a_evaluation.
        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
        mul r11, r8, r4          @ R11=W2*ROWr16[2]
        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
        bal __end_a_evaluation
endfunc
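
@@ C-side view (a sketch, consistent with the prototype comment at the top of
@@ the function): the transform runs in place on the 8x8 block of int16
@@ coefficients, all rows first, then all columns:
@@     void ff_simple_idct_arm(int16_t *block);
@@     ff_simple_idct_arm(block); /* block: 64 int16 values, row-major */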