• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright (C) 2002 Frederic 'dilb' Boulay
3 *
4 * Author: Frederic Boulay <dilb@handhelds.org>
5 *
6 * The function defined in this file is derived from the simple_idct function
7 * from the libavcodec library part of the FFmpeg project.
8 *
9 * This file is part of FFmpeg.
10 *
11 * FFmpeg is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * FFmpeg is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with FFmpeg; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 */
25
26#include "libavutil/arm/asm.S"
27
28/* useful constants for the algorithm */
29#define W1  22725
30#define W2  21407
31#define W3  19266
32#define W4  16383
33#define W5  12873
34#define W6  8867
35#define W7  4520
36#define MASK_MSHW 0xFFFF0000
37
38#define ROW_SHIFT 11
39#define ROW_SHIFT2MSHW (16-11)
40#define COL_SHIFT 20
41#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
42#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
43
44
45function ff_simple_idct_arm, export=1
46        @@ void simple_idct_arm(int16_t *block)
47        @@ save stack for reg needed (take all of them),
48        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
49        @@ so it must not be overwritten, if it is not saved!!
50        @@ R12 is another scratch register, so it should not be saved too
51        @@ save all registers
52        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
53        @@ at this point, R0=block, other registers are free.
54        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
55        @@ add 2 temporary variables in the stack: R0 and R14
56        sub sp, sp, #8          @ allow 2 local variables
57        str r0, [sp, #0]        @ save block in sp[0]
58        @@ stack status
59        @@ sp+4   free
60        @@ sp+0   R0  (block)
61
62
63        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
64
65
66__row_loop:
67        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :)
68        ldr r1, [r14, #0]        @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
69        ldr r2, [r14, #4]        @ R2=(int32)(R12)[1]=ROWr32[1]
70        ldr r3, [r14, #8]        @ R3=ROWr32[2]
71        ldr r4, [r14, #12]       @ R4=ROWr32[3]
72        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
73        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
74        @@ else follow the complete algorithm.
75        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
76        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
77        orr r5, r4, r3           @ R5=R4 | R3
78        orr r5, r5, r2           @ R5=R4 | R3 | R2
79        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
80        beq __end_row_loop
81        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
82        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
83        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
84        beq __almost_empty_row
85
86@@ __b_evaluation:
87        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
88        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
89        @@     R12=__const_ptr_, R14=&block[n]
90        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
91
92        @@ MUL16(b0, W1, row[1]);
93        @@ MUL16(b1, W3, row[1]);
94        @@ MUL16(b2, W5, row[1]);
95        @@ MUL16(b3, W7, row[1]);
96        @@ MAC16(b0, W3, row[3]);
97        @@ MAC16(b1, -W7, row[3]);
98        @@ MAC16(b2, -W1, row[3]);
99        @@ MAC16(b3, -W5, row[3]);
100        ldr r8, =W1              @ R8=W1
101        mov r2, r2, asr #16      @ R2=ROWr16[3]
102        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
103        ldr r9, =W3              @ R9=W3
104        ldr r10, =W5             @ R10=W5
105        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
106        ldr r11, =W7             @ R11=W7
107        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
108        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
109        teq r2, #0               @ if null avoid muls
110        itttt ne
111        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
112        rsbne r2, r2, #0         @ R2=-ROWr16[3]
113        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
114        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
115        it    ne
116        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
117
118        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
119        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
120        @@     R12=__const_ptr_, R14=&block[n]
121        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
122        @@ if (temp != 0) {}
123        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
124        beq __end_b_evaluation
125
126        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
127        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
128        @@     R12=__const_ptr_, R14=&block[n]
129        @@ MAC16(b0, W5, row[5]);
130        @@ MAC16(b2, W7, row[5]);
131        @@ MAC16(b3, W3, row[5]);
132        @@ MAC16(b1, -W1, row[5]);
133        @@ MAC16(b0, W7, row[7]);
134        @@ MAC16(b2, W3, row[7]);
135        @@ MAC16(b3, -W1, row[7]);
136        @@ MAC16(b1, -W5, row[7]);
137        mov r3, r3, asr #16      @ R3=ROWr16[5]
138        teq r3, #0               @ if null avoid muls
139        it    ne
140        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
141        mov r4, r4, asr #16      @ R4=ROWr16[7]
142        itttt ne
143        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
144        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
145        rsbne r3, r3, #0         @ R3=-ROWr16[5]
146        mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5]=b1
147        @@ R3 is free now
148        teq r4, #0               @ if null avoid muls
149        itttt ne
150        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
151        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
152        rsbne r4, r4, #0         @ R4=-ROWr16[7]
153        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
154        it    ne
155        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
156        @@ R4 is free now
157__end_b_evaluation:
158        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
159        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
160        @@     R12=__const_ptr_, R14=&block[n]
161
162@@ __a_evaluation:
163        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
164        @@ a1 = a0 + W6 * row[2];
165        @@ a2 = a0 - W6 * row[2];
166        @@ a3 = a0 - W2 * row[2];
167        @@ a0 = a0 + W2 * row[2];
168        ldr r9, =W4              @ R9=W4
169        mul r6, r9, r6           @ R6=W4*ROWr16[0]
170        ldr r10, =W6             @ R10=W6
171        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
172        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
173
174        mul r11, r10, r4         @ R11=W6*ROWr16[2]
175        ldr r8, =W2              @ R8=W2
176        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
177        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
178        @@ if (temp != 0) {}
179        teq r2, #0
180        beq __end_bef_a_evaluation
181
182        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
183        mul r11, r8, r4          @ R11=W2*ROWr16[2]
184        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
185        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
186
187
188        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
189        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
190        @@     R12=__const_ptr_, R14=&block[n]
191
192
193        @@ a0 += W4*row[4]
194        @@ a1 -= W4*row[4]
195        @@ a2 -= W4*row[4]
196        @@ a3 += W4*row[4]
197        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
198        teq r11, #0              @ if null avoid muls
199        it    ne
200        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
201        @@ R9 is free now
202        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
203        itttt ne
204        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
205        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
206        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
207        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
208        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
209        teq r9, #0               @ if null avoid muls
210        itttt ne
211        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
212        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
213        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
214        @@ a0 += W6*row[6];
215        @@ a3 -= W6*row[6];
216        @@ a1 -= W2*row[6];
217        @@ a2 += W2*row[6];
218        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
219        itt   ne
220        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
221        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
222
223__end_a_evaluation:
224        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
225        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
226        @@     R12=__const_ptr_, R14=&block[n]
227        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
228        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
229        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
230        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
231        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
232        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
233        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
234        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
235        add r8, r6, r0           @ R8=a0+b0
236        add r9, r2, r1           @ R9=a1+b1
237        @@ put two 16-bit half-words in a 32-bit word
238        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
239        ldr r10, =MASK_MSHW      @ R10=0xFFFF0000
240        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
241        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
242        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
243        orr r8, r8, r9
244        str r8, [r14, #0]
245
246        add r8, r3, r5           @ R8=a2+b2
247        add r9, r4, r7           @ R9=a3+b3
248        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
249        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
250        orr r8, r8, r9
251        str r8, [r14, #4]
252
253        sub r8, r4, r7           @ R8=a3-b3
254        sub r9, r3, r5           @ R9=a2-b2
255        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
256        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
257        orr r8, r8, r9
258        str r8, [r14, #8]
259
260        sub r8, r2, r1           @ R8=a1-b1
261        sub r9, r6, r0           @ R9=a0-b0
262        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
263        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
264        orr r8, r8, r9
265        str r8, [r14, #12]
266
267        bal __end_row_loop
268
269__almost_empty_row:
270        @@ the row was empty, except ROWr16[0], now, management of this special case
271        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
272        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
273        @@                R8=0xFFFF (temp), R9-R11 free
274        mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
275        sub r8, r8, #1           @ R8 is now ready.
276        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
277        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
278        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
279        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
280        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
281        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
282
283__end_row_loop:
284        @@ at this point, R0-R11 (free)
285        @@     R12=__const_ptr_, R14=&block[n]
286        ldr r0, [sp, #0]         @ R0=block
287        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
288        sub r14, r14, #16
289        bne __row_loop
290
291
292
293        @@ at this point, R0=block, R1-R11 (free)
294        @@     R12=__const_ptr_, R14=&block[n]
295        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
296__col_loop:
297
298@@ __b_evaluation2:
299        @@ at this point, R0=block (temp),  R1-R11 (free)
300        @@     R12=__const_ptr_, R14=&block[n]
301        @@ proceed with b0-b3 first, followed by a0-a3
302        @@ MUL16(b0, W1, col[8x1]);
303        @@ MUL16(b1, W3, col[8x1]);
304        @@ MUL16(b2, W5, col[8x1]);
305        @@ MUL16(b3, W7, col[8x1]);
306        @@ MAC16(b0, W3, col[8x3]);
307        @@ MAC16(b1, -W7, col[8x3]);
308        @@ MAC16(b2, -W1, col[8x3]);
309        @@ MAC16(b3, -W5, col[8x3]);
310        ldr r8, =W1              @ R8=W1
311        ldrsh r7, [r14, #16]
312        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
313        ldr r9, =W3              @ R9=W3
314        ldr r10, =W5             @ R10=W5
315        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
316        ldr r11, =W7             @ R11=W7
317        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
318        ldrsh r2, [r14, #48]
319        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
320        teq r2, #0               @ if 0, then avoid muls
321        itttt ne
322        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
323        rsbne r2, r2, #0         @ R2=-ROWr16[3]
324        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
325        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
326        it    ne
327        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
328
329        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
330        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
331        @@     R12=__const_ptr_, R14=&block[n]
332        @@ MAC16(b0, W5, col[5x8]);
333        @@ MAC16(b2, W7, col[5x8]);
334        @@ MAC16(b3, W3, col[5x8]);
335        @@ MAC16(b1, -W1, col[5x8]);
336        @@ MAC16(b0, W7, col[7x8]);
337        @@ MAC16(b2, W3, col[7x8]);
338        @@ MAC16(b3, -W1, col[7x8]);
339        @@ MAC16(b1, -W5, col[7x8]);
340        ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
341        teq r3, #0               @ if 0 then avoid muls
342        itttt ne
343        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
344        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
345        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
346        rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
347        ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
348        it    ne
349        mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5x8]=b1
350        @@ R3 is free now
351        teq r4, #0               @ if 0 then avoid muls
352        itttt ne
353        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
354        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
355        rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
356        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
357        it    ne
358        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
359        @@ R4 is free now
360@@ __end_b_evaluation2:
361        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
362        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
363        @@     R12=__const_ptr_, R14=&block[n]
364
365@@ __a_evaluation2:
366        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
367        @@ a1 = a0 + W6 * row[2];
368        @@ a2 = a0 - W6 * row[2];
369        @@ a3 = a0 - W2 * row[2];
370        @@ a0 = a0 + W2 * row[2];
371        ldrsh r6, [r14, #0]
372        ldr r9, =W4              @ R9=W4
373        mul r6, r9, r6           @ R6=W4*ROWr16[0]
374        ldr r10, =W6             @ R10=W6
375        ldrsh r4, [r14, #32]     @ R4=ROWr16[2] (a3 not defined yet)
376        add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
377        mul r11, r10, r4         @ R11=W6*ROWr16[2]
378        ldr r8, =W2              @ R8=W2
379        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
380        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
381        mul r11, r8, r4          @ R11=W2*ROWr16[2]
382        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
383        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
384
385        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
386        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
387        @@     R12=__const_ptr_, R14=&block[n]
388        @@ a0 += W4*row[4]
389        @@ a1 -= W4*row[4]
390        @@ a2 -= W4*row[4]
391        @@ a3 += W4*row[4]
392        ldrsh r11, [r14, #64]    @ R11=ROWr16[4]
393        teq r11, #0              @ if null avoid muls
394        itttt ne
395        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
396        @@ R9 is free now
397        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
398        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
399        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
400        ldrsh r9, [r14, #96]     @ R9=ROWr16[6]
401        it    ne
402        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
403        @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
404        teq r9, #0               @ if null avoid muls
405        itttt ne
406        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
407        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
408        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
409        @@ a0 += W6*row[6];
410        @@ a3 -= W6*row[6];
411        @@ a1 -= W2*row[6];
412        @@ a2 += W2*row[6];
413        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
414        itt   ne
415        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
416        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
417@@ __end_a_evaluation2:
418        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
419        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
420        @@     R12=__const_ptr_, R14=&block[n]
421        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
422        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
423        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
424        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
425        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
426        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
427        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
428        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
429        @@@@@ no optimization here @@@@@
430        add r8, r6, r0           @ R8=a0+b0
431        add r9, r2, r1           @ R9=a1+b1
432        mov r8, r8, asr #COL_SHIFT
433        mov r9, r9, asr #COL_SHIFT
434        strh r8, [r14, #0]
435        strh r9, [r14, #16]
436        add r8, r3, r5           @ R8=a2+b2
437        add r9, r4, r7           @ R9=a3+b3
438        mov r8, r8, asr #COL_SHIFT
439        mov r9, r9, asr #COL_SHIFT
440        strh r8, [r14, #32]
441        strh r9, [r14, #48]
442        sub r8, r4, r7           @ R8=a3-b3
443        sub r9, r3, r5           @ R9=a2-b2
444        mov r8, r8, asr #COL_SHIFT
445        mov r9, r9, asr #COL_SHIFT
446        strh r8, [r14, #64]
447        strh r9, [r14, #80]
448        sub r8, r2, r1           @ R8=a1-b1
449        sub r9, r6, r0           @ R9=a0-b0
450        mov r8, r8, asr #COL_SHIFT
451        mov r9, r9, asr #COL_SHIFT
452        strh r8, [r14, #96]
453        strh r9, [r14, #112]
454
455@@ __end_col_loop:
456        @@ at this point, R0-R11 (free)
457        @@     R12=__const_ptr_, R14=&block[n]
458        ldr r0, [sp, #0]         @ R0=block
459        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
460        sub r14, r14, #2
461        bne __col_loop
462
463
464
465
466@@ __end_simple_idct_arm:
467        @@ restore registers to previous status!
468        add sp, sp, #8 @@ the local variables!
469        ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
470
471
472
473@@ kind of sub-function, here not to overload the common case.
474__end_bef_a_evaluation:
475        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
476        mul r11, r8, r4          @ R11=W2*ROWr16[2]
477        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
478        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
479        bal __end_a_evaluation
480endfunc
481