/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>

// Syntax: GNU as, ARM (A32) instruction set, pre-UAL mnemonics
// (e.g. "orreqs"); the file is run through the C preprocessor.

    .text
    .align
    .arm

    // Export the entry point and mark it as a function symbol so that the
    // symbol table carries its type (.func alone only emits debug info).
    .global jpeg_idct_ifast
    .type   jpeg_idct_ifast, %function
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11, ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr       cinfo,        -> r0
//                 jpeg_component_info *   compptr,      -> r1
//                 short*                  coef_block,   -> r2
//                 unsigned char**         output_buf,   -> r3
//                 int                     output_col)   -> on the stack
//
// output_buf is consumed one word per output row, and output_col is added
// to each loaded word to form the store address, i.e. it is treated as an
// array of row pointers.

// Stack frame layout, relative to sp once local_SIZE bytes are reserved.
// local_TMP0..3 hold the even-part results while the odd part is computed.
#define  local_TMP0123       sp
#define  local_TMP0          [sp, #0]
#define  local_TMP1          [sp, #4]
#define  local_TMP2          [sp, #8]
#define  local_TMP3          [sp, #12]
#define  local_RANGE_TABLE   [sp, #16]
#define  local_OUTPUT_COL    [sp, #20]
#define  local_OUTPUT_BUF    [sp, #24]
#define  local_UNUSED        [sp, #28]
// The workspace is an 8x8 array of 32-bit words placed after the scalars.
#define  off_WORKSPACE       32
#define  local_WORKSPACE     [sp, #off_WORKSPACE]
#define  local_SIZE          (off_WORKSPACE + 8*8*4)

// Byte offsets into the caller's structures (cinfo and compptr).
// NOTE(review): these are hard-coded and must match the struct layout of
// the libjpeg build this file is linked against -- confirm when porting.
#define  off_DECOMPRESS_range_limit_base  324
#define  off_COMPINFO_quanttable          80

#define  DCTSIZE   8
// Byte offset of row x in the 16-bit coefficient block / the 32-bit tables.
#define  VY(x)   ((x)*DCTSIZE*2)
#define  QY(x)   ((x)*DCTSIZE*4)

// Byte offset of column x in the 16-bit coefficient block / the 32-bit tables.
#define  VX(x)   ((x)*2)
#define  QX(x)   ((x)*4)

// Multiplier constants; products are scaled back with "asr #8" in the code.
// The code below builds these values inline (e.g. 360 + 2) rather than
// referencing the macros.
#define  FIX_1_414213562    #362
#define  FIX_1_082392200    #277
#define  FIX_1_847759065    #473
#define  FIX_2_613125930    #669

#define  RANGE_MASK   1023



// Entry point. Saves registers, reserves the stack frame and sets up the
// pointers used by the first (column) pass:
//   r10 = quantization table, fp = coef_block, ip = workspace cursor.
jpeg_idct_ifast:
    PLD     (r2, #0)                                    // prefetch hint on coef_block (PLD is a macro, presumably from cpu-features.h)
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // 10 registers = 40 bytes pushed
    ldr     r4, [sp, #4*10]                             // r4 = output_col (first stack argument, just above the 10 saved words)
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]          // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    // Word read from cinfo at a fixed offset, biased by 128 and saved.
    // It is reloaded into r10 before the row pass, but no visible code
    // reads it after that: clamping is done arithmetically.
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128
    str     r5, local_RANGE_TABLE
    mov     fp, r2                                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE                      // ip = &workspace[0]

// Pass 1: process one column of the coefficient block per iteration and
// store the eight results down one column of the workspace.
//   fp  = current column in coef_block   (advances 2 bytes per iteration)
//   r10 = current column in quanttable   (advances 4 bytes per iteration)
//   ip  = current column in workspace    (advances 4 bytes per iteration)
VLoopTail:
    ldrsh    r0, [fp, #VY(0)]
    ldrsh    r1, [fp, #VY(1)]
    ldrsh    r2, [fp, #VY(2)]
    ldrsh    r3, [fp, #VY(3)]
    ldrsh    r4, [fp, #VY(4)]
    ldrsh    r5, [fp, #VY(5)]
    ldrsh    r6, [fp, #VY(6)]
    ldrsh    r7, [fp, #VY(7)]

    // Rows 1..7 all zero?  Each orreqs only executes while the previous
    // test was still "equal", so Z survives only if every one is zero.
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      VLoopHeadZero

VLoopHead:
    // even part
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)
    //
    // Sums are formed by the multiply-accumulate; each difference is then
    // recovered as (sum - 2*second operand).

    ldr      r9, [r10, #QY(4)]
    ldr      r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r4, r9, r4
    smlabb   r0, r8, r0, r4
#else
    mul      r4, r9, r4
    mul      r0, r8, r0
    add      r0, r4
#endif
    ldr      r9, [r10, #QY(6)]
    ldr      r8, [r10, #QY(2)]
    sub      r4, r0, r4, lsl #1
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r6, r9, r6
    smlabb   r2, r8, r2, r6
#else
    mul      r6, r9, r6
    mul      r2, r8, r2
    add      r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 360 + 2 (360 fits an ARM immediate, 362 does not)
    sub      r6, r2, r6, lsl #1
    mov      r8, #360
    add      r8, r8, #2
    mul      r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // Park tmp0..tmp3 on the stack; the odd part needs the registers.
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure to not use r0,r4,r6,r8 soon after stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    rsb  r2, r0, r5, lsl #1         // z10 = 2*tmp6 - z13
    rsb  r6, r4, r1, lsl #1         // z12 = 2*tmp4 - z11

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*(K-1) + x = x*K; the ">> 8" is applied below.
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    // Bring the even-part results back: r0..r3 = tmp0..tmp3.
    ldmia local_TMP0123, {r0, r1, r2, r3}

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add   r0, r0, r7
    sub   r7, r0, r7, lsl #1
    add   r1, r1, r6
    sub   r6, r1, r6, lsl #1
    add   r2, r2, r5
    sub   r5, r2, r5, lsl #1
    sub   r3, r3, r4
    add   r4, r3, r4, lsl #1

    str   r0, [ip, #QY(0)]
    str   r1, [ip, #QY(1)]
    str   r2, [ip, #QY(2)]
    str   r3, [ip, #QY(3)]
    str   r4, [ip, #QY(4)]
    str   r5, [ip, #QY(5)]
    str   r6, [ip, #QY(6)]
    str   r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add  fp, fp, #2
    add  r10, r10, #4
    add  ip, ip, #4
    // Done once ip has stepped across the eight words of workspace row 0.
    add  r0, sp, #(off_WORKSPACE + 4*8)
    cmp  ip, r0
    bne  VLoopTail



// Pass 2: process one workspace row per iteration and write eight clamped
// output bytes to (*output_buf++ + output_col).
//   ip = workspace cursor (advanced 32 bytes per row by the ldmia below)
//   fp = destination address for the current row
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
    ldr     r10, local_RANGE_TABLE      // not read again in the code visible here

HLoopTail:
    // output = *output_buf++ + output_col
    ldr      r0, local_OUTPUT_BUF
    ldr      r1, local_OUTPUT_COL
    ldr      r2, [r0], #4
    str      r0, local_OUTPUT_BUF
    add      fp, r2, r1

    PLD      (ip, #32)
    ldmia    ip!, {r0-r7}               // r0..r7 = wsptr[0..7]

    // wsptr[1..7] all zero?  Same conditional-OR chain as in pass 1.
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // Park tmp0..tmp3 on the stack; the odd part needs the registers.
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add  r0, r5, r3
    sub  r2, r5, r3
    add  r4, r1, r7
    sub  r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*(K-1) + x = x*K; the ">> 8" is applied below.
    add  r8, r2, r6
    mov  r9, #472
    mla  r8, r9, r8, r8
    mov  r9, #276
    mla  r0, r6, r9, r6
    mov  r9, #668
    mla  r2, r9, r2, r2
    sub  r0, r0, r8
    sub  r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    // Bring the even-part results back: r0..r3 = tmp0..tmp3.
    ldmia local_TMP0123, {r0, r1, r2, r3}

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // Instead of a table lookup, each value is computed as
    // (x >> 5) + 128 and then clamped to 0..255 below.

    mov    r8, #128
    add    r0, r0, r7
    sub    r7, r0, r7, lsl #1
    add    r0, r8, r0, asr #5
    add    r7, r8, r7, asr #5
    add    r1, r1, r6
    sub    r6, r1, r6, lsl #1
    add    r1, r8, r1, asr #5
    add    r6, r8, r6, asr #5
    add    r2, r2, r5
    sub    r5, r2, r5, lsl #1
    add    r2, r8, r2, asr #5
    add    r5, r8, r5, asr #5
    sub    r3, r3, r4
    add    r4, r3, r4, lsl #1
    add    r3, r8, r3, asr #5
    add    r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat   r0, #8, r0
    usat   r1, #8, r1
    usat   r2, #8, r2
    usat   r3, #8, r3
    usat   r4, #8, r4
    usat   r5, #8, r5
    usat   r6, #8, r6
    usat   r7, #8, r7
#else
    // Manual clamp: "hi" is an unsigned compare, so it catches both x > 255
    // and x < 0.  mvn of the sign mask yields 0 for negative x and all-ones
    // otherwise; the and then trims all-ones to 255.
    // r3 and r7 skip the and on purpose: they are shifted left by 24 when
    // packed below, which discards everything above their low byte.
    cmp    r0, #255
    mvnhi  r0, r0, asr #31
    andhi  r0, #255
    cmp    r7, #255
    mvnhi  r7, r7, asr #31
    cmp    r1, #255
    mvnhi  r1, r1, asr #31
    andhi  r1, #255
    cmp    r6, #255
    mvnhi  r6, r6, asr #31
    andhi  r6, #255
    cmp    r2, #255
    mvnhi  r2, r2, asr #31
    andhi  r2, #255
    cmp    r5, #255
    mvnhi  r5, r5, asr #31
    andhi  r5, #255
    cmp    r3, #255
    mvnhi  r3, r3, asr #31
    cmp    r4, #255
    mvnhi  r4, r4, asr #31
    andhi  r4, #255
#endif

    // Pack eight bytes into two words, outptr[0] in the lowest byte of r0.
    // NOTE(review): this places outptr[0] at the lowest address only on a
    // little-endian target, and stmia expects fp to be word-aligned --
    // confirm both against the callers and the build configuration.
    // r3 r2 r1 r0
    orr    r0, r0, r1, lsl #8
    orr    r0, r0, r2, lsl #16
    orr    r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr    r1, r4, r5, lsl #8
    orr    r1, r1, r6, lsl #16
    orr    r1, r1, r7, lsl #24
    stmia  fp, {r0, r1}

    // Done once ip has passed the last of the 8*8 workspace words.
    add    r0, sp, #(off_WORKSPACE + 8*8*4)
    cmp    ip, r0
    bne    HLoopTail

// Common epilogue: release the local frame, restore the ten registers
// pushed at entry and return through lr.
Exit:
    add    sp, sp, #local_SIZE
    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx     lr


// Pass-1 shortcut, taken when rows 1..7 of the current column are zero:
// the dequantized DC term (r0 * quant[0]) is written to all eight rows of
// the workspace column, then the column pointers advance exactly as in
// the general path.
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr      r1, [r10, #QY(0)]
    add      fp, fp, #2
    add      r10, r10, #4
    mul      r0, r1, r0
    str      r0, [ip, #QY(0)]
    str      r0, [ip, #QY(1)]
    str      r0, [ip, #QY(2)]
    str      r0, [ip, #QY(3)]
    str      r0, [ip, #QY(4)]
    str      r0, [ip, #QY(5)]
    str      r0, [ip, #QY(6)]
    str      r0, [ip, #QY(7)]
    add      ip, ip, #4
    // After the eighth column fall into pass 2, otherwise take the next column.
    add      r0, sp, #(off_WORKSPACE + 4*8)
    cmp      ip, r0
    beq      HLoopStart
    b        VLoopTail

// Pass-2 shortcut, taken when wsptr[1..7] of the current row are zero:
// all eight output bytes equal clamp((wsptr[0] >> 5) + 128).
HLoopTailZero:
    mov      r0, r0, asr #5
    add      r0, #128

#if __ARM_ARCH__ >= 6
    usat     r0, #8, r0
#else
    // Manual clamp to 0..255: negative -> 0, above 255 -> 255.
    cmp      r0, #255
    mvnhi    r0, r0, asr #31
    andhi    r0, r0, #255
#endif

    // Replicate the byte across both output words and store eight bytes.
    orr      r0, r0, lsl #8
    orr      r0, r0, lsl #16
    mov      r1, r0
    stmia    fp, {r0, r1}

    // Leave after the last workspace row, otherwise process the next row.
    add      r0, sp, #(off_WORKSPACE + 64*4)
    cmp      ip, r0
    beq      Exit
    b        HLoopTail

    // Record the size of the function in the symbol table.
    .size   jpeg_idct_ifast, . - jpeg_idct_ifast
    .endfunc
