;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12    EXPORT  |vp8_short_fdct4x4_neon|
13    EXPORT  |vp8_short_fdct8x4_neon|
14    ARM
15    REQUIRE8
16    PRESERVE8
17
18
19    AREA ||.text||, CODE, READONLY, ALIGN=2
20
;-----------------------------------------------------------------------
; vp8_short_fdct4x4_neon
;
; 4x4 forward transform computed as two multiply-accumulate passes
; against the 4x4 coefficient table at dct_matrix.
; Input has a pitch, output is contiguous.
;
; In:    r0 = short *input
;        r1 = short *output  (16 values, written with a single store)
;        r2 = int pitch      (added to r0 after each row load, i.e. it
;                             is applied as a byte offset)
; Clobb: r0, r12, q0-q3, q8-q15
; Keeps: r1 and r2 are never written (vp8_short_fdct8x4_neon relies on
;        this between its two invocations)
;
; Register choice: d8-d15 (q4-q7) are callee-saved under the AAPCS and
; this routine has no stack frame to spill them to, so only d0-d7 and
; d16-d31 are used.
;-----------------------------------------------------------------------
|vp8_short_fdct4x4_neon| PROC
    ldr             r12, _dct_matrix_           ; r12 = address of dct_matrix
    vld1.16         d0, [r0], r2                ; d0 = input row 0
    vld1.16         d1, [r0], r2                ; d1 = input row 1
    vld1.16         d2, [r0], r2                ; d2 = input row 2
    vld1.16         d3, [r0]                    ; d3 = input row 3
    vld1.16         {q2, q3}, [r12]             ; d4..d7 = table rows 0..3

; First stage: for each input row i,
;   acc_i[j] = sum over k of in[i][k] * table[k][j]
; with acc_0..acc_3 held in q11..q14 as 32-bit sums.
    vmull.s16       q11, d4, d0[0]              ;i=0
    vmull.s16       q12, d4, d1[0]              ;i=1
    vmull.s16       q13, d4, d2[0]              ;i=2
    vmull.s16       q14, d4, d3[0]              ;i=3

    vmlal.s16       q11, d5, d0[1]
    vmlal.s16       q12, d5, d1[1]
    vmlal.s16       q13, d5, d2[1]
    vmlal.s16       q14, d5, d3[1]

    vmlal.s16       q11, d6, d0[2]
    vmlal.s16       q12, d6, d1[2]
    vmlal.s16       q13, d6, d2[2]
    vmlal.s16       q14, d6, d3[2]

    vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
    vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
    vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
    vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3

    ; Round, shift right by 14 and narrow back to 16 bits.
    ; t_0..t_3 end up in d22, d24, d26, d28.
    vrshrn.i32      d22, q11, #14
    vrshrn.i32      d24, q12, #14
    vrshrn.i32      d26, q13, #14
    vrshrn.i32      d28, q14, #14

; Second stage: for each output row i,
;   out_i[j] = sum over k of t_k[j] * table[k][i]
; Accumulators are q8, q9, q10, q15 (caller-saved and not overlapping
; d22/d24/d26/d28 or the table in d4-d7).
    vmull.s16       q8,  d22, d4[0]             ;i=0
    vmull.s16       q9,  d22, d4[1]             ;i=1
    vmull.s16       q10, d22, d4[2]             ;i=2
    vmull.s16       q15, d22, d4[3]             ;i=3

    vmlal.s16       q8,  d24, d5[0]
    vmlal.s16       q9,  d24, d5[1]
    vmlal.s16       q10, d24, d5[2]
    vmlal.s16       q15, d24, d5[3]

    vmlal.s16       q8,  d26, d6[0]
    vmlal.s16       q9,  d26, d6[1]
    vmlal.s16       q10, d26, d6[2]
    vmlal.s16       q15, d26, d6[3]

    vmlal.s16       q8,  d28, d7[0]             ;sumtemp for i=0
    vmlal.s16       q9,  d28, d7[1]             ;sumtemp for i=1
    vmlal.s16       q10, d28, d7[2]             ;sumtemp for i=2
    vmlal.s16       q15, d28, d7[3]             ;sumtemp for i=3

    ; Round, shift right by 16 and narrow to 16 bits in one step
    ; (same low 16 bits as a rounding 32-bit shift followed by a narrow).
    vrshrn.i32      d0, q8,  #16
    vrshrn.i32      d1, q9,  #16
    vrshrn.i32      d2, q10, #16
    vrshrn.i32      d3, q15, #16

    vst1.16         {q0, q1}, [r1]              ; out_0..out_3, contiguous

    bx              lr

    ENDP
96
;-----------------------------------------------------------------------
; vp8_short_fdct8x4_neon
;
; Runs vp8_short_fdct4x4_neon twice: first on the block at 'input',
; then on the block that starts 8 bytes further along each row. The
; second result is written 32 bytes after the first.
;
; In:    r0 = short *input
;        r1 = short *output
;        r2 = int pitch
;-----------------------------------------------------------------------
|vp8_short_fdct8x4_neon| PROC
    ; Save the input pointer and the link register around the first
    ; call: the 4x4 routine advances r0 and bl overwrites lr. The
    ; output pointer (r1) and pitch (r2) are not written by the 4x4
    ; routine, so they need no saving. Pushing two words keeps sp
    ; 8-byte aligned.
    stmdb           sp!, {r0, lr}

    bl              vp8_short_fdct4x4_neon

    ldmia           sp!, {r0, lr}

    ; Move to the next block of data.
    add             r0, r0, #8                  ; 4 shorts further along the row
    add             r1, r1, #32                 ; past the 16 shorts just written

    ; Tail call: lr again holds our caller's return address, so the
    ; bx lr at the end of the 4x4 fdct returns directly to it.
    b               vp8_short_fdct4x4_neon

    ENDP
123
124;-----------------
125
; Literal word holding the address of the coefficient table; the 4x4
; routine fetches it with "ldr r12, _dct_matrix_".
_dct_matrix_
    DCD     dct_matrix

; 4x4 table of signed 16-bit coefficients, one row per line below.
; Every DCD word packs two coefficients, the first one in the low
; halfword, so with little-endian storage memory holds these rows:
;
;     23170,  30274,  23170,  12540
;     23170,  12540, -23170, -30274
;     23170, -12540, -23170,  30274
;     23170, -30274,  23170, -12540
;
; Halfword encodings:
;     23170 = 0x5A82     -23170 = 0xA57E
;     30274 = 0x7642     -30274 = 0x89BE
;     12540 = 0x30FC     -12540 = 0xCF04
dct_matrix
    DCD     0x76425A82, 0x30FC5A82              ; row 0
    DCD     0x30FC5A82, 0x89BEA57E              ; row 1
    DCD     0xCF045A82, 0x7642A57E              ; row 2
    DCD     0x89BE5A82, 0xCF045A82              ; row 3
143
144    END
145