@ This file was created from a .asm file
@  using the ads2gas.pl script.
    .syntax unified                 @ unified ARM/Thumb assembler syntax
@
@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@

    @ Export the routine and mark the symbol as a function.
    .global vpx_idct4x4_16_add_neon
    .type vpx_idct4x4_16_add_neon, function
    .arm                            @ assemble as ARM (A32) instructions
    .eabi_attribute 24, 1 @Tag_ABI_align_needed
    .eabi_attribute 25, 1 @Tag_ABI_align_preserved

.text
.p2align 2                          @ 2^2 = 4-byte alignment

    @ Shared helper definitions. The LOAD_TRAN_LOW_TO_S16 macro used by the
    @ routine in this file is not defined here, so it is presumably provided
    @ by this include -- confirm against idct_neon.asm.S.
    .include  "vpx_dsp/arm/idct_neon.asm.S"

    @ Re-select .text and re-align after the include.
.text
.p2align 2 @ name this block of code
@-----------------------------------------------------------------------
@ void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
@
@ 4x4 inverse DCT of the coefficient block at 'input'. The result is
@ rounded (>> 4), added to the 4x4 block of 8-bit pixels at 'dest' and
@ written back to 'dest' with unsigned saturation.
@
@ In:
@   r0  int16_t *input   coefficients; read only through the
@                        LOAD_TRAN_LOW_TO_S16 macro (defined outside this
@                        file, so its exact memory access is not visible
@                        here)
@   r1  uint8_t *dest    top-left pixel of the 4x4 destination block;
@                        4 bytes are read and written per row
@   r2  int stride       byte offset from one destination row to the next
@ Out:
@   none (dest is updated in place)
@ Clobbers (visible in this file):
@   r0, r2 (negated), r3, r12, d2-d3 (q1), d16-d24, d26-d31.
@   r1 is advanced by 3 * stride during the loads and stepped back to its
@   entry value during the stores.
@   Anything LOAD_TRAN_LOW_TO_S16 touches beyond its listed operands is
@   not visible here -- check the macro definition.
@ No stack access and no calls in the visible code; returns with bx lr.
@-----------------------------------------------------------------------

_vpx_idct4x4_16_add_neon:
vpx_idct4x4_16_add_neon: @ PROC

    @ The 2D transform is done with two passes which are actually pretty
    @ similar. We first transform the rows. This is done by transposing
    @ the inputs, doing an SIMD column transform (the columns are the
    @ transposed rows) and then transpose the results (so that it goes back
    @ in normal/row positions). Then, we transform the columns by doing
    @ another SIMD column transform.
    @ So, two passes of a transpose followed by a column transform.

    @ load the inputs into q8-q9, d16-d19
    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

    @ generate scalar constants (r0 is free: the input has been loaded)
    @ cospi_8_64 = 15137
    movw            r0, #0x3b21
    @ cospi_16_64 = 11585
    movw            r3, #0x2d41
    @ cospi_24_64 = 6270
    movw            r12, #0x187e

    @ transpose the input data
    @ 00 01 02 03   d16
    @ 10 11 12 13   d17
    @ 20 21 22 23   d18
    @ 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    @ generate constant vectors
    vdup.16         d20, r0         @ replicate cospi_8_64
    vdup.16         d21, r3         @ replicate cospi_16_64

    @ 00 10 02 12   d16
    @ 01 11 03 13   d17
    @ 20 30 22 32   d18
    @ 21 31 23 33   d19
    vtrn.32         q8, q9
    @ 00 10 20 30   d16
    @ 01 11 21 31   d17
    @ 02 12 22 32   d18
    @ 03 13 23 33   d19

    vdup.16         d22, r12        @ replicate cospi_24_64

    @ ------------------------------------------------------------------
    @ Pass 1: do the transform on transposed rows
    @   d16..d19 = input[0]..input[3], one 16-bit lane per row
    @ ------------------------------------------------------------------

    @ stage 1
    vmull.s16 q15, d17, d22         @ input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         @ input[1] * cospi_8_64

    @ (input[0] + input[2]) * cospi_16_64;
    @ (input[0] - input[2]) * cospi_16_64;
    @ Here each input is widened by the multiply first and the sum and
    @ difference are formed on the 32-bit products.
    vmull.s16 q8,  d16, d21         @ input[0] * cospi_16_64
    vmull.s16 q14, d18, d21         @ input[2] * cospi_16_64
    vadd.s32  q13, q8,  q14
    vsub.s32  q14, q8,  q14

    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    @ dct_const_round_shift: rounding shift right by 14, narrow to 16 bits
    @ Note the destination order: q13 = {step[0], step[1]} but
    @ q14 = {step[3], step[2]}, so that stage 2 is one add and one sub.
    vrshrn.s32 d26, q13, #14        @ step[0]
    vrshrn.s32 d27, q14, #14        @ step[1]
    vrshrn.s32 d29, q15, #14        @ step[2]
    vrshrn.s32 d28, q1,  #14        @ step[3]

    @ stage 2
    @ output[0] = step[0] + step[3];
    @ output[1] = step[1] + step[2];
    @ output[3] = step[0] - step[3];
    @ output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14          @ d16 = output[0], d17 = output[1]
    vsub.s16 q9,  q13, q14          @ d18 = output[3], d19 = output[2]
    vswp     d18, d19               @ d18 = output[2], d19 = output[3]

    @ transpose the results
    @ 00 01 02 03   d16
    @ 10 11 12 13   d17
    @ 20 21 22 23   d18
    @ 30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    @ 00 10 02 12   d16
    @ 01 11 03 13   d17
    @ 20 30 22 32   d18
    @ 21 31 23 33   d19
    vtrn.32         q8, q9
    @ 00 10 20 30   d16
    @ 01 11 21 31   d17
    @ 02 12 22 32   d18
    @ 03 13 23 33   d19

    @ ------------------------------------------------------------------
    @ Pass 2: do the transform on columns
    @ ------------------------------------------------------------------

    @ stage 1
    @ Unlike pass 1, the sum and difference are formed in 16 bits before
    @ the widening multiply. vadd.s16/vsub.s16 do not saturate.
    @ NOTE(review): this relies on the pass-1 results leaving enough
    @ headroom in 16 bits -- confirm against the expected input range.
    vadd.s16  d23, d16, d18         @ (input[0] + input[2])
    vsub.s16  d24, d16, d18         @ (input[0] - input[2])

    vmull.s16 q15, d17, d22         @ input[1] * cospi_24_64
    vmull.s16 q1,  d17, d20         @ input[1] * cospi_8_64

    @ (input[0] + input[2]) * cospi_16_64;
    @ (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1,  d19, d22

    @ dct_const_round_shift (same register layout as in pass 1)
    vrshrn.s32 d26, q13, #14        @ step[0]
    vrshrn.s32 d27, q14, #14        @ step[1]
    vrshrn.s32 d29, q15, #14        @ step[2]
    vrshrn.s32 d28, q1,  #14        @ step[3]

    @ stage 2
    @ output[0] = step[0] + step[3];
    @ output[1] = step[1] + step[2];
    @ output[3] = step[0] - step[3];
    @ output[2] = step[1] - step[2];
    vadd.s16 q8,  q13, q14
    vsub.s16 q9,  q13, q14

    @ The results are in two registers, one of them being swapped. This will
    @ be taken care of by loading the 'dest' value in a swapped fashion and
    @ also storing them in the same swapped fashion.
    @ temp_out[0, 1] = d16, d17 = q8
    @ temp_out[2, 3] = d19, d18 = q9 swapped

    @ ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

    @ Load four 4-byte destination rows. Rows 2 and 3 go into d27 in
    @ swapped lane order to match q9. r1 ends at dest + 3 * stride.
    vld1.32 {d26[0]}, [r1], r2      @ row 0
    vld1.32 {d26[1]}, [r1], r2      @ row 1
    vld1.32 {d27[1]}, [r1], r2      @ row 2
    vld1.32 {d27[0]}, [r1]  @ no post-increment (row 3)

    @ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    @ clip_pixel: saturate the signed 16-bit sums to unsigned 8 bits
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9

    @ do the stores in reverse order with negative post-increment, by changing
    @ the sign of the stride
    rsb r2, r2, #0                  @ r2 = -stride
    vst1.32 {d27[0]}, [r1], r2      @ row 3
    vst1.32 {d27[1]}, [r1], r2      @ row 2
    vst1.32 {d26[1]}, [r1], r2      @ row 1
    vst1.32 {d26[0]}, [r1]  @ no post-increment (row 0)
    bx              lr
    .size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon    @ ENDP  @ |vpx_idct4x4_16_add_neon|

    @ Empty .note.GNU-stack section: tells the GNU linker that this object
    @ does not need an executable stack.
    .section	.note.GNU-stack,"",%progbits
196