• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@ This file was created from a .asm file
2@  using the ads2gas.pl script.
3.syntax unified
4@
5@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
6@
7@  Use of this source code is governed by a BSD-style license
8@  that can be found in the LICENSE file in the root of the source
9@  tree. An additional intellectual property rights grant can be found
10@  in the file PATENTS.  All contributing project authors may
11@  be found in the AUTHORS file in the root of the source tree.
12@
13
14    .global vpx_idct4x4_16_add_neon
15    .type vpx_idct4x4_16_add_neon, function
16    .arm
17    .eabi_attribute 24, 1 @Tag_ABI_align_needed
18    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
19
20    .text
21    .p2align 2
22
23    .include "vpx_dsp/arm/idct_neon.asm.S"
24
25    .text
26    .p2align 2
27@void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
28@
29@ r0  int16_t *input
30@ r1  uint8_t *dest
31@ r2  int stride
32
33vpx_idct4x4_16_add_neon: @ PROC
    @ 4x4 inverse DCT: transforms the coefficients read through r0, rounds
    @ the result, adds it to the 4x4 block of 8-bit pixels at r1 (rows are
    @ r2 bytes apart) and stores the sum back with unsigned saturation.
    @
    @ Registers written below: r0, r2, r3, r12, q1 and registers within
    @ q8-q15. r1 is advanced by the post-increments and is back on the first
    @ row when the function returns. No stack is used by the code in this
    @ block. LOAD_TRAN_LOW_TO_S16 is not defined in this file (presumably it
    @ comes from the included idct_neon.asm.S), so anything else it touches
    @ is not visible here.
34
35    @ The 2D transform is done with two passes which are actually pretty
36    @ similar. We first transform the rows. This is done by transposing
37    @ the inputs, doing an SIMD column transform (the columns are the
38    @ transposed rows) and then transpose the results (so that it goes back
39    @ in normal/row positions). Then, we transform the columns by doing
40    @ another SIMD column transform.
41    @ So, two passes of a transpose followed by a column transform.
42
43    @ load the inputs into q8-q9, d16-d19 (one row per d register)
44    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
45
46    @ generate scalar constants
    @ (r0 is reused as scratch: the input pointer is not read again below)
47    @ cospi_8_64 = 15137
48    movw            r0, #0x3b21
49    @ cospi_16_64 = 11585
50    movw            r3, #0x2d41
51    @ cospi_24_64 = 6270
52    movw            r12, #0x187e
53
54    @ transpose the input data
55    @ 00 01 02 03   d16
56    @ 10 11 12 13   d17
57    @ 20 21 22 23   d18
58    @ 30 31 32 33   d19
59    vtrn.16         d16, d17
60    vtrn.16         d18, d19
61
62    @ generate constant vectors
    @ (interleaved with the transpose; they do not depend on it)
63    vdup.16         d20, r0         @ replicate cospi_8_64
64    vdup.16         d21, r3         @ replicate cospi_16_64
65
66    @ 00 10 02 12   d16
67    @ 01 11 03 13   d17
68    @ 20 30 22 32   d18
69    @ 21 31 23 33   d19
70    vtrn.32         q8, q9
71    @ 00 10 20 30   d16
72    @ 01 11 21 31   d17
73    @ 02 12 22 32   d18
74    @ 03 13 23 33   d19
75
76    vdup.16         d22, r12        @ replicate cospi_24_64
77
78    @ do the transform on transposed rows
    @ (d16..d19 now hold input[0]..input[3] of the 1-D transform, one
    @ 16-bit lane per row)
79
80    @ stage 1
81    vmull.s16 q15, d17, d22         @ input[1] * cospi_24_64
82    vmull.s16 q1,  d17, d20         @ input[1] * cospi_8_64
83
84    @ (input[0] + input[2]) * cospi_16_64;
85    @ (input[0] - input[2]) * cospi_16_64;
    @ Here each input is multiplied first and the add/sub is done on the
    @ 32-bit products. q8 aliases d16/d17: d17 was already consumed by the
    @ two multiplies above, and d16 is read by the instruction that
    @ overwrites it.
86    vmull.s16 q8,  d16, d21         @ input[0] * cospi_16_64
87    vmull.s16 q14, d18, d21         @ input[2] * cospi_16_64
88    vadd.s32  q13, q8,  q14         @ 32-bit sum of the two products
89    vsub.s32  q14, q8,  q14         @ 32-bit difference of the two products
90
91    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
92    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
93    vmlsl.s16 q15, d19, d20         @ q15 -= input[3] * cospi_8_64
94    vmlal.s16 q1,  d19, d22         @ q1  += input[3] * cospi_24_64
95
96    @ dct_const_round_shift
    @ (rounding right shift by 14, narrowing each lane back to 16 bits)
97    vrshrn.s32 d26, q13, #14        @ step[0]
98    vrshrn.s32 d27, q14, #14        @ step[1]
99    vrshrn.s32 d29, q15, #14        @ step[2]
100    vrshrn.s32 d28, q1,  #14        @ step[3]
101
102    @ stage 2
103    @ output[0] = step[0] + step[3];
104    @ output[1] = step[1] + step[2];
105    @ output[3] = step[0] - step[3];
106    @ output[2] = step[1] - step[2];
    @ q13 = {step[0], step[1]} and q14 = {step[3], step[2]}, so one q-wide
    @ add and one q-wide sub produce all four outputs.
107    vadd.s16 q8,  q13, q14          @ d16 = output[0], d17 = output[1]
108    vsub.s16 q9,  q13, q14          @ d18 = output[3], d19 = output[2]
109    vswp     d18, d19               @ d18 = output[2], d19 = output[3]
110
111    @ transpose the results
112    @ 00 01 02 03   d16
113    @ 10 11 12 13   d17
114    @ 20 21 22 23   d18
115    @ 30 31 32 33   d19
116    vtrn.16         d16, d17
117    vtrn.16         d18, d19
118    @ 00 10 02 12   d16
119    @ 01 11 03 13   d17
120    @ 20 30 22 32   d18
121    @ 21 31 23 33   d19
122    vtrn.32         q8, q9
123    @ 00 10 20 30   d16
124    @ 01 11 21 31   d17
125    @ 02 12 22 32   d18
126    @ 03 13 23 33   d19
127
128    @ do the transform on columns
129
130    @ stage 1
    @ NOTE(review): unlike the first pass, the add/sub here is done on the
    @ 16-bit values before the multiply. vadd.s16/vsub.s16 do not saturate,
    @ so a sum or difference outside the int16 range would wrap - confirm
    @ that the first-pass outputs are bounded tightly enough for this.
131    vadd.s16  d23, d16, d18         @ (input[0] + input[2])
132    vsub.s16  d24, d16, d18         @ (input[0] - input[2])
133
134    vmull.s16 q15, d17, d22         @ input[1] * cospi_24_64
135    vmull.s16 q1,  d17, d20         @ input[1] * cospi_8_64
136
137    @ (input[0] + input[2]) * cospi_16_64;
138    @ (input[0] - input[2]) * cospi_16_64;
139    vmull.s16 q13, d23, d21
140    vmull.s16 q14, d24, d21
141
142    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
143    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
144    vmlsl.s16 q15, d19, d20         @ q15 -= input[3] * cospi_8_64
145    vmlal.s16 q1,  d19, d22         @ q1  += input[3] * cospi_24_64
146
147    @ dct_const_round_shift
148    vrshrn.s32 d26, q13, #14        @ step[0]
149    vrshrn.s32 d27, q14, #14        @ step[1]
150    vrshrn.s32 d29, q15, #14        @ step[2]
151    vrshrn.s32 d28, q1,  #14        @ step[3]
152
153    @ stage 2
154    @ output[0] = step[0] + step[3];
155    @ output[1] = step[1] + step[2];
156    @ output[3] = step[0] - step[3];
157    @ output[2] = step[1] - step[2];
158    vadd.s16 q8,  q13, q14          @ d16 = output[0], d17 = output[1]
159    vsub.s16 q9,  q13, q14          @ d18 = output[3], d19 = output[2]
160
161    @ The results are in two registers, one of them being swapped. This will
162    @ be taken care of by loading the 'dest' value in a swapped fashion and
163    @ also storing them in the same swapped fashion.
164    @ temp_out[0, 1] = d16, d17 = q8
165    @ temp_out[2, 3] = d19, d18 = q9 swapped
166
167    @ ROUND_POWER_OF_TWO(temp_out[j], 4)
168    vrshr.s16 q8, q8, #4
169    vrshr.s16 q9, q9, #4
170
    @ Load four bytes from each of the four dest rows. Rows 2 and 3 go into
    @ d27 in swapped lane order so that they line up with d19/d18 in q9.
171    vld1.32 {d26[0]}, [r1], r2      @ row 0
172    vld1.32 {d26[1]}, [r1], r2      @ row 1
173    vld1.32 {d27[1]}, [r1], r2      @ row 2
174    vld1.32 {d27[0]}, [r1]  @ row 3; no post-increment, r1 stays on row 3
175
176    @ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    @ (vaddw.u8 zero-extends the 8-bit pixels to 16 bits before adding)
177    vaddw.u8 q8, q8, d26
178    vaddw.u8 q9, q9, d27
179
180    @ clip_pixel
    @ (saturating narrow from signed 16-bit to unsigned 8-bit: 0..255)
181    vqmovun.s16 d26, q8
182    vqmovun.s16 d27, q9
183
184    @ do the stores in reverse order with negative post-increment, by changing
185    @ the sign of the stride
186    rsb r2, r2, #0                  @ r2 = -stride
187    vst1.32 {d27[0]}, [r1], r2      @ row 3
188    vst1.32 {d27[1]}, [r1], r2      @ row 2
189    vst1.32 {d26[1]}, [r1], r2      @ row 1
190    vst1.32 {d26[0]}, [r1]  @ row 0; no post-increment
191    bx              lr
192.size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon    @ ENDP  @ |vpx_idct4x4_16_add_neon|
193
194    .section .note.GNU-stack,"",%progbits
195