/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/inv_txfm.h"

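// Round and narrow 64-bit multiply results back to 32 bits. vrshrn_n_s64
// performs the rounding shift by DCT_CONST_BITS, matching the scalar
// dct_const_round_shift() used by the C reference code.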
static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) {
  int32x2x2_t t32;

  t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS);
  t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS);
  return vcombine_s32(t32.val[0], t32.val[1]);
}

static INLINE void dct_const_round_shift_high_4_dual(
    const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) {
  *d0 = dct_const_round_shift_high_4(in[0]);
  *d1 = dct_const_round_shift_high_4(in[1]);
}

static INLINE int32x4x2_t
dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) {
  int32x4x2_t out;
  out.val[0] = dct_const_round_shift_high_4(in[0]);
  out.val[1] = dct_const_round_shift_high_4(in[1]);
  return out;
}

static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in,
                                                    int32x4x2_t *const d0,
                                                    int32x4x2_t *const d1) {
  *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0);
  *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2);
}

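// Stage 2 butterfly with the cospi_2/cospi_30 pair:
//   d0 = dct_const_round_shift(s0 * cospi_30 - s1 * cospi_2)
//   d1 = dct_const_round_shift(s1 * cospi_30 + s0 * cospi_2)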
static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
                                          const int32x4x2_t s1,
                                          const int32x4_t cospi_2_30_10_22,
                                          int32x4x2_t *const d0,
                                          int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 1);
  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_2_30_10_22), 0);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

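// Stage 3 butterfly with the cospi_4/cospi_28 pair:
//   d0 = dct_const_round_shift(s0 * cospi_28 - s1 * cospi_4)
//   d1 = dct_const_round_shift(s1 * cospi_28 + s0 * cospi_4)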
static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
                                          const int32x4x2_t s1,
                                          const int32x4_t cospi_4_12_20N_28,
                                          int32x4x2_t *const d0,
                                          int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 1);
  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 0);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

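// Stage 2 butterfly with the cospi_6/cospi_26 pair:
//   d0 = dct_const_round_shift(s0 * cospi_6 - s1 * cospi_26)
//   d1 = dct_const_round_shift(s1 * cospi_6 + s0 * cospi_26)
// cospi_26 is stored negated, so the mlal/mlsl roles are swapped.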
static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
                                          const int32x4x2_t s1,
                                          const int32x4_t cospi_6_26N_14_18N,
                                          int32x4x2_t *const d0,
                                          int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 0);
  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_6_26N_14_18N), 1);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

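// Stage 2 butterfly with the cospi_10/cospi_22 pair:
//   d0 = dct_const_round_shift(s0 * cospi_22 - s1 * cospi_10)
//   d1 = dct_const_round_shift(s1 * cospi_22 + s0 * cospi_10)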
static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
                                           const int32x4x2_t s1,
                                           const int32x4_t cospi_2_30_10_22,
                                           int32x4x2_t *const d0,
                                           int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 1);
  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_2_30_10_22), 0);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

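// Stage 3 butterfly with the cospi_12/cospi_20 pair:
//   d0 = dct_const_round_shift(s0 * cospi_12 - s1 * cospi_20)
//   d1 = dct_const_round_shift(s1 * cospi_12 + s0 * cospi_20)
// cospi_20 is stored negated, so the mlal/mlsl roles are swapped.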
static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
                                           const int32x4x2_t s1,
                                           const int32x4_t cospi_4_12_20N_28,
                                           int32x4x2_t *const d0,
                                           int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_4_12_20N_28), 1);
  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_4_12_20N_28), 0);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

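// Stage 2 butterfly with the cospi_14/cospi_18 pair (cospi_18 stored
// negated):
//   d0 = dct_const_round_shift(s0 * cospi_14 - s1 * cospi_18)
//   d1 = dct_const_round_shift(s1 * cospi_14 + s0 * cospi_18)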
static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
                                           const int32x4x2_t s1,
                                           const int32x4_t cospi_6_26N_14_18N,
                                           int32x4x2_t *const d0,
                                           int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 0);
  t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_6_26N_14_18N), 1);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

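// cospi_8/cospi_24 butterfly kernels, leaving the rounding shift to the
// caller:
//   t[lo] = s0 * cospi_24 - s1 * cospi_8
//   t[hi] = s1 * cospi_24 + s0 * cospi_8
// The _q variants process two 4-lane halves, the _d variants one; the _neg
// variants below negate the second result pair before rounding.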
static INLINE void highbd_idct_cospi_8_24_q_kernel(
    const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
    int64x2x2_t *const t) {
  t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 1);
  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
                               vget_low_s32(cospi_0_8_16_24), 1);
}

static INLINE void highbd_idct_cospi_8_24_d_kernel(
    const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
    int64x2x2_t *const t) {
  t[0].val[0] =
      vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
  t[0].val[1] =
      vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
  t[1].val[0] =
      vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
  t[1].val[1] =
      vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
  t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
                               vget_low_s32(cospi_0_8_16_24), 1);
  t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
                               vget_low_s32(cospi_0_8_16_24), 1);
}

static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
                                            const int32x4x2_t s1,
                                            const int32x4_t cospi_0_8_16_24,
                                            int32x4x2_t *const d0,
                                            int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
                                            const int32x4_t s1,
                                            const int32x4_t cospi_0_8_16_24,
                                            int32x4_t *const d0,
                                            int32x4_t *const d1) {
  int64x2x2_t t[2];

  highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
  dct_const_round_shift_high_4_dual(t, d0, d1);
}

static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
                                                const int32x4x2_t s1,
                                                const int32x4_t cospi_0_8_16_24,
                                                int32x4x2_t *const d0,
                                                int32x4x2_t *const d1) {
  int64x2x2_t t[4];

  highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
  t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);
  t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
  t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
  t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
                                                const int32x4_t s1,
                                                const int32x4_t cospi_0_8_16_24,
                                                int32x4_t *const d0,
                                                int32x4_t *const d1) {
  int64x2x2_t t[2];

  highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
  t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);
  t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
  dct_const_round_shift_high_4_dual(t, d0, d1);
}

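// Even-part butterfly sharing the cospi_16 factor:
//   d0 = dct_const_round_shift((s1 - s0) * cospi_16)
//   d1 = dct_const_round_shift((s1 + s0) * cospi_16)
// The common s1 * cospi_16 products are computed once and reused by both
// outputs.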
static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
                                             const int32x4x2_t s1,
                                             const int32x4_t cospi_0_8_16_24,
                                             int32x4x2_t *const d0,
                                             int32x4x2_t *const d1) {
  int64x2x2_t t[6];

  t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
                               vget_high_s32(cospi_0_8_16_24), 0);
  dct_const_round_shift_high_4x2x2(t, d0, d1);
}

static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
                                             const int32x4_t s1,
                                             const int32x4_t cospi_0_8_16_24,
                                             int32x4_t *const d0,
                                             int32x4_t *const d1) {
  int64x2x2_t t[3];

  t[2].val[0] =
      vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
  t[2].val[1] =
      vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
  t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
                               vget_high_s32(cospi_0_8_16_24), 0);
  t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
                               vget_high_s32(cospi_0_8_16_24), 0);
  dct_const_round_shift_high_4_dual(t, d0, d1);
}

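// Final stage: out[i] = step2[i] + step2[15 - i] for i = 0..7 and
// out[i] = step2[15 - i] - step2[i] for i = 8..15.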
static INLINE void highbd_idct16x16_add_stage7_dual(
    const int32x4x2_t *const step2, int32x4x2_t *const out) {
  out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
  out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
  out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
  out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
  out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
  out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
  out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
  out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
  out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
  out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
  out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
  out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
  out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
  out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
  out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
  out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
  out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
  out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
  out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
  out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
  out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
  out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
  out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
  out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
  out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
  out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
  out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
  out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
  out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
  out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
  out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
  out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
}

static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
                                               int32x4_t *const out) {
  out[0] = vaddq_s32(step2[0], step2[15]);
  out[1] = vaddq_s32(step2[1], step2[14]);
  out[2] = vaddq_s32(step2[2], step2[13]);
  out[3] = vaddq_s32(step2[3], step2[12]);
  out[4] = vaddq_s32(step2[4], step2[11]);
  out[5] = vaddq_s32(step2[5], step2[10]);
  out[6] = vaddq_s32(step2[6], step2[9]);
  out[7] = vaddq_s32(step2[7], step2[8]);
  out[8] = vsubq_s32(step2[7], step2[8]);
  out[9] = vsubq_s32(step2[6], step2[9]);
  out[10] = vsubq_s32(step2[5], step2[10]);
  out[11] = vsubq_s32(step2[4], step2[11]);
  out[12] = vsubq_s32(step2[3], step2[12]);
  out[13] = vsubq_s32(step2[2], step2[13]);
  out[14] = vsubq_s32(step2[1], step2[14]);
  out[15] = vsubq_s32(step2[0], step2[15]);
}

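// One pass of the 16-point idct over eight columns of 32-bit coefficients.
// With a non-NULL output the result is stored for the second pass;
// otherwise it is added to dest and clipped to the bd-bit range.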
void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output,
                                         uint16_t *dest, const int stride,
                                         const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  int32x4x2_t in[16], step1[16], step2[16], out[16];

  // Load input (16x8)
  in[0].val[0] = vld1q_s32(input);
  in[0].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[8].val[0] = vld1q_s32(input);
  in[8].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[1].val[0] = vld1q_s32(input);
  in[1].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[9].val[0] = vld1q_s32(input);
  in[9].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[2].val[0] = vld1q_s32(input);
  in[2].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[10].val[0] = vld1q_s32(input);
  in[10].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[3].val[0] = vld1q_s32(input);
  in[3].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[11].val[0] = vld1q_s32(input);
  in[11].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[4].val[0] = vld1q_s32(input);
  in[4].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[12].val[0] = vld1q_s32(input);
  in[12].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[5].val[0] = vld1q_s32(input);
  in[5].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[13].val[0] = vld1q_s32(input);
  in[13].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[6].val[0] = vld1q_s32(input);
  in[6].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[14].val[0] = vld1q_s32(input);
  in[14].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[7].val[0] = vld1q_s32(input);
  in[7].val[1] = vld1q_s32(input + 4);
  input += 8;
  in[15].val[0] = vld1q_s32(input);
  in[15].val[1] = vld1q_s32(input + 4);

  // Transpose
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[1] = in[16 / 2];
  step1[2] = in[8 / 2];
  step1[3] = in[24 / 2];
  step1[4] = in[4 / 2];
  step1[5] = in[20 / 2];
  step1[6] = in[12 / 2];
  step1[7] = in[28 / 2];
  step1[8] = in[2 / 2];
  step1[9] = in[18 / 2];
  step1[10] = in[10 / 2];
  step1[11] = in[26 / 2];
  step1[12] = in[6 / 2];
  step1[13] = in[22 / 2];
  step1[14] = in[14 / 2];
  step1[15] = in[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
                         &step2[15]);
  highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                          &step2[14]);
  highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                          &step2[13]);
  highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                         &step2[12]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
                         &step1[7]);
  highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
                          &step1[6]);
  step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]);
  step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]);
  step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]);
  step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]);
  step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]);
  step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]);
  step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]);
  step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]);
  step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]);
  step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]);
  step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]);
  step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]);
  step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]);
  step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]);
  step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]);
  step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]);

  // stage 4
  highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
                            &step2[0]);
  highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
                           &step2[3]);
  step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]);
  step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]);
  step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]);
  step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]);
  step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]);
  step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]);
  step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]);
  step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]);
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]);
  step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]);
  step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]);
  step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]);
  step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]);
  step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]);
  step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]);
  step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]);
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]);
  step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]);
  step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]);
  step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]);
  step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]);
  step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]);
  step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]);
  step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]);
  step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]);
  step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]);
  step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]);
  step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]);
  step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]);
  step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]);
  step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]);
  step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]);

  // stage 6
  step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]);
  step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]);
  step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]);
  step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]);
  step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]);
  step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]);
  step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]);
  step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]);
  step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]);
  step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]);
  step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]);
  step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]);
  step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]);
  step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]);
  step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]);
  step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]);
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}

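// Multiply by a single cosine constant (lane 0 or 1 of coef), with the
// usual rounding shift. These replace full butterflies when the partner
// input row is known to be zero.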
static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
                                                       const int32x2_t coef) {
  int64x2x2_t t[2];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0);
  return dct_const_round_shift_high_4x2_int64x2x2(t);
}

static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
                                                const int32x2_t coef) {
  int64x2x2_t t;

  t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0);
  t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0);
  return dct_const_round_shift_high_4(t);
}

static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
                                                       const int32x2_t coef) {
  int64x2x2_t t[2];

  t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1);
  t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1);
  t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1);
  t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1);
  return dct_const_round_shift_high_4x2_int64x2x2(t);
}

static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
                                                const int32x2_t coef) {
  int64x2x2_t t;

  t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1);
  t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1);
  return dct_const_round_shift_high_4(t);
}

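// Like vpx_highbd_idct16x16_256_add_half1d(), but assuming only the
// upper-left 8x8 quadrant of coefficients is non-zero, so stages 2 and 3
// collapse to the single-multiplication helpers above.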
static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input,
                                               int32_t *output, uint16_t *dest,
                                               const int stride, const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  int32x4x2_t in[8], step1[16], step2[16], out[16];

  // Load input (8x8)
  in[0].val[0] = vld1q_s32(input);
  in[0].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[1].val[0] = vld1q_s32(input);
  in[1].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[2].val[0] = vld1q_s32(input);
  in[2].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[3].val[0] = vld1q_s32(input);
  in[3].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[4].val[0] = vld1q_s32(input);
  in[4].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[5].val[0] = vld1q_s32(input);
  in[5].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[6].val[0] = vld1q_s32(input);
  in[6].val[1] = vld1q_s32(input + 4);
  input += 16;
  in[7].val[0] = vld1q_s32(input);
  in[7].val[1] = vld1q_s32(input + 4);

  // Transpose
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[2] = in[8 / 2];
  step1[4] = in[4 / 2];
  step1[6] = in[12 / 2];
  step1[8] = in[2 / 2];
  step1[10] = in[10 / 2];
  step1[12] = in[6 / 2];
  step1[14] = in[14 / 2];  // 0 in pass 1

  // stage 2
  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[4] = step1[4];
  step2[6] = step1[6];
  step2[8] =
      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
  step2[9] = highbd_idct_cospi_lane1_dual(step1[14],
                                          vget_high_s32(cospi_6_26N_14_18N));
  step2[10] =
      highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
  step2[11] =
      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[12] =
      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[13] =
      highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
  step2[14] = highbd_idct_cospi_lane0_dual(step1[14],
                                           vget_high_s32(cospi_6_26N_14_18N));
  step2[15] =
      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));

  // stage 3
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] =
      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
  step1[5] =
      highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28));
  step1[6] =
      highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28));
  step1[7] =
      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
  step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
  step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
  step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
  step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
  step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
  step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
  step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[14]);

  // stage 4
  step2[0] = step2[1] =
      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
  step2[2] =
      highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24));
  step2[3] =
      highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24));
  step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
  step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
  step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
  step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
  step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
  step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
  step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);

  // stage 6
  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}

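// Pass 1 for the mostly-empty case: only the upper-left 4x4 coefficients
// are assumed non-zero, so a single 4-lane column pass suffices.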
static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                                 int32_t *output) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  int32x4_t in[4], step1[16], step2[16], out[16];

  // Load input (4x4)
  in[0] = vld1q_s32(input);
  input += 16;
  in[1] = vld1q_s32(input);
  input += 16;
  in[2] = vld1q_s32(input);
  input += 16;
  in[3] = vld1q_s32(input);

  // Transpose
  transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22));
  step2[11] =
      highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[12] =
      highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22));

  // stage 3
  step1[0] = step2[0];
  step1[4] =
      highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28));
  step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28));
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] =
      highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24));
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s32(step2[8], step2[11]);
  step1[9] = vaddq_s32(step2[9], step2[10]);
  step1[10] = vsubq_s32(step2[9], step2[10]);
  step1[11] = vsubq_s32(step2[8], step2[11]);
  step1[12] = vsubq_s32(step2[15], step2[12]);
  step1[13] = vsubq_s32(step2[14], step2[13]);
  step1[14] = vaddq_s32(step2[14], step2[13]);
  step1[15] = vaddq_s32(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s32(step1[0], step1[7]);
  step2[1] = vaddq_s32(step1[1], step1[6]);
  step2[2] = vaddq_s32(step1[2], step1[5]);
  step2[3] = vaddq_s32(step1[3], step1[4]);
  step2[4] = vsubq_s32(step1[3], step1[4]);
  step2[5] = vsubq_s32(step1[2], step1[5]);
  step2[6] = vsubq_s32(step1[1], step1[6]);
  step2[7] = vsubq_s32(step1[0], step1[7]);
  highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7(step2, out);

  // pass 1: save the result into output
  vst1q_s32(output, out[0]);
  output += 4;
  vst1q_s32(output, out[1]);
  output += 4;
  vst1q_s32(output, out[2]);
  output += 4;
  vst1q_s32(output, out[3]);
  output += 4;
  vst1q_s32(output, out[4]);
  output += 4;
  vst1q_s32(output, out[5]);
  output += 4;
  vst1q_s32(output, out[6]);
  output += 4;
  vst1q_s32(output, out[7]);
  output += 4;
  vst1q_s32(output, out[8]);
  output += 4;
  vst1q_s32(output, out[9]);
  output += 4;
  vst1q_s32(output, out[10]);
  output += 4;
  vst1q_s32(output, out[11]);
  output += 4;
  vst1q_s32(output, out[12]);
  output += 4;
  vst1q_s32(output, out[13]);
  output += 4;
  vst1q_s32(output, out[14]);
  output += 4;
  vst1q_s32(output, out[15]);
}

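// Pass 2 companion to the function above: reads a 4x8 block of the
// intermediate result and produces eight of the sixteen output columns.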
static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
                                                 int32_t *const output,
                                                 uint16_t *const dest,
                                                 const int stride,
                                                 const int bd) {
  const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
  const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
  const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
  const int32x4_t cospi_6_26N_14_18N = vld1q_s32(kCospi32 + 12);
  int32x4x2_t in[4], step1[16], step2[16], out[16];

  // Load input (4x8)
  in[0].val[0] = vld1q_s32(input);
  input += 4;
  in[0].val[1] = vld1q_s32(input);
  input += 4;
  in[1].val[0] = vld1q_s32(input);
  input += 4;
  in[1].val[1] = vld1q_s32(input);
  input += 4;
  in[2].val[0] = vld1q_s32(input);
  input += 4;
  in[2].val[1] = vld1q_s32(input);
  input += 4;
  in[3].val[0] = vld1q_s32(input);
  input += 4;
  in[3].val[1] = vld1q_s32(input);

  // Transpose
  transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
                    &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);

  // stage 1
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] =
      highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
  step2[11] =
      highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[12] =
      highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26N_14_18N));
  step2[15] =
      highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));

  // stage 3
  step1[0] = step2[0];
  step1[4] =
      highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
  step1[7] =
      highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] =
      highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                           &step2[14]);
  highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
                               &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
                            &step1[6]);
  step1[7] = step2[7];
  step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
  step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
  step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
  step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
  step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
  step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
  step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
  step1[15] = highbd_idct_add_dual(step2[15], step2[12]);

  // stage 6
  step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
  step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
  step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
  step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
  step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
  step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
  step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
  step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
  highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                            &step2[13]);
  highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                            &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  highbd_idct16x16_add_stage7_dual(step2, out);

  if (output) {
    highbd_idct16x16_store_pass1(out, output);
  } else {
    highbd_idct16x16_add_store(out, dest, stride, bd);
  }
}

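// For bd == 8 the coefficients fit in 16 bits, so the faster non-highbd
// half1d routines can be reused; larger bit depths take the 32-bit path.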
void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
                                       int stride, int bd) {
  if (bd == 8) {
    int16_t row_idct_output[16 * 16];

    // pass 1
    // Parallel idct on the upper 8 rows
    vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1);

    // Parallel idct on the lower 8 rows
    vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
                                 stride, 1);

    // pass 2
    // Parallel idct to get the left 8 columns
    vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1);

    // Parallel idct to get the right 8 columns
    vpx_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8,
                                 stride, 1);
  } else {
    int32_t row_idct_output[16 * 16];

    // pass 1
    // Parallel idct on the upper 8 rows
    vpx_highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride,
                                        bd);

    // Parallel idct on the lower 8 rows
    vpx_highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8,
                                        dest, stride, bd);

    // pass 2
    // Parallel idct to get the left 8 columns
    vpx_highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride,
                                        bd);

    // Parallel idct to get the right 8 columns
    vpx_highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL,
                                        dest + 8, stride, bd);
  }
}

void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  if (bd == 8) {
    int16_t row_idct_output[16 * 16];

    // pass 1
    // Parallel idct on the upper 8 rows
    vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1);

    // pass 2
    // Parallel idct to get the left 8 columns
    vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1);

    // Parallel idct to get the right 8 columns
    vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
                                stride, 1);
  } else {
    int32_t row_idct_output[16 * 16];

    // pass 1
    // Parallel idct on the upper 8 rows
    vpx_highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride,
                                       bd);

    // pass 2
    // Parallel idct to get the left 8 columns
    vpx_highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd);

    // Parallel idct to get the right 8 columns
    vpx_highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
                                       stride, bd);
  }
}

void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
                                      int stride, int bd) {
  if (bd == 8) {
    int16_t row_idct_output[4 * 16];

    // pass 1
    // Parallel idct on the upper 4 rows
    vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);

    // pass 2
    // Parallel idct to get the left 8 columns
    vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1);

    // Parallel idct to get the right 8 columns
    vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
                                      stride, 1);
  } else {
    int32_t row_idct_output[4 * 16];

    // pass 1
    // Parallel idct on the upper 4 rows
    highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output);

    // pass 2
    // Parallel idct to get the left 8 columns
    highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride,
                                         bd);

    // Parallel idct to get the right 8 columns
    highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL,
                                         dest + 8, stride, bd);
  }
}

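// DC-only path. A positive DC term only needs clamping against the
// (1 << bd) - 1 maximum; a negative one only needs clamping at zero, which
// the saturating signed-to-unsigned shift vqshluq_n_s16(x, 0) provides.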
static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res,
                                                     const int16x8_t max) {
  const uint16x8_t a0 = vld1q_u16(*dest + 0);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t c0 = vminq_s16(b0, max);
  const int16x8_t c1 = vminq_s16(b1, max);
  vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0));
  vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
  *dest += stride;
}

static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res) {
  const uint16x8_t a0 = vld1q_u16(*dest + 0);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
  vst1q_u16(*dest + 0, c0);
  vst1q_u16(*dest + 8, c1);
  *dest += stride;
}

void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  const tran_low_t out0 = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  const tran_low_t out1 = HIGHBD_WRAPLOW(
      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
  const int16x8_t dc = vdupq_n_s16(a1);
  int i;

  if (a1 >= 0) {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    for (i = 0; i < 4; ++i) {
      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
    }
  } else {
    for (i = 0; i < 4; ++i) {
      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}