/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"

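// Transposes the 8x8 block held in q8-q15 in place.  The low/high half
// recombination below performs the 64-bit swaps (the "vswp" steps noted in
// the comments); the vtrnq_s32/vtrnq_s16 pairs then finish the transpose at
// 32-bit and 16-bit granularity.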
static INLINE void TRANSPOSE8X8(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  *q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
  *q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
  *q12s16 = vcombine_s16(d17s16, d25s16);
  *q13s16 = vcombine_s16(d19s16, d27s16);
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                      vreinterpretq_s32_s16(*q10s16));
  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                      vreinterpretq_s32_s16(*q11s16));
  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                      vreinterpretq_s32_s16(*q14s16));
  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                      vreinterpretq_s32_s16(*q15s16));

  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

  *q8s16 = q0x2s16.val[0];
  *q9s16 = q0x2s16.val[1];
  *q10s16 = q1x2s16.val[0];
  *q11s16 = q1x2s16.val[1];
  *q12s16 = q2x2s16.val[0];
  *q13s16 = q2x2s16.val[1];
  *q14s16 = q3x2s16.val[0];
  *q15s16 = q3x2s16.val[1];
  return;
}

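// One 1-D pass of the 8-point inverse DCT, applied lane-wise to the eight
// vectors q8-q15 (eight transforms in parallel).  The butterflies use the
// cospi_* constants from txfm_common.h in Q14 fixed point: products are
// accumulated in 32 bits and narrowed back to 16 bits with
// vqrshrn_n_s32(x, 14).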
static INLINE void IDCT8x8_1D(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);
  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

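  // stage 1 - odd half: inputs 1, 3, 5, 7 through the cospi_28/cospi_4
  // and cospi_12/cospi_20 butterflies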
  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

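  // stage 2 - even half: inputs 0 and 4 through cospi_16,
  // inputs 2 and 6 through the cospi_24/cospi_8 butterfly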
  d0s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);

  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

  d0s16 = vdup_n_s16(cospi_24_64);
  d1s16 = vdup_n_s16(cospi_8_64);

  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);

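  // stage 3 - even half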
  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);

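  // stage 2 - odd half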
  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);

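  // stage 3 - odd half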
  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

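  // stage 4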
  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
  return;
}

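// Full 8x8 inverse DCT for a block with all 64 coefficients present:
// load the eight coefficient rows, transpose so each vector holds one
// coefficient index across the rows, run one 1-D pass, transpose back,
// run the second pass, round-shift the result by 5 to undo the transform
// scaling, then add it to the destination pixels with saturation.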
void vpx_idct8x8_64_add_neon(
    int16_t *input,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

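  // Reconstruct rows 0-3: widen each destination row to 16 bits, add the
  // residual, saturate back to 8 bits, and store.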
  d1 = d2 = dest;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;

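  // Reconstruct rows 4-7 by moving the remaining residual rows into
  // q8-q11 and repeating the add/saturate/store sequence.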
  q8s16 = q12s16;
  q9s16 = q13s16;
  q10s16 = q14s16;
  q11s16 = q15s16;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;
  return;
}

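// 8x8 inverse DCT for a block in which only a few low-frequency
// coefficients (at most the first 12 in scan order) are nonzero, so vector
// indices 4-7 of the transposed input are treated as zero.  The first pass
// is specialized: each stage-1/stage-2 butterfly collapses to a single
// multiply, done with vqrdmulhq_s16 against doubled cospi constants (the
// doubling compensates for vqrdmulh's 15-bit shift versus the transform's
// 14-bit shift).  The second pass and the reconstruction reuse IDCT8x8_1D
// and the same add/saturate/store sequence as the full-coefficient version
// above.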
void vpx_idct8x8_12_add_neon(
    int16_t *input,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  // First transform rows
  // stage 1
  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
  q1s16 = vdupq_n_s16(cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);

  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  q1s16 = vdupq_n_s16(cospi_12_64 * 2);

  q5s16 = vqrdmulhq_s16(q11s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_16_64 * 2);

  q6s16 = vqrdmulhq_s16(q11s16, q1s16);

  // stage 2 & stage 3 - even half
  q1s16 = vdupq_n_s16(cospi_24_64 * 2);

  q9s16 = vqrdmulhq_s16(q8s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_8_64 * 2);

  q13s16 = vqrdmulhq_s16(q10s16, q1s16);

  q15s16 = vqrdmulhq_s16(q10s16, q0s16);

  // stage 3 - even half
  q0s16 = vaddq_s16(q9s16, q15s16);
  q1s16 = vaddq_s16(q9s16, q13s16);
  q2s16 = vsubq_s16(q9s16, q13s16);
  q3s16 = vsubq_s16(q9s16, q15s16);

  // stage 2 - odd half
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

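  // stage 3 - odd half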
  d16s16 = vdup_n_s16(cospi_16_64);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4
  q8s16 = vaddq_s16(q0s16, q7s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q7s16);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
             &q12s16, &q13s16, &q14s16, &q15s16);

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

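  // Reconstruct rows 0-3, exactly as in vpx_idct8x8_64_add_neon.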
  d1 = d2 = dest;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;

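  // Reconstruct rows 4-7 from the residual rows left in q12-q15.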
  q8s16 = q12s16;
  q9s16 = q13s16;
  q10s16 = q14s16;
  q11s16 = q15s16;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                   vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                   vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                    vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                    vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;
  return;
}