1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/idct_neon.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/txfm_common.h"
17
// Narrow a pair of 32-bit butterfly accumulators back to 16 bits,
// rounding by DCT_CONST_BITS.
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                int16x4_t *const d1) {
  const int16x4_t lo = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
  const int16x4_t hi = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
  *d0 = lo;
  *d1 = hi;
}
23
// Rotation kernel: t32[0] = s0*cospi_24 - s1*cospi_8,
//                  t32[1] = s1*cospi_24 + s0*cospi_8.
// cospi_0_8_16_24 lanes: {cospi_0, cospi_8, cospi_16, cospi_24}.
static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
                                            const int16x4_t s1,
                                            const int16x4_t cospi_0_8_16_24,
                                            int32x4_t *const t32) {
  const int32x4_t p0 = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
  const int32x4_t p1 = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
  t32[0] = vmlsl_lane_s16(p0, s1, cospi_0_8_16_24, 1);
  t32[1] = vmlal_lane_s16(p1, s0, cospi_0_8_16_24, 1);
}
33
// cospi_8/cospi_24 rotation on 4-lane vectors, rounded back to 16 bits.
static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t cospi_0_8_16_24,
                                     int16x4_t *const d0, int16x4_t *const d1) {
  int32x4_t acc[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, acc);
  wrap_low_4x2(acc, d0, d1);
}
42
// Same as idct_cospi_8_24_d but with the second output negated before
// the rounding narrow.
static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
                                         const int16x4_t cospi_0_8_16_24,
                                         int16x4_t *const d0,
                                         int16x4_t *const d1) {
  int32x4_t acc[2];

  idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, acc);
  acc[1] = vnegq_s32(acc[1]);
  wrap_low_4x2(acc, d0, d1);
}
53
// Butterfly with a single constant: d0 = (s1 - s0) * cospi_16,
// d1 = (s1 + s0) * cospi_16, each rounded back to 16 bits.
static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
                                      const int16x4_t cospi_0_8_16_24,
                                      int16x4_t *const d0,
                                      int16x4_t *const d1) {
  // s1 * cospi_16 is shared by both the difference and the sum terms.
  const int32x4_t base = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
  int32x4_t t[2];

  t[0] = vmlsl_lane_s16(base, s0, cospi_0_8_16_24, 2);
  t[1] = vmlal_lane_s16(base, s0, cospi_0_8_16_24, 2);
  wrap_low_4x2(t, d0, d1);
}
65
/*
 * One half (8 rows in pass 1 / 8 columns in pass 2) of the full 16x16
 * inverse DCT.
 *
 * input:       16x8 coefficient block; read as tran_low_t when output is
 *              non-NULL (pass 1), as int16_t otherwise (pass 2).
 * output:      when non-NULL, the transformed data is stored here as pass-1
 *              intermediate values and dest is not written.
 * dest:        pass-2 destination; results are added into it.
 * stride:      dest stride.
 * highbd_flag: in pass 2, nonzero selects the high-bitdepth (bd8) store.
 */
void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                  void *const dest, const int stride,
                                  const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
  const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
  const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
  int16x8_t in[16], step1[16], step2[16], out[16];

  // Load input (16x8). Row 2*k goes to in[k] and row 2*k+1 to in[k+8] so
  // that each 8x8 half can be transposed independently below.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[8] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[9] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[10] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[11] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[12] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[13] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[14] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[7] = load_tran_low_to_s16q(inputT);
    inputT += 8;
    in[15] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 8;
    in[8] = vld1q_s16(inputT);
    inputT += 8;
    in[1] = vld1q_s16(inputT);
    inputT += 8;
    in[9] = vld1q_s16(inputT);
    inputT += 8;
    in[2] = vld1q_s16(inputT);
    inputT += 8;
    in[10] = vld1q_s16(inputT);
    inputT += 8;
    in[3] = vld1q_s16(inputT);
    inputT += 8;
    in[11] = vld1q_s16(inputT);
    inputT += 8;
    in[4] = vld1q_s16(inputT);
    inputT += 8;
    in[12] = vld1q_s16(inputT);
    inputT += 8;
    in[5] = vld1q_s16(inputT);
    inputT += 8;
    in[13] = vld1q_s16(inputT);
    inputT += 8;
    in[6] = vld1q_s16(inputT);
    inputT += 8;
    in[14] = vld1q_s16(inputT);
    inputT += 8;
    in[7] = vld1q_s16(inputT);
    inputT += 8;
    in[15] = vld1q_s16(inputT);
  }

  // Transpose each 8x8 half so the transform runs along vector lanes.
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);
  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
                    &in[15]);

  // stage 1
  // The in[x / 2] indexing reorders the inputs for the first butterfly
  // stage through the even/odd split used while loading above.
  step1[0] = in[0 / 2];
  step1[1] = in[16 / 2];
  step1[2] = in[8 / 2];
  step1[3] = in[24 / 2];
  step1[4] = in[4 / 2];
  step1[5] = in[20 / 2];
  step1[6] = in[12 / 2];
  step1[7] = in[28 / 2];
  step1[8] = in[2 / 2];
  step1[9] = in[18 / 2];
  step1[10] = in[10 / 2];
  step1[11] = in[26 / 2];
  step1[12] = in[6 / 2];
  step1[13] = in[22 / 2];
  step1[14] = in[14 / 2];
  step1[15] = in[30 / 2];

  // stage 2: rotations of the odd half (8..15); even half passes through.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];
  idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
  idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
                   &step2[14]);
  idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
                   &step2[13]);
  idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
                  &step2[12]);

  // stage 3: rotations on 4..7, add/sub butterflies on 8..15.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
  idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
  idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final add/sub butterflies into out[0..15].
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
267
/*
 * Half-16x16 idct used when only the top-left 8x8 of coefficients (at most
 * 38 nonzero) is present. Reads an 8x8 input block with a row stride of 16.
 * Parameters behave as in vpx_idct16x16_256_add_half1d: output non-NULL
 * selects pass 1 (store intermediate), otherwise results are added to dest,
 * with highbd_flag choosing the high-bitdepth store.
 */
void vpx_idct16x16_38_add_half1d(const void *const input, int16_t *const output,
                                 void *const dest, const int stride,
                                 const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // Doubled constants: vqrdmulh computes (2*a*b + rounding) >> 16, so
  // doubling the cosines matches the scaling of the vmull/vrshrn path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x8_t in[8], step1[16], step2[16], out[16];

  // Load input (8x8); the += 16 skips the zero right half of each row.
  if (output) {
    const tran_low_t *inputT = (const tran_low_t *)input;
    in[0] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[1] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[2] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[3] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[4] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[5] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[6] = load_tran_low_to_s16q(inputT);
    inputT += 16;
    in[7] = load_tran_low_to_s16q(inputT);
  } else {
    const int16_t *inputT = (const int16_t *)input;
    in[0] = vld1q_s16(inputT);
    inputT += 16;
    in[1] = vld1q_s16(inputT);
    inputT += 16;
    in[2] = vld1q_s16(inputT);
    inputT += 16;
    in[3] = vld1q_s16(inputT);
    inputT += 16;
    in[4] = vld1q_s16(inputT);
    inputT += 16;
    in[5] = vld1q_s16(inputT);
    inputT += 16;
    in[6] = vld1q_s16(inputT);
    inputT += 16;
    in[7] = vld1q_s16(inputT);
  }

  // Transpose
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1: reorder; entries with conceptual index >= 16 are known zero.
  step1[0] = in[0 / 2];
  step1[2] = in[8 / 2];
  step1[4] = in[4 / 2];
  step1[6] = in[12 / 2];
  step1[8] = in[2 / 2];
  step1[10] = in[10 / 2];
  step1[12] = in[6 / 2];
  step1[14] = in[14 / 2];  // 0 in pass 1

  // stage 2: with the paired inputs zero, each rotation collapses to a
  // single vqrdmulh by a (doubled) cosine constant.
  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[4] = step1[4];
  step2[6] = step1[6];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
  step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
  step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3
  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
  step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = vaddq_s16(step2[8], step2[9]);
  step1[9] = vsubq_s16(step2[8], step2[9]);
  step1[10] = vsubq_s16(step2[11], step2[10]);
  step1[11] = vaddq_s16(step2[11], step2[10]);
  step1[12] = vaddq_s16(step2[12], step2[13]);
  step1[13] = vsubq_s16(step2[12], step2[13]);
  step1[14] = vsubq_s16(step2[15], step2[14]);
  step1[15] = vaddq_s16(step2[15], step2[14]);

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
  step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
  step2[4] = vaddq_s16(step1[4], step1[5]);
  step2[5] = vsubq_s16(step1[4], step1[5]);
  step2[6] = vsubq_s16(step1[7], step1[6]);
  step2[7] = vaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = vaddq_s16(step2[0], step2[3]);
  step1[1] = vaddq_s16(step2[1], step2[2]);
  step1[2] = vsubq_s16(step2[1], step2[2]);
  step1[3] = vsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
428
/*
 * Pass 1 of the 16x16 idct when at most 10 coefficients (top-left 4x4) are
 * nonzero. Reads a 4x4 tran_low_t block with a row stride of 16 and writes
 * 16 rows of 4 int16_t intermediate values to output.
 */
void vpx_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
                                       int16_t *output) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // Doubled constants: vqrdmulh computes (2*a*b + rounding) >> 16, so
  // doubling the cosines matches the scaling of the vmull/vrshrn path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t in[4], step1[16], step2[16], out[16];

  // Load input (4x4); += 16 skips the zero remainder of each row.
  in[0] = load_tran_low_to_s16d(input);
  input += 16;
  in[1] = load_tran_low_to_s16d(input);
  input += 16;
  in[2] = load_tran_low_to_s16d(input);
  input += 16;
  in[3] = load_tran_low_to_s16d(input);

  // Transpose
  transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);

  // stage 1: only these four inputs can be nonzero; the rest are implied 0
  // and their butterfly contributions are folded away in later stages.
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2: with paired inputs zero, each rotation is a single vqrdmulh.
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3: add/sub with zero operands collapses to plain copies.
  step1[0] = step2[0];
  step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vadd_s16(step2[8], step2[11]);
  step1[9] = vadd_s16(step2[9], step2[10]);
  step1[10] = vsub_s16(step2[9], step2[10]);
  step1[11] = vsub_s16(step2[8], step2[11]);
  step1[12] = vsub_s16(step2[15], step2[12]);
  step1[13] = vsub_s16(step2[14], step2[13]);
  step1[14] = vadd_s16(step2[14], step2[13]);
  step1[15] = vadd_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vadd_s16(step1[0], step1[7]);
  step2[1] = vadd_s16(step1[1], step1[6]);
  step2[2] = vadd_s16(step1[2], step1[5]);
  step2[3] = vadd_s16(step1[3], step1[4]);
  step2[4] = vsub_s16(step1[3], step1[4]);
  step2[5] = vsub_s16(step1[2], step1[5]);
  step2[6] = vsub_s16(step1[1], step1[6]);
  step2[7] = vsub_s16(step1[0], step1[7]);
  idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final add/sub butterflies.
  out[0] = vadd_s16(step2[0], step2[15]);
  out[1] = vadd_s16(step2[1], step2[14]);
  out[2] = vadd_s16(step2[2], step2[13]);
  out[3] = vadd_s16(step2[3], step2[12]);
  out[4] = vadd_s16(step2[4], step2[11]);
  out[5] = vadd_s16(step2[5], step2[10]);
  out[6] = vadd_s16(step2[6], step2[9]);
  out[7] = vadd_s16(step2[7], step2[8]);
  out[8] = vsub_s16(step2[7], step2[8]);
  out[9] = vsub_s16(step2[6], step2[9]);
  out[10] = vsub_s16(step2[5], step2[10]);
  out[11] = vsub_s16(step2[4], step2[11]);
  out[12] = vsub_s16(step2[3], step2[12]);
  out[13] = vsub_s16(step2[2], step2[13]);
  out[14] = vsub_s16(step2[1], step2[14]);
  out[15] = vsub_s16(step2[0], step2[15]);

  // pass 1: save the result into output (16 consecutive groups of 4).
  vst1_s16(output, out[0]);
  output += 4;
  vst1_s16(output, out[1]);
  output += 4;
  vst1_s16(output, out[2]);
  output += 4;
  vst1_s16(output, out[3]);
  output += 4;
  vst1_s16(output, out[4]);
  output += 4;
  vst1_s16(output, out[5]);
  output += 4;
  vst1_s16(output, out[6]);
  output += 4;
  vst1_s16(output, out[7]);
  output += 4;
  vst1_s16(output, out[8]);
  output += 4;
  vst1_s16(output, out[9]);
  output += 4;
  vst1_s16(output, out[10]);
  output += 4;
  vst1_s16(output, out[11]);
  output += 4;
  vst1_s16(output, out[12]);
  output += 4;
  vst1_s16(output, out[13]);
  output += 4;
  vst1_s16(output, out[14]);
  output += 4;
  vst1_s16(output, out[15]);
}
582
/*
 * Pass 2 for the 10-coefficient (4x4) case. Reads a 4x8 int16_t block of
 * pass-1 intermediate data. When output is non-NULL the result is stored
 * there; otherwise it is added into dest (highbd_flag selects the bd8
 * store path).
 */
void vpx_idct16x16_10_add_half1d_pass2(const int16_t *input,
                                       int16_t *const output, void *const dest,
                                       const int stride,
                                       const int highbd_flag) {
  const int16x8_t cospis0 = vld1q_s16(kCospi);
  const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
  // Doubled constants: vqrdmulh computes (2*a*b + rounding) >> 16, so
  // doubling the cosines matches the scaling of the vmull/vrshrn path.
  const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
  const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
  const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
  const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
  const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
  const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
  const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
  int16x4_t ind[8];
  int16x8_t in[4], step1[16], step2[16], out[16];

  // Load input (4x8): eight consecutive groups of 4 from pass-1 output.
  ind[0] = vld1_s16(input);
  input += 4;
  ind[1] = vld1_s16(input);
  input += 4;
  ind[2] = vld1_s16(input);
  input += 4;
  ind[3] = vld1_s16(input);
  input += 4;
  ind[4] = vld1_s16(input);
  input += 4;
  ind[5] = vld1_s16(input);
  input += 4;
  ind[6] = vld1_s16(input);
  input += 4;
  ind[7] = vld1_s16(input);

  // Transpose the 4x8 block into four 8-lane rows.
  transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
                    ind[7], &in[0], &in[1], &in[2], &in[3]);

  // stage 1: only these four inputs can be nonzero (see pass 1).
  step1[0] = in[0 / 2];
  step1[4] = in[4 / 2];
  step1[8] = in[2 / 2];
  step1[12] = in[6 / 2];

  // stage 2: with paired inputs zero, each rotation is a single vqrdmulh.
  step2[0] = step1[0];
  step2[4] = step1[4];
  step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
  step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
  step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
  step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);

  // stage 3: add/sub with zero operands collapses to plain copies.
  step1[0] = step2[0];
  step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
  step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];

  // stage 4
  step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
                    &step2[14]);
  idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
                        &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
  step1[7] = step2[7];
  step1[8] = vaddq_s16(step2[8], step2[11]);
  step1[9] = vaddq_s16(step2[9], step2[10]);
  step1[10] = vsubq_s16(step2[9], step2[10]);
  step1[11] = vsubq_s16(step2[8], step2[11]);
  step1[12] = vsubq_s16(step2[15], step2[12]);
  step1[13] = vsubq_s16(step2[14], step2[13]);
  step1[14] = vaddq_s16(step2[14], step2[13]);
  step1[15] = vaddq_s16(step2[15], step2[12]);

  // stage 6
  step2[0] = vaddq_s16(step1[0], step1[7]);
  step2[1] = vaddq_s16(step1[1], step1[6]);
  step2[2] = vaddq_s16(step1[2], step1[5]);
  step2[3] = vaddq_s16(step1[3], step1[4]);
  step2[4] = vsubq_s16(step1[3], step1[4]);
  step2[5] = vsubq_s16(step1[2], step1[5]);
  step2[6] = vsubq_s16(step1[1], step1[6]);
  step2[7] = vsubq_s16(step1[0], step1[7]);
  idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
                     &step2[13]);
  idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
                     &step2[12]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  idct16x16_add_stage7(step2, out);

  if (output) {
    idct16x16_store_pass1(out, output);
  } else {
    if (highbd_flag) {
      idct16x16_add_store_bd8(out, dest, stride);
    } else {
      idct16x16_add_store(out, dest, stride);
    }
  }
}
710
vpx_idct16x16_256_add_neon(const tran_low_t * input,uint8_t * dest,int stride)711 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
712 int stride) {
713 int16_t row_idct_output[16 * 16];
714
715 // pass 1
716 // Parallel idct on the upper 8 rows
717 vpx_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
718
719 // Parallel idct on the lower 8 rows
720 vpx_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest,
721 stride, 0);
722
723 // pass 2
724 // Parallel idct to get the left 8 columns
725 vpx_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
726
727 // Parallel idct to get the right 8 columns
728 vpx_idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
729 0);
730 }
731
vpx_idct16x16_38_add_neon(const tran_low_t * input,uint8_t * dest,int stride)732 void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
733 int stride) {
734 int16_t row_idct_output[16 * 16];
735
736 // pass 1
737 // Parallel idct on the upper 8 rows
738 vpx_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
739
740 // pass 2
741 // Parallel idct to get the left 8 columns
742 vpx_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
743
744 // Parallel idct to get the right 8 columns
745 vpx_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
746 0);
747 }
748
vpx_idct16x16_10_add_neon(const tran_low_t * input,uint8_t * dest,int stride)749 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
750 int stride) {
751 int16_t row_idct_output[4 * 16];
752
753 // pass 1
754 // Parallel idct on the upper 8 rows
755 vpx_idct16x16_10_add_half1d_pass1(input, row_idct_output);
756
757 // pass 2
758 // Parallel idct to get the left 8 columns
759 vpx_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
760
761 // Parallel idct to get the right 8 columns
762 vpx_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
763 stride, 0);
764 }
765