/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"

// Only for the first pass of the _34_ variant. Since it only uses values from
// the top left 8x8 it can safely assume all the remaining values are 0 and
// skip an awful lot of calculations. In fact, only the first 6 columns make
// the cut. None of the elements in the 7th or 8th column are used, so it
// skips any calls to input[6] or input[7] too.
// In C this does a single row of 32 for each call. Here it transposes the top
// left 8x8 to allow using SIMD.

// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero
// coefficients as follows:
//    0  1  2  3  4  5  6  7
// 0  0  2  5 10 17 25
// 1  1  4  8 15 22 30
// 2  3  7 12 18 28
// 3  6 11 16 23 31
// 4  9 14 19 29
// 5 13 20 26
// 6 21 27 33
// 7 24 32
void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) {
  int16x8_t in[8], s1[32], s2[32], s3[32];

  in[0] = load_tran_low_to_s16q(input);
  input += 32;
  in[1] = load_tran_low_to_s16q(input);
  input += 32;
  in[2] = load_tran_low_to_s16q(input);
  input += 32;
  in[3] = load_tran_low_to_s16q(input);
  input += 32;
  in[4] = load_tran_low_to_s16q(input);
  input += 32;
  in[5] = load_tran_low_to_s16q(input);
  input += 32;
  in[6] = load_tran_low_to_s16q(input);
  input += 32;
  in[7] = load_tran_low_to_s16q(input);
  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  // stage 1
  // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0)
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0)
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

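  // s1[20]: input[5] * cospi_27_64 - input[27] * cospi_5_64 (but input[27] == 0)
  // s1[27]: input[5] * cospi_5_64 + input[27] * cospi_27_64 (but input[27] == 0)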
  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

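  // s1[23]: input[29] * cospi_3_64 - input[3] * cospi_29_64 (but input[29] == 0)
  // s1[24]: input[29] * cospi_29_64 + input[3] * cospi_3_64 (but input[29] == 0)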
  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
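  // s2[8]: input[2] * cospi_30_64 - input[30] * cospi_2_64 (but input[30] == 0)
  // s2[15]: input[2] * cospi_2_64 + input[30] * cospi_30_64 (but input[30] == 0)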
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

  // stage 3
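  // s1[4]: input[4] * cospi_28_64 - input[28] * cospi_4_64 (but input[28] == 0)
  // s1[7]: input[4] * cospi_4_64 + input[28] * cospi_28_64 (but input[28] == 0)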
  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
                                                    cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
                                                    cospi_4_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
                                                    s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
                                                    cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
                                                    s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
                                                    s1[24], cospi_12_64);

  // stage 4
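  // s1[0]: (input[0] + input[16]) * cospi_16_64 (but input[16] == 0)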
  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                   cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
                                                    cospi_8_64);

  s2[20] = vsubq_s16(s1[23], s1[20]);
  s2[21] = vsubq_s16(s1[22], s1[21]);
  s2[22] = vaddq_s16(s1[21], s1[22]);
  s2[23] = vaddq_s16(s1[20], s1[23]);
  s2[24] = vaddq_s16(s1[24], s1[27]);
  s2[25] = vaddq_s16(s1[25], s1[26]);
  s2[26] = vsubq_s16(s1[25], s1[26]);
  s2[27] = vsubq_s16(s1[24], s1[27]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);

  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30],
                                                    cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30],
                                                    cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31],
                                                    cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31],
                                                    cospi_8_64);

  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
                                                    s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
                                                    cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
                                                    s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
                                                    cospi_24_64);

  // stage 6
  s2[0] = vaddq_s16(s1[0], s1[7]);
  s2[1] = vaddq_s16(s1[0], s1[6]);
  s2[2] = vaddq_s16(s1[0], s1[5]);
  s2[3] = vaddq_s16(s1[0], s1[4]);
  s2[4] = vsubq_s16(s1[0], s1[4]);
  s2[5] = vsubq_s16(s1[0], s1[5]);
  s2[6] = vsubq_s16(s1[0], s1[6]);
  s2[7] = vsubq_s16(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64);

  s2[16] = vaddq_s16(s1[16], s2[23]);
  s2[17] = vaddq_s16(s1[17], s2[22]);
  s2[18] = vaddq_s16(s1[18], s1[21]);
  s2[19] = vaddq_s16(s1[19], s1[20]);
  s2[20] = vsubq_s16(s1[19], s1[20]);
  s2[21] = vsubq_s16(s1[18], s1[21]);
  s2[22] = vsubq_s16(s1[17], s2[22]);
  s2[23] = vsubq_s16(s1[16], s2[23]);

  s3[24] = vsubq_s16(s1[31], s2[24]);
  s3[25] = vsubq_s16(s1[30], s2[25]);
  s3[26] = vsubq_s16(s1[29], s1[26]);
  s3[27] = vsubq_s16(s1[28], s1[27]);
  s2[28] = vaddq_s16(s1[27], s1[28]);
  s2[29] = vaddq_s16(s1[26], s1[29]);
  s2[30] = vaddq_s16(s2[25], s1[30]);
  s2[31] = vaddq_s16(s2[24], s1[31]);

  // stage 7
  s1[0] = vaddq_s16(s2[0], s2[15]);
  s1[1] = vaddq_s16(s2[1], s2[14]);
  s1[2] = vaddq_s16(s2[2], s2[13]);
  s1[3] = vaddq_s16(s2[3], s2[12]);
  s1[4] = vaddq_s16(s2[4], s2[11]);
  s1[5] = vaddq_s16(s2[5], s2[10]);
  s1[6] = vaddq_s16(s2[6], s2[9]);
  s1[7] = vaddq_s16(s2[7], s2[8]);
  s1[8] = vsubq_s16(s2[7], s2[8]);
  s1[9] = vsubq_s16(s2[6], s2[9]);
  s1[10] = vsubq_s16(s2[5], s2[10]);
  s1[11] = vsubq_s16(s2[4], s2[11]);
  s1[12] = vsubq_s16(s2[3], s2[12]);
  s1[13] = vsubq_s16(s2[2], s2[13]);
  s1[14] = vsubq_s16(s2[1], s2[14]);
  s1[15] = vsubq_s16(s2[0], s2[15]);

  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);

  s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64);

  s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64);

  // final stage
  vst1q_s16(output, vaddq_s16(s1[0], s2[31]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[1], s2[30]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[2], s2[29]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[3], s2[28]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[4], s1[27]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[5], s1[26]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[6], s1[25]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[7], s1[24]));
  output += 8;

  vst1q_s16(output, vaddq_s16(s1[8], s1[23]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[9], s1[22]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[10], s1[21]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[11], s1[20]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[12], s2[19]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[13], s2[18]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[14], s2[17]));
  output += 8;
  vst1q_s16(output, vaddq_s16(s1[15], s2[16]));
  output += 8;

  vst1q_s16(output, vsubq_s16(s1[15], s2[16]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[14], s2[17]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[13], s2[18]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[12], s2[19]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[11], s1[20]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[10], s1[21]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[9], s1[22]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[8], s1[23]));
  output += 8;

  vst1q_s16(output, vsubq_s16(s1[7], s1[24]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[6], s1[25]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[5], s1[26]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[4], s1[27]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[3], s2[28]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[2], s2[29]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[1], s2[30]));
  output += 8;
  vst1q_s16(output, vsubq_s16(s1[0], s2[31]));
}

void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride,
                       const int highbd_flag) {
  int16x8_t in[8], s1[32], s2[32], s3[32], out[32];

  load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4],
                             &in[5], &in[6], &in[7]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64);

  // Different for _8_
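  // s1[19]: input[25] * cospi_7_64 - input[7] * cospi_25_64 (but input[25] == 0)
  // s1[28]: input[25] * cospi_25_64 + input[7] * cospi_7_64 (but input[25] == 0)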
  s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64);

  s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64);

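  // s2[11]: input[26] * cospi_6_64 - input[6] * cospi_26_64 (but input[26] == 0)
  // s2[12]: input[26] * cospi_26_64 + input[6] * cospi_6_64 (but input[26] == 0)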
  s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64);

  // stage 3
  s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64);
  s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64);

  s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31],
                                                    cospi_28_64);
  s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31],
                                                    cospi_4_64);

  // Different for _8_
  s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64,
                                                    s1[28], -cospi_4_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28],
                                                    cospi_28_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64,
                                                    s1[27], cospi_12_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27],
                                                    cospi_20_64);

  s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64,
                                                    s1[24], -cospi_20_64);
  s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64,
                                                    s1[24], cospi_12_64);

  // stage 4
  s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64);

  s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15],
                                                   cospi_24_64);
  s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15],
                                                    cospi_8_64);

  s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64,
                                                    s2[12], -cospi_8_64);
  s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12],
                                                    cospi_24_64);

  s2[16] = vaddq_s16(s1[16], s1[19]);

  s2[17] = vaddq_s16(s1[17], s1[18]);
  s2[18] = vsubq_s16(s1[17], s1[18]);

  s2[19] = vsubq_s16(s1[16], s1[19]);

  s2[20] = vsubq_s16(s1[23], s1[20]);
  s2[21] = vsubq_s16(s1[22], s1[21]);

  s2[22] = vaddq_s16(s1[21], s1[22]);
  s2[23] = vaddq_s16(s1[20], s1[23]);

  s2[24] = vaddq_s16(s1[24], s1[27]);
  s2[25] = vaddq_s16(s1[25], s1[26]);
  s2[26] = vsubq_s16(s1[25], s1[26]);
  s2[27] = vsubq_s16(s1[24], s1[27]);

  s2[28] = vsubq_s16(s1[31], s1[28]);
  s2[29] = vsubq_s16(s1[30], s1[29]);
  s2[30] = vaddq_s16(s1[29], s1[30]);
  s2[31] = vaddq_s16(s1[28], s1[31]);

  // stage 5
  s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64);
  s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64);

  s1[8] = vaddq_s16(s2[8], s2[11]);
  s1[9] = vaddq_s16(s2[9], s2[10]);
  s1[10] = vsubq_s16(s2[9], s2[10]);
  s1[11] = vsubq_s16(s2[8], s2[11]);
  s1[12] = vsubq_s16(s2[15], s2[12]);
  s1[13] = vsubq_s16(s2[14], s2[13]);
  s1[14] = vaddq_s16(s2[13], s2[14]);
  s1[15] = vaddq_s16(s2[12], s2[15]);

  s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29],
                                                    cospi_24_64);
  s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29],
                                                    cospi_8_64);

  s1[19] = multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28],
                                                    cospi_24_64);
  s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28],
                                                    cospi_8_64);

  s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64,
                                                    s2[27], -cospi_8_64);
  s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27],
                                                    cospi_24_64);

  s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64,
                                                    s2[26], -cospi_8_64);
  s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26],
                                                    cospi_24_64);

  // stage 6
  s2[0] = vaddq_s16(s1[0], s1[7]);
  s2[1] = vaddq_s16(s1[0], s1[6]);
  s2[2] = vaddq_s16(s1[0], s1[5]);
  s2[3] = vaddq_s16(s1[0], s1[4]);
  s2[4] = vsubq_s16(s1[0], s1[4]);
  s2[5] = vsubq_s16(s1[0], s1[5]);
  s2[6] = vsubq_s16(s1[0], s1[6]);
  s2[7] = vsubq_s16(s1[0], s1[7]);

  s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64);
  s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64);

  s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64);
  s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64);

  s1[16] = vaddq_s16(s2[16], s2[23]);
  s1[17] = vaddq_s16(s2[17], s2[22]);
  s2[18] = vaddq_s16(s1[18], s1[21]);
  s2[19] = vaddq_s16(s1[19], s1[20]);
  s2[20] = vsubq_s16(s1[19], s1[20]);
  s2[21] = vsubq_s16(s1[18], s1[21]);
  s1[22] = vsubq_s16(s2[17], s2[22]);
  s1[23] = vsubq_s16(s2[16], s2[23]);

  s3[24] = vsubq_s16(s2[31], s2[24]);
  s3[25] = vsubq_s16(s2[30], s2[25]);
  s3[26] = vsubq_s16(s1[29], s1[26]);
  s3[27] = vsubq_s16(s1[28], s1[27]);
  s2[28] = vaddq_s16(s1[27], s1[28]);
  s2[29] = vaddq_s16(s1[26], s1[29]);
  s2[30] = vaddq_s16(s2[25], s2[30]);
  s2[31] = vaddq_s16(s2[24], s2[31]);

  // stage 7
  s1[0] = vaddq_s16(s2[0], s1[15]);
  s1[1] = vaddq_s16(s2[1], s1[14]);
  s1[2] = vaddq_s16(s2[2], s2[13]);
  s1[3] = vaddq_s16(s2[3], s2[12]);
  s1[4] = vaddq_s16(s2[4], s2[11]);
  s1[5] = vaddq_s16(s2[5], s2[10]);
  s1[6] = vaddq_s16(s2[6], s1[9]);
  s1[7] = vaddq_s16(s2[7], s1[8]);
  s1[8] = vsubq_s16(s2[7], s1[8]);
  s1[9] = vsubq_s16(s2[6], s1[9]);
  s1[10] = vsubq_s16(s2[5], s2[10]);
  s1[11] = vsubq_s16(s2[4], s2[11]);
  s1[12] = vsubq_s16(s2[3], s2[12]);
  s1[13] = vsubq_s16(s2[2], s2[13]);
  s1[14] = vsubq_s16(s2[1], s1[14]);
  s1[15] = vsubq_s16(s2[0], s1[15]);

  s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64);
  s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64);

  s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64);
  s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64);

  s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64);
  s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64);

  s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64);
  s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64);

  // final stage
  out[0] = final_add(s1[0], s2[31]);
  out[1] = final_add(s1[1], s2[30]);
  out[2] = final_add(s1[2], s2[29]);
  out[3] = final_add(s1[3], s2[28]);
  out[4] = final_add(s1[4], s1[27]);
  out[5] = final_add(s1[5], s1[26]);
  out[6] = final_add(s1[6], s1[25]);
  out[7] = final_add(s1[7], s1[24]);
  out[8] = final_add(s1[8], s2[23]);
  out[9] = final_add(s1[9], s2[22]);
  out[10] = final_add(s1[10], s1[21]);
  out[11] = final_add(s1[11], s1[20]);
  out[12] = final_add(s1[12], s2[19]);
  out[13] = final_add(s1[13], s2[18]);
  out[14] = final_add(s1[14], s1[17]);
  out[15] = final_add(s1[15], s1[16]);
  out[16] = final_sub(s1[15], s1[16]);
  out[17] = final_sub(s1[14], s1[17]);
  out[18] = final_sub(s1[13], s2[18]);
  out[19] = final_sub(s1[12], s2[19]);
  out[20] = final_sub(s1[11], s1[20]);
  out[21] = final_sub(s1[10], s1[21]);
  out[22] = final_sub(s1[9], s2[22]);
  out[23] = final_sub(s1[8], s2[23]);
  out[24] = final_sub(s1[7], s1[24]);
  out[25] = final_sub(s1[6], s1[25]);
  out[26] = final_sub(s1[5], s1[26]);
  out[27] = final_sub(s1[4], s1[27]);
  out[28] = final_sub(s1[3], s2[28]);
  out[29] = final_sub(s1[2], s2[29]);
  out[30] = final_sub(s1[1], s2[30]);
  out[31] = final_sub(s1[0], s2[31]);

  if (highbd_flag) {
    highbd_add_and_store_bd8(out, output, stride);
  } else {
    uint8_t *const outputT = (uint8_t *)output;
    add_and_store_u8_s16(out + 0, outputT, stride);
    add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride);
    add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride);
    add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride);
  }
}

void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  int i;
  int16_t temp[32 * 8];
  int16_t *t = temp;

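  // First pass: transform the 8 rows that can hold the 34 non-zero
  // coefficients. The 8x32 intermediate result is written to temp column by
  // column (8 values per column) so each second-pass call below can load a
  // contiguous 8x8 block.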
  vpx_idct32_6_neon(input, t);

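  // Second pass: transform 8 columns per iteration and add the reconstructed
  // 32x8 strip to dest.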
  for (i = 0; i < 32; i += 8) {
    vpx_idct32_8_neon(t, dest, stride, 0);
    t += (8 * 8);
    dest += 8;
  }
}