/*
 * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>  // assert() is used directly in the ADST_ADST path below.

#include "./vpx_dsp_rtcd.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/arm/neon/vp9_iht_neon.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"

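// Half butterfly for the final IADST stage: multiplies (x[0] + x[1]) and
// (x[0] - x[1]) by the constant in lane 0 of c (cospi_16_64 at the call
// sites below), widening to 64 bits and narrowing back with the usual
// DCT_CONST_BITS rounding shift so high-bitdepth intermediates cannot
// overflow 32 bits.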
static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x,
                                                    const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t sub = vsubq_s32(x[0], x[1]);
  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);

  x[0] = vcombine_s32(out0_lo, out0_hi);
  x[1] = vcombine_s32(out1_lo, out1_hi);
}

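// Complex butterfly using lanes (0, 1) of c:
//   s0 = c[0] * in0 + c[1] * in1
//   s1 = c[1] * in0 - c[0] * in1
// Results stay as unrounded 64-bit sums (two int64x2_t halves per output)
// so that pairs of butterflies can be added or subtracted before a single
// rounding shift.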
static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0,
                                                        const int32x4_t in1,
                                                        const int32x2_t c,
                                                        int64x2_t *const s0,
                                                        int64x2_t *const s1) {
  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);

  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1);
  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0);
  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1);
  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0);
}

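// Same butterfly with the constant lanes swapped:
//   s0 = c[1] * in0 + c[0] * in1
//   s1 = c[0] * in0 - c[1] * in1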
static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0,
                                                        const int32x4_t in1,
                                                        const int32x2_t c,
                                                        int64x2_t *const s0,
                                                        int64x2_t *const s1) {
  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);

  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0);
  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1);
  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0);
  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1);
}

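// Round and narrow (in0 + in1) >> DCT_CONST_BITS from 64 back to 32 bits,
// combining the low and high halves into one int32x4_t.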
static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8(
    const int64x2_t *const in0, const int64x2_t *const in1) {
  const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]);
  const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]);
  const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS);
  const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS);
  return vcombine_s32(out_lo, out_hi);
}

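// As above, but for (in0 - in1) >> DCT_CONST_BITS.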
static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8(
    const int64x2_t *const in0, const int64x2_t *const in1) {
  const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]);
  const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]);
  const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS);
  const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS);
  return vcombine_s32(out_lo, out_hi);
}

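// 8-point inverse ADST applied to four transforms in parallel, one per
// 32-bit lane of each ioN vector. The structure mirrors the scalar vp9
// iadst8(): an input permutation, two butterfly stages with DCT_CONST_BITS
// rounding, a half-butterfly stage, and sign flips on the odd outputs.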
static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1,
                                 int32x4_t *const io2, int32x4_t *const io3,
                                 int32x4_t *const io4, int32x4_t *const io5,
                                 int32x4_t *const io6, int32x4_t *const io7) {
  const int32x4_t c0 =
      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int32x4_t c1 =
      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int32x4_t c2 =
      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int32x4_t x[8], t[4];
  int64x2_t s[8][2];

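  // Input permutation, as in the scalar iadst8 reference.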
  x[0] = *io7;
  x[1] = *io0;
  x[2] = *io5;
  x[3] = *io2;
  x[4] = *io3;
  x[5] = *io4;
  x[6] = *io1;
  x[7] = *io6;

  // stage 1
  highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0],
                                       s[1]);
  highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2],
                                       s[3]);
  highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4],
                                       s[5]);
  highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6],
                                       s[7]);

  x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]);
  x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]);
  x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]);
  x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]);
  x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]);
  x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]);
  x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]);
  x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4],
                                       s[5]);
  highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7],
                                       s[6]);

  x[0] = vaddq_s32(t[0], t[2]);
  x[1] = vaddq_s32(t[1], t[3]);
  x[2] = vsubq_s32(t[0], t[2]);
  x[3] = vsubq_s32(t[1], t[3]);
  x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]);
  x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]);
  x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]);
  x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]);

  // stage 3
  highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2));
  highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2));

  *io0 = x[0];
  *io1 = vnegq_s32(x[4]);
  *io2 = x[6];
  *io3 = vnegq_s32(x[2]);
  *io4 = x[3];
  *io5 = vnegq_s32(x[7]);
  *io6 = x[5];
  *io7 = vnegq_s32(x[1]);
}

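// 8x8 hybrid inverse transform: one 1-D inverse DCT or ADST pass in each
// direction, selected by tx_type, followed by a rounding shift and a
// clipped accumulation into the prediction in dest.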
void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                   int stride, int tx_type, int bd) {
  int32x4_t a[16];
  int16x8_t c[8];

  a[0] = vld1q_s32(input);
  a[1] = vld1q_s32(input + 4);
  a[2] = vld1q_s32(input + 8);
  a[3] = vld1q_s32(input + 12);
  a[4] = vld1q_s32(input + 16);
  a[5] = vld1q_s32(input + 20);
  a[6] = vld1q_s32(input + 24);
  a[7] = vld1q_s32(input + 28);
  a[8] = vld1q_s32(input + 32);
  a[9] = vld1q_s32(input + 36);
  a[10] = vld1q_s32(input + 40);
  a[11] = vld1q_s32(input + 44);
  a[12] = vld1q_s32(input + 48);
  a[13] = vld1q_s32(input + 52);
  a[14] = vld1q_s32(input + 56);
  a[15] = vld1q_s32(input + 60);

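  // With bd == 8 the dequantized coefficients fit in 16 bits, so narrow to
  // int16x8_t and reuse the faster standard-bitdepth 8x8 kernels.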
  if (bd == 8) {
    c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
    c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
    c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
    c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
    c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
    c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
    c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
    c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));

    switch (tx_type) {
      case DCT_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      case ADST_DCT: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }

      case DCT_ADST: {
        const int16x8_t cospis = vld1q_s16(kCospi);
        const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
        const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28

        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        idct8x8_64_1d_bd8(cospis0, cospis1, c);
        break;
      }

      default: {
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6],
                          &c[7]);
        iadst8(c);
        break;
      }
    }

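    // Final rounding shift by 5, matching ROUND_POWER_OF_TWO(., 5) in the
    // scalar 8x8 inverse transform.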
    c[0] = vrshrq_n_s16(c[0], 5);
    c[1] = vrshrq_n_s16(c[1], 5);
    c[2] = vrshrq_n_s16(c[2], 5);
    c[3] = vrshrq_n_s16(c[3], 5);
    c[4] = vrshrq_n_s16(c[4], 5);
    c[5] = vrshrq_n_s16(c[5], 5);
    c[6] = vrshrq_n_s16(c[6], 5);
    c[7] = vrshrq_n_s16(c[7], 5);
  } else {
    switch (tx_type) {
      case DCT_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        if (bd == 10) {
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        } else {
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                                 &a[4], &a[5], &a[6], &a[7]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                                 &a[12], &a[13], &a[14], &a[15]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                                 &a[2], &a[10], &a[3], &a[11]);
          idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                                 &a[6], &a[14], &a[7], &a[15]);
        }
        break;
      }

      case ADST_DCT: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                               &a[4], &a[5], &a[6], &a[7]);
        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                               &a[12], &a[13], &a[14], &a[15]);
        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                          &a[11]);
        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                          &a[15]);
        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        break;
      }

      case DCT_ADST: {
        const int32x4_t cospis0 = vld1q_s32(kCospi32);  // cospi 0, 8, 16, 24
        const int32x4_t cospis1 =
            vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                          &a[7]);
        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                          &a[15]);
        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                               &a[2], &a[10], &a[3], &a[11]);
        idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                               &a[6], &a[14], &a[7], &a[15]);
        break;
      }

      default: {
        assert(tx_type == ADST_ADST);
        transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                          &a[7]);
        highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
        transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                          &a[15]);
        highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14],
                      &a[15]);
        transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3],
                          &a[11]);
        highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]);
        transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                          &a[15]);
        highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7],
                      &a[15]);
        break;
      }
    }

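    // Recombine the 32-bit halves into 16-bit rows with the final rounding
    // shift by 5; the a[] index pairs follow the 4-lane tile layout left by
    // the half-width passes above.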
    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
  }
  highbd_add8x8(c, dest, stride, bd);
}