/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_

#include <arm_neon.h>

#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"

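// In-place 4-point inverse ADST on four columns at once: x0/x2 are the low
// and high halves of io[0], x1/x3 the low and high halves of io[1]. As in
// the scalar reference iadst4_c(), this computes (before the final
// round/shift back to 16 bits):
//   out0 = sinpi_1_9 * x0 + sinpi_3_9 * x1 + sinpi_4_9 * x2 + sinpi_2_9 * x3
//   out1 = sinpi_2_9 * x0 + sinpi_3_9 * x1 - sinpi_1_9 * x2 - sinpi_4_9 * x3
//   out2 = sinpi_3_9 * (x0 - x2 + x3)
//   out3 = (sinpi_1_9 + sinpi_2_9) * x0 - sinpi_3_9 * x1
//          + (sinpi_4_9 - sinpi_1_9) * x2 + (sinpi_2_9 - sinpi_4_9) * x3
// All products are widened to 32 bits before being summed.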
static INLINE void iadst4(int16x8_t *const io) {
  const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
  int16x4_t x[4];
  int32x4_t s[8], output[4];
  const int16x4_t c =
      create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);

  x[0] = vget_low_s16(io[0]);
  x[1] = vget_low_s16(io[1]);
  x[2] = vget_high_s16(io[0]);
  x[3] = vget_high_s16(io[1]);

  s[0] = vmull_lane_s16(x[0], c, 0);
  s[1] = vmull_lane_s16(x[0], c, 1);
  s[2] = vmull_lane_s16(x[1], c, 2);
  s[3] = vmull_lane_s16(x[2], c, 3);
  s[4] = vmull_lane_s16(x[2], c, 0);
  s[5] = vmull_lane_s16(x[3], c, 1);
  s[6] = vmull_lane_s16(x[3], c, 3);
  s[7] = vaddl_s16(x[0], x[3]);
  s[7] = vsubw_s16(s[7], x[2]);

  s[0] = vaddq_s32(s[0], s[3]);
  s[0] = vaddq_s32(s[0], s[5]);
  s[1] = vsubq_s32(s[1], s[4]);
  s[1] = vsubq_s32(s[1], s[6]);
  s[3] = s[2];
  s[2] = vmulq_s32(c3, s[7]);

  output[0] = vaddq_s32(s[0], s[3]);
  output[1] = vaddq_s32(s[1], s[3]);
  output[2] = s[2];
  output[3] = vaddq_s32(s[0], s[1]);
  output[3] = vsubq_s32(output[3], s[3]);
  dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
}

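// Half butterfly with a single coefficient:
//   x[0] = round(c[0] * (x[0] + x[1]))
//   x[1] = round(c[0] * (x[0] - x[1]))
// The add/sub is performed on the widened 32-bit products; see the comment
// inside the function.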
static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
                                             const int16x4_t c) {
  // Don't add/subtract before the multiply: the 16-bit sums would overflow
  // in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
  int32x4_t t0[2], t1[2];

  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);
  x[0] = dct_const_round_shift_low_8(t0);
  x[1] = dct_const_round_shift_low_8(t1);
}

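// As above, but multiplies by lane 1 of c and swaps the destinations:
// *x1 receives the rounded sum and *x0 the rounded difference. The _neg/_pos
// suffixes presumably refer to the sign of the constant the caller loads
// into the lane being used.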
static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0,
                                                 int16x8_t *const x1,
                                                 const int16x4_t c) {
  // Don't add/subtract before the multiply: the 16-bit sums would overflow
  // in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1);
  int32x4_t t0[2], t1[2];

  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);
  *x1 = dct_const_round_shift_low_8(t0);
  *x0 = dct_const_round_shift_low_8(t1);
}

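// Identical to the _neg variant except that the multiplies use lane 0 of c.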
static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0,
                                                 int16x8_t *const x1,
                                                 const int16x4_t c) {
  // Don't add/subtract before the multiply: the 16-bit sums would overflow
  // in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0);
  int32x4_t t0[2], t1[2];

  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);
  *x1 = dct_const_round_shift_low_8(t0);
  *x0 = dct_const_round_shift_low_8(t1);
}

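// Butterfly rotation with two coefficients taken from lanes 0 and 1 of c:
//   s0 = c[0] * in0 + c[1] * in1
//   s1 = c[1] * in0 - c[0] * in1
// Outputs stay in 32 bits; rounding and narrowing are left to the caller.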
static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
                                                 const int16x8_t in1,
                                                 const int16x4_t c,
                                                 int32x4_t *const s0,
                                                 int32x4_t *const s1) {
  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
}

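// Same rotation with the coefficients taken from lanes 2 and 3 of c:
//   s0 = c[2] * in0 + c[3] * in1
//   s1 = c[3] * in0 - c[2] * in1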
static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0,
                                                 const int16x8_t in1,
                                                 const int16x4_t c,
                                                 int32x4_t *const s0,
                                                 int32x4_t *const s1) {
  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
}

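// Same rotation with the lane roles swapped:
//   s0 = c[1] * in0 + c[0] * in1
//   s1 = c[0] * in0 - c[1] * in1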
static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0,
                                                 const int16x8_t in1,
                                                 const int16x4_t c,
                                                 int32x4_t *const s0,
                                                 int32x4_t *const s1) {
  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);
}

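// Same rotation with lanes 3 and 2 swapped:
//   s0 = c[3] * in0 + c[2] * in1
//   s1 = c[2] * in0 - c[3] * in1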
static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0,
                                                 const int16x8_t in1,
                                                 const int16x4_t c,
                                                 int32x4_t *const s0,
                                                 int32x4_t *const s1) {
  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
}

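// Adds two widened butterfly outputs (low/high 32-bit halves) and rounds the
// sum back down to eight 16-bit lanes.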
static INLINE int16x8_t add_dct_const_round_shift_low_8(
    const int32x4_t *const in0, const int32x4_t *const in1) {
  int32x4_t sum[2];

  sum[0] = vaddq_s32(in0[0], in1[0]);
  sum[1] = vaddq_s32(in0[1], in1[1]);
  return dct_const_round_shift_low_8(sum);
}

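// As above, but for the difference of the two inputs.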
static INLINE int16x8_t sub_dct_const_round_shift_low_8(
    const int32x4_t *const in0, const int32x4_t *const in1) {
  int32x4_t diff[2];

  diff[0] = vsubq_s32(in0[0], in1[0]);
  diff[1] = vsubq_s32(in0[1], in1[1]);
  return dct_const_round_shift_low_8(diff);
}

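// In-place 8-point inverse ADST on eight 8-lane vectors, following the
// scalar reference iadst8_c(): an input permutation, two butterfly stages of
// cospi rotations, a final half-butterfly stage, and negation of the
// odd-indexed outputs.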
static INLINE void iadst8(int16x8_t *const io) {
  const int16x4_t c0 =
      create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int16x4_t c1 =
      create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int16x4_t c2 =
      create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int16x8_t x[8], t[4];
  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];

  x[0] = io[7];
  x[1] = io[0];
  x[2] = io[5];
  x[3] = io[2];
  x[4] = io[3];
  x[5] = io[4];
  x[6] = io[1];
  x[7] = io[6];

  // stage 1
  iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1);
  iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3);
  iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5);
  iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7);

  x[0] = add_dct_const_round_shift_low_8(s0, s4);
  x[1] = add_dct_const_round_shift_low_8(s1, s5);
  x[2] = add_dct_const_round_shift_low_8(s2, s6);
  x[3] = add_dct_const_round_shift_low_8(s3, s7);
  x[4] = sub_dct_const_round_shift_low_8(s0, s4);
  x[5] = sub_dct_const_round_shift_low_8(s1, s5);
  x[6] = sub_dct_const_round_shift_low_8(s2, s6);
  x[7] = sub_dct_const_round_shift_low_8(s3, s7);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
  iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);

  x[0] = vaddq_s16(t[0], t[2]);
  x[1] = vaddq_s16(t[1], t[3]);
  x[2] = vsubq_s16(t[0], t[2]);
  x[3] = vsubq_s16(t[1], t[3]);
  x[4] = add_dct_const_round_shift_low_8(s4, s6);
  x[5] = add_dct_const_round_shift_low_8(s5, s7);
  x[6] = sub_dct_const_round_shift_low_8(s4, s6);
  x[7] = sub_dct_const_round_shift_low_8(s5, s7);

  // stage 3
  iadst_half_butterfly_neon(x + 2, c2);
  iadst_half_butterfly_neon(x + 6, c2);

  io[0] = x[0];
  io[1] = vnegq_s16(x[4]);
  io[2] = x[6];
  io[3] = vnegq_s16(x[2]);
  io[4] = x[3];
  io[5] = vnegq_s16(x[7]);
  io[6] = x[5];
  io[7] = vnegq_s16(x[1]);
}

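// One 1-D pass of the 16x16 inverse ADST over half of the block. dest is
// typed void * so the same signature can serve both bit depths; when
// highbd_flag is nonzero it is presumably treated as a uint16_t pixel
// buffer, otherwise as uint8_t.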
void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output,
                                   void *const dest, const int stride,
                                   const int highbd_flag);

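// Signature shared by all 1-D row/column transform passes.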
typedef void (*iht_1d)(const void *const input, int16_t *output,
                       void *const dest, const int stride,
                       const int highbd_flag);

typedef struct {
  iht_1d cols, rows;  // vertical and horizontal
} iht_2d;
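
// Minimal usage sketch (hypothetical helper names; the real dispatch tables
// live in the transform .c files):
//   const iht_2d ht = { /*cols=*/col_pass_fn, /*rows=*/row_pass_fn };
//   ht.rows(coeffs, row_out, dest, stride, highbd_flag);  // horizontal pass
//   ht.cols(row_out, col_out, dest, stride, highbd_flag); // vertical pass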

#endif  // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_