/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
#include "vpx_dsp/txfm_common.h"

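// Transpose a 4x4 block of 16-bit coefficients held in two quad registers
// (*q8s16 = rows 0-1, *q9s16 = rows 2-3): the 16-bit vtrn transposes each
// 2x2 sub-block formed by a pair of rows, and the 32-bit vtrn then swaps
// the off-diagonal 2x2 sub-blocks, completing the transpose.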
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
  int32x4_t q8s32, q9s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;

  d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
  d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));

  q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
  q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
  q0x2s32 = vtrnq_s32(q8s32, q9s32);

  *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
  *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
}

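// Broadcast the Q14 fixed-point cosine constants used by the 4-point IDCT;
// cospi_k_64 == round(16384 * cos(k * pi / 64)) (see vpx_dsp/txfm_common.h).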
static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
                                             int16x4_t *d2s16) {
  *d0s16 = vdup_n_s16(cospi_8_64);
  *d1s16 = vdup_n_s16(cospi_16_64);
  *d2s16 = vdup_n_s16(cospi_24_64);
}

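// Broadcast the Q14 fixed-point sine constants used by the 4-point ADST
// (sinpi_1_9 .. sinpi_4_9 from vpx_dsp/txfm_common.h). sinpi_3_9 fills a
// full quad register, though IADST4x4_1D only reads its low half.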
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
                                           int16x4_t *d5s16, int16x8_t *q3s16) {
  *d3s16 = vdup_n_s16(sinpi_1_9);
  *d4s16 = vdup_n_s16(sinpi_2_9);
  *q3s16 = vdupq_n_s16(sinpi_3_9);
  *d5s16 = vdup_n_s16(sinpi_4_9);
}

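// One 1-D pass of the 4-point IDCT over all four columns at once, with
// inputs x0..x3 in the four halves of *q8s16/*q9s16 (d0s16/d1s16/d2s16 hold
// cospi_8_64, cospi_16_64 and cospi_24_64). The butterfly computes
//   a = (x0 + x2) * cospi_16_64        b = (x0 - x2) * cospi_16_64
//   c = x1 * cospi_24_64 - x3 * cospi_8_64
//   d = x1 * cospi_8_64  + x3 * cospi_24_64
// each rounded and narrowed by DCT_CONST_BITS (14), and the outputs are
// { a + d, b + c, b - c, a - d }.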
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
                              int16x4_t *d2s16, int16x8_t *q8s16,
                              int16x8_t *q9s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int32x4_t q10s32, q13s32, q14s32, q15s32;
  int16x8_t q13s16, q14s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, *d2s16);
  q10s32 = vmull_s16(d17s16, *d0s16);
  q13s32 = vmull_s16(d23s16, *d1s16);
  q14s32 = vmull_s16(d24s16, *d1s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);

  d26s16 = vrshrn_n_s32(q13s32, 14);
  d27s16 = vrshrn_n_s32(q14s32, 14);
  d29s16 = vrshrn_n_s32(q15s32, 14);
  d28s16 = vrshrn_n_s32(q10s32, 14);

  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  *q8s16 = vaddq_s16(q13s16, q14s16);
  *q9s16 = vsubq_s16(q13s16, q14s16);
  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
}

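// One 1-D pass of the 4-point ADST, matching vp9's scalar iadst4(). With
// inputs x0..x3 in the four halves of *q8s16/*q9s16, it computes
//   s0 = sinpi_1_9 * x0 + sinpi_4_9 * x2 + sinpi_2_9 * x3
//   s1 = sinpi_2_9 * x0 - sinpi_1_9 * x2 - sinpi_4_9 * x3
//   s2 = sinpi_3_9 * (x0 - x2 + x3)
//   s3 = sinpi_3_9 * x1
// and emits { s0 + s3, s1 + s3, s2, s0 + s1 - s3 }, each rounded and
// narrowed by DCT_CONST_BITS (14).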
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vrshrn_n_s32(q13s32, 14);
  d17s16 = vrshrn_n_s32(q14s32, 14);
  d18s16 = vrshrn_n_s32(q15s32, 14);
  d19s16 = vrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
}

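// 4x4 inverse hybrid transform with reconstruction. tx_type selects the
// (column, row) transform pair: 0 = DCT_DCT (delegated to the C version),
// 1 = ADST_DCT, 2 = DCT_ADST, 3 = ADST_ADST. Rows are transformed first,
// the block is transposed, columns are transformed, and the residual is
// rounded, added to dest and saturated to 8 bits.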
void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  uint8x8_t d26u8, d27u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
  uint32x2_t d26u32, d27u32;
  int16x8_t q3s16, q8s16, q9s16;
  uint16x8_t q8u16, q9u16;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  TRANSPOSE4X4(&q8s16, &q9s16);

  switch (tx_type) {
    case 0:  // idct_idct is not supported. Fall back to C
      vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
      return;
    case 1:  // iadst_idct
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    case 2:  // idct_iadst
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      break;
    case 3:  // iadst_iadst
      // generate constants
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    default:  // unreachable; all valid tx_type values are handled above
      assert(0);
      break;
  }

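  // Undo the transform's remaining scaling with a rounding shift,
  // (x + 8) >> 4, i.e. ROUND_POWER_OF_TWO(x, 4), then reconstruct: load
  // the four 4-byte destination rows, add the residual, saturate to
  // [0, 255] and store the rows back in reverse order.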
  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
  dest += stride;
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
  dest += stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
  dest += stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
  dest -= stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
  dest -= stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
  dest -= stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
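
/* Usage sketch: this function is normally dispatched through the RTCD
 * table as vp9_iht4x4_16_add(). A minimal direct call might look like
 * the following (coeffs, frame, row and col are hypothetical names; the
 * ADST_ADST enumerator comes from the TX_TYPE enum in
 * vp9/common/vp9_enums.h):
 *
 *   tran_low_t coeffs[16];  // 16 dequantized coefficients, row-major
 *   uint8_t *recon = frame + row * stride + col;  // 4x4 block to rebuild
 *   vp9_iht4x4_16_add_neon(coeffs, recon, stride, ADST_ADST);  // tx_type 3
 */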