/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vp8_rtcd.h"
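// idct_dequant_0_2x_neon: DC-only path for two horizontally adjacent 4x4
// blocks. Each block reduces to adding a single dequantized, rounded DC
// value to its 4x4 predictor.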
static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
                                   int stride) {
  unsigned char *dst0;
  int i, a0, a1;
  int16x8x2_t q2Add;
  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
  uint8x8_t d2u8, d4u8;
  uint16x8_t q1u16, q2u16;

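  // Dequantize the two DC coefficients, folding in the transform's final
  // rounded >> 3, then clear them in the coefficient buffer.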
  a0 = ((q[0] * dq) + 4) >> 3;
  a1 = ((q[16] * dq) + 4) >> 3;
  q[0] = q[16] = 0;
  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
  q2Add.val[1] = vdupq_n_s16((int16_t)a1);

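  // One 4x4 block per iteration: load four predictor rows as 32-bit lanes,
  // add the block's DC, saturate to 8 bits, and store back.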
  for (i = 0; i < 2; i++, dst += 4) {
    dst0 = dst;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d2s32));
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d4s32));

    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));

    d2s32 = vreinterpret_s32_u8(d2u8);
    d4s32 = vreinterpret_s32_u8(d4u8);

    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
  }
  return;
}

static const int16_t cospi8sqrt2minus1 = 20091;
// sinpi8sqrt2 is really 0x8a8c (35468); because its lowest bit is 0 it can be
// stored pre-shifted by 1, and the doubling in vqdmulh restores the product.
static const int16_t sinpi8sqrt2 = 17734;

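// idct_dequant_full_2x_neon: full path for two horizontally adjacent 4x4
// blocks. Dequantize both, run the two-pass VP8 inverse DCT on them in
// parallel, and add the residual to the predictor.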
static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
                                      unsigned char *dst, int stride) {
  unsigned char *dst0, *dst1;
  int32x2_t d28, d29, d30, d31;
  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
  int16x8_t qEmpty = vdupq_n_s16(0);
  int32x4x2_t q2tmp0, q2tmp1;
  int16x8x2_t q2tmp2, q2tmp3;
  int16x4_t dLow0, dLow1, dHigh0, dHigh1;

  d28 = d29 = d30 = d31 = vdup_n_s32(0);

  // load dq
  q0 = vld1q_s16(dq);
  dq += 8;
  q1 = vld1q_s16(dq);

  // load q, clearing the coefficient block as it is consumed
  q2 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q3 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q4 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q5 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);

  // load src from dst
  dst0 = dst;
  dst1 = dst + 4;
  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
  dst0 += stride;
  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
  dst1 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
  dst0 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
  dst1 += stride;

  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
  dst0 += stride;
  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
  dst1 += stride;
  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

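  // dequantize: element-wise multiply by the dequantization factors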
  q2 = vmulq_s16(q2, q0);
  q3 = vmulq_s16(q3, q1);
  q4 = vmulq_s16(q4, q0);
  q5 = vmulq_s16(q5, q1);

  // vswp: interleave so each q register holds one row from both blocks
  // (q2 = row 0, q4 = row 1, q3 = row 2, q5 = row 3)
  dLow0 = vget_low_s16(q2);
  dHigh0 = vget_high_s16(q2);
  dLow1 = vget_low_s16(q4);
  dHigh1 = vget_high_s16(q4);
  q2 = vcombine_s16(dLow0, dLow1);
  q4 = vcombine_s16(dHigh0, dHigh1);

  dLow0 = vget_low_s16(q3);
  dHigh0 = vget_high_s16(q3);
  dLow1 = vget_low_s16(q5);
  dHigh1 = vget_high_s16(q5);
  q3 = vcombine_s16(dLow0, dLow1);
  q5 = vcombine_s16(dHigh0, dHigh1);

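  // first pass: 1-D inverse transform applied to both blocks at once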
  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

  q10 = vqaddq_s16(q2, q3);
  q11 = vqsubq_s16(q2, q3);

  q8 = vshrq_n_s16(q8, 1);
  q9 = vshrq_n_s16(q9, 1);

  q4 = vqaddq_s16(q4, q8);
  q5 = vqaddq_s16(q5, q9);

  q2 = vqsubq_s16(q6, q5);
  q3 = vqaddq_s16(q7, q4);

  q4 = vqaddq_s16(q10, q3);
  q5 = vqaddq_s16(q11, q2);
  q6 = vqsubq_s16(q11, q2);
  q7 = vqsubq_s16(q10, q3);

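  // transpose both 4x4 blocks for the second pass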
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // second pass: same 1-D inverse transform along the other dimension
  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

  q10 = vshrq_n_s16(q10, 1);
  q11 = vshrq_n_s16(q11, 1);

  q10 = vqaddq_s16(q2tmp2.val[1], q10);
  q11 = vqaddq_s16(q2tmp3.val[1], q11);

  q8 = vqsubq_s16(q8, q11);
  q9 = vqaddq_s16(q9, q10);

  q4 = vqaddq_s16(q2, q9);
  q5 = vqaddq_s16(q3, q8);
  q6 = vqsubq_s16(q3, q8);
  q7 = vqsubq_s16(q2, q9);

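  // final rounded >> 3 of the inverse transform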
  q4 = vrshrq_n_s16(q4, 3);
  q5 = vrshrq_n_s16(q5, 3);
  q6 = vrshrq_n_s16(q6, 3);
  q7 = vrshrq_n_s16(q7, 3);

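  // transpose back to raster order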
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

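  // widen the predictor, add the residual, and saturate back to 8 bits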
  q4 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
  q5 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
  q6 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
  q7 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));

  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

  dst0 = dst;
  dst1 = dst + 4;
  vst1_lane_s32((int32_t *)dst0, d28, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d28, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d29, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d29, 1);
  dst1 += stride;

  vst1_lane_s32((int32_t *)dst0, d30, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d30, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d31, 0);
  vst1_lane_s32((int32_t *)dst1, d31, 1);
  return;
}

void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                       int stride, char *eobs) {
  int i;

  for (i = 0; i < 4; ++i) {
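    // Each eob byte covers one 4x4 block; pairs are tested 16 bits at a
    // time. An eob of 0 or 1 means at most a DC coefficient, so the full
    // transform is needed only when a bit outside 0x0101 is set (& 0xfefe).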
    if (((short *)(eobs))[0]) {
      if (((short *)eobs)[0] & 0xfefe)
        idct_dequant_full_2x_neon(q, dq, dst, stride);
      else
        idct_dequant_0_2x_neon(q, dq[0], dst, stride);
    }

    if (((short *)(eobs))[1]) {
      if (((short *)eobs)[1] & 0xfefe)
        idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride);
      else
        idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride);
    }
    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}

void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                        unsigned char *dst_u,
                                        unsigned char *dst_v, int stride,
                                        char *eobs) {
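  // Same eob scheme as the luma path. Each 8x8 chroma plane holds two pairs
  // of 4x4 blocks, processed top row then bottom row: eobs[0] and [1] cover
  // U, eobs[2] and [3] cover V.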
  if (((short *)(eobs))[0]) {
    if (((short *)eobs)[0] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;
  dst_u += 4 * stride;

  if (((short *)(eobs))[1]) {
    if (((short *)eobs)[1] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;

  if (((short *)(eobs))[2]) {
    if (((short *)eobs)[2] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }

  q += 32;
  dst_v += 4 * stride;

  if (((short *)(eobs))[3]) {
    if (((short *)eobs)[3] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }
}