/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vp8_rtcd.h"

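// Handle the "DC only" case for a pair of horizontally adjacent 4x4 blocks:
// when each block's only nonzero coefficient is the DC term, the inverse
// transform collapses to adding a single per-block constant to every
// predictor pixel.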
static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
                                   int stride) {
  unsigned char *dst0;
  int i, a0, a1;
  int16x8x2_t q2Add;
  int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0);
  uint8x8_t d2u8, d4u8;
  uint16x8_t q1u16, q2u16;

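  // Dequantize the two DC coefficients and apply the DC-only inverse
  // transform, out = (dc * dq + 4) >> 3, then clear the coefficients so the
  // buffer is left zeroed for the next block.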
  a0 = ((q[0] * dq) + 4) >> 3;
  a1 = ((q[16] * dq) + 4) >> 3;
  q[0] = q[16] = 0;
  q2Add.val[0] = vdupq_n_s16((int16_t)a0);
  q2Add.val[1] = vdupq_n_s16((int16_t)a1);

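  // Each iteration reconstructs one 4x4 block: load four predictor rows as
  // 32-bit lanes, widen to 16 bits while adding the DC constant, then
  // saturate back to 8 bits and store.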
  for (i = 0; i < 2; i++, dst += 4) {
    dst0 = dst;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);

    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d2s32));
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
                     vreinterpret_u8_s32(d4s32));

    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
    d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));

    d2s32 = vreinterpret_s32_u8(d2u8);
    d4s32 = vreinterpret_s32_u8(d4u8);

    dst0 = dst;
    vst1_lane_s32((int32_t *)dst0, d2s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d2s32, 1);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 0);
    dst0 += stride;
    vst1_lane_s32((int32_t *)dst0, d4s32, 1);
  }
}

// Fixed-point IDCT constants: (cos(pi/8) * sqrt(2) - 1) and
// sin(pi/8) * sqrt(2), both in Q16.
static const int16_t cospi8sqrt2minus1 = 20091;
// The full Q16 value is 35468 (0x8a8c); because its lowest bit is 0 it can be
// pre-shifted right by one to fit in a signed 16-bit lane, and vqdmulh's
// doubling restores the original scale.
static const int16_t sinpi8sqrt2 = 17734;

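// Dequantize and run the full 4x4 inverse DCT for two horizontally adjacent
// blocks at once, adding the result to the predictor in dst.  The two blocks
// share each 128-bit register, so both transforms proceed in lockstep.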
static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
                                      unsigned char *dst, int stride) {
  unsigned char *dst0, *dst1;
  int32x2_t d28, d29, d30, d31;
  int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
  int16x8_t qEmpty = vdupq_n_s16(0);
  int32x4x2_t q2tmp0, q2tmp1;
  int16x8x2_t q2tmp2, q2tmp3;
  int16x4_t dLow0, dLow1, dHigh0, dHigh1;

  d28 = d29 = d30 = d31 = vdup_n_s32(0);

  // load dq
  q0 = vld1q_s16(dq);
  dq += 8;
  q1 = vld1q_s16(dq);

  // load q, zeroing each block of coefficients as it is read so the buffer
  // is left cleared for the next block
  q2 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q3 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q4 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);
  q += 8;
  q5 = vld1q_s16(q);
  vst1q_s16(q, qEmpty);

  // load the 4x8 predictor region from dst (the two 4x4 blocks side by side)
  dst0 = dst;
  dst1 = dst + 4;
  d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
  dst0 += stride;
  d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
  dst1 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
  dst0 += stride;
  d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
  dst1 += stride;

  d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
  dst0 += stride;
  d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
  dst1 += stride;
  d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
  d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);

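  // Dequantize: q holds the quantized coefficients for both blocks, dq the
  // 16 per-coefficient multipliers, applied to each block in turn.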
  q2 = vmulq_s16(q2, q0);
  q3 = vmulq_s16(q3, q1);
  q4 = vmulq_s16(q4, q0);
  q5 = vmulq_s16(q5, q1);

  // vswp: swap 64-bit halves so that q2, q4, q3, q5 hold rows 0, 1, 2, 3 of
  // both blocks (block 0 in the low half of each register, block 1 in the
  // high half)
  dLow0 = vget_low_s16(q2);
  dHigh0 = vget_high_s16(q2);
  dLow1 = vget_low_s16(q4);
  dHigh1 = vget_high_s16(q4);
  q2 = vcombine_s16(dLow0, dLow1);
  q4 = vcombine_s16(dHigh0, dHigh1);

  dLow0 = vget_low_s16(q3);
  dHigh0 = vget_high_s16(q3);
  dLow1 = vget_low_s16(q5);
  dHigh1 = vget_high_s16(q5);
  q3 = vcombine_s16(dLow0, dLow1);
  q5 = vcombine_s16(dHigh0, dHigh1);

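  // First pass: the 4-point IDCT down the columns.  vqdmulh doubles the Q16
  // product, so the sinpi8sqrt2 results are already at full scale, while the
  // cospi8sqrt2minus1 results must be halved again (the vshrq below).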
  q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
  q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
  q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
  q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);

  q10 = vqaddq_s16(q2, q3);
  q11 = vqsubq_s16(q2, q3);

  q8 = vshrq_n_s16(q8, 1);
  q9 = vshrq_n_s16(q9, 1);

  q4 = vqaddq_s16(q4, q8);
  q5 = vqaddq_s16(q5, q9);

  q2 = vqsubq_s16(q6, q5);
  q3 = vqaddq_s16(q7, q4);

  q4 = vqaddq_s16(q10, q3);
  q5 = vqaddq_s16(q11, q2);
  q6 = vqsubq_s16(q11, q2);
  q7 = vqsubq_s16(q10, q3);

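  // Transpose the two 4x4 blocks (vtrn on 32-bit, then 16-bit elements) so
  // the second pass can run over what were the rows.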
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

  // second pass: the same 4-point IDCT, now across the rows
  q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
  q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
  q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
  q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);

  q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
  q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);

  q10 = vshrq_n_s16(q10, 1);
  q11 = vshrq_n_s16(q11, 1);

  q10 = vqaddq_s16(q2tmp2.val[1], q10);
  q11 = vqaddq_s16(q2tmp3.val[1], q11);

  q8 = vqsubq_s16(q8, q11);
  q9 = vqaddq_s16(q9, q10);

  q4 = vqaddq_s16(q2, q9);
  q5 = vqaddq_s16(q3, q8);
  q6 = vqsubq_s16(q3, q8);
  q7 = vqsubq_s16(q2, q9);

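  // The final output scaling in VP8 is (x + 4) >> 3; vrshrq performs the
  // rounding shift in one step.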
  q4 = vrshrq_n_s16(q4, 3);
  q5 = vrshrq_n_s16(q5, 3);
  q6 = vrshrq_n_s16(q6, 3);
  q7 = vrshrq_n_s16(q7, 3);

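  // Transpose back so each register again holds one full row of both blocks,
  // matching the predictor rows in d28..d31.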
  q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
  q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
  q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
                     vreinterpretq_s16_s32(q2tmp1.val[0]));
  q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
                     vreinterpretq_s16_s32(q2tmp1.val[1]));

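  // Reconstruct: widen the predictor rows to 16 bits, add the residual, and
  // saturate back to 8-bit pixels.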
  q4 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28)));
  q5 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29)));
  q6 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30)));
  q7 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31)));

  d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
  d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
  d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
  d31 = vreinterpret_s32_u8(vqmovun_s16(q7));

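  // store the two reconstructed 4x4 blocks back side by side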
  dst0 = dst;
  dst1 = dst + 4;
  vst1_lane_s32((int32_t *)dst0, d28, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d28, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d29, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d29, 1);
  dst1 += stride;

  vst1_lane_s32((int32_t *)dst0, d30, 0);
  dst0 += stride;
  vst1_lane_s32((int32_t *)dst1, d30, 1);
  dst1 += stride;
  vst1_lane_s32((int32_t *)dst0, d31, 0);
  vst1_lane_s32((int32_t *)dst1, d31, 1);
}

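// eobs stores one end-of-block count per 4x4 block.  Reading two counts at a
// time as a short, a nonzero value means at least one block of the pair needs
// an inverse transform, and the mask 0xfefe (each byte's bits above bit 0)
// tells whether any block has AC coefficients: if not, each count is 0 or 1,
// so only the DC terms can be nonzero and the cheap path suffices.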
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                       int stride, char *eobs) {
  int i;

  for (i = 0; i < 4; ++i) {
    if (((short *)(eobs))[0]) {
      if (((short *)eobs)[0] & 0xfefe)
        idct_dequant_full_2x_neon(q, dq, dst, stride);
      else
        idct_dequant_0_2x_neon(q, dq[0], dst, stride);
    }

    if (((short *)(eobs))[1]) {
      if (((short *)eobs)[1] & 0xfefe)
        idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride);
      else
        idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride);
    }
    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}

void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                        unsigned char *dst_u,
                                        unsigned char *dst_v, int stride,
                                        char *eobs) {
  if (((short *)(eobs))[0]) {
    if (((short *)eobs)[0] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;
  dst_u += 4 * stride;

  if (((short *)(eobs))[1]) {
    if (((short *)eobs)[1] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_u, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_u, stride);
  }

  q += 32;

  if (((short *)(eobs))[2]) {
    if (((short *)eobs)[2] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }

  q += 32;
  dst_v += 4 * stride;

  if (((short *)(eobs))[3]) {
    if (((short *)eobs)[3] & 0xfefe)
      idct_dequant_full_2x_neon(q, dq, dst_v, stride);
    else
      idct_dequant_0_2x_neon(q, dq[0], dst_v, stride);
  }
}