/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/highbd_idct_neon.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"

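// Add a non-negative DC value to one row of 8 pixels and clamp the sums to
// the largest value representable at the current bit depth. No lower clamp is
// needed because both the pixels and the DC value are non-negative.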
static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
                                                   const int stride,
                                                   const int16x8_t res,
                                                   const int16x8_t max) {
  const uint16x8_t a = vld1q_u16(*dest);
  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
  const int16x8_t c = vminq_s16(b, max);
  vst1q_u16(*dest, vreinterpretq_u16_s16(c));
  *dest += stride;
}

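// Add a negative DC value to one row of 8 pixels. vqshluq_n_s16(b, 0) is a
// saturating signed-to-unsigned shift by 0, which clamps negative sums to 0.
// No upper clamp is needed because the DC value is negative.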
static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
                                                   const int stride,
                                                   const int16x8_t res) {
  const uint16x8_t a = vld1q_u16(*dest);
  const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
  const uint16x8_t c = vqshluq_n_s16(b, 0);
  vst1q_u16(*dest, c);
  *dest += stride;
}

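// DC-only 8x8 inverse transform: input[0] is scaled by cospi_16_64 once per
// 1-D pass, rounded, and the resulting constant a1 is added to all 64 pixels
// of the destination block.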
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  const tran_low_t out0 = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);
  const tran_low_t out1 = HIGHBD_WRAPLOW(
      dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd);
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
  const int16x8_t dc = vdupq_n_s16(a1);

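  // A non-negative DC term only needs the upper clamp applied in the pos
  // kernel; a negative DC term only needs the clamp to zero in the neg kernel.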
  if (a1 >= 0) {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
    highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  } else {
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
    highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  }
}

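// One 1-D pass of the 8-point IDCT for bd == 10, processing 4 columns at a
// time. Only *io0..*io3 are read (the nonzero coefficients are confined to the
// top-left 4x4), so the odd-frequency butterflies of stages 1 and 2 collapse
// to single multiplies. 32-bit arithmetic is sufficient at this bit depth.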
static INLINE void idct8x8_12_half1d_bd10(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x4_t step1[8], step2[8];

  transpose_s32_4x4(io0, io1, io2, io3);

  // stage 1
  step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
  step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
  step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
  step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);

  // stage 2
  step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
  step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
  step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[1], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[1], step2[3]);

  step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
  step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

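// Same transform as idct8x8_12_half1d_bd10(), but for bd == 12 the products of
// the coefficients and the 14-bit cosine constants can overflow 32 bits, so
// the multiplies use 64-bit intermediates (vmull/vmlal/vmlsl) and are rounded
// back down to 32 bits with vrshrn_n_s64.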
static INLINE void idct8x8_12_half1d_bd12(
    const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
    int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
    int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
    int32x4_t *const io7) {
  int32x2_t input1l, input1h, input3l, input3h;
  int32x2_t step1l[2], step1h[2];
  int32x4_t step1[8], step2[8];
  int64x2_t t64[8];
  int32x2_t t32[8];

  transpose_s32_4x4(io0, io1, io2, io3);

  // stage 1
  input1l = vget_low_s32(*io1);
  input1h = vget_high_s32(*io1);
  input3l = vget_low_s32(*io3);
  input3h = vget_high_s32(*io3);
  step1l[0] = vget_low_s32(*io0);
  step1h[0] = vget_high_s32(*io0);
  step1l[1] = vget_low_s32(*io2);
  step1h[1] = vget_high_s32(*io2);

  t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1);
  t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1);
  t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0);
  t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0);
  t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1);
  t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1);
  t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0);
  t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0);
  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
  step1[4] = vcombine_s32(t32[0], t32[1]);
  step1[5] = vcombine_s32(t32[2], t32[3]);
  step1[6] = vcombine_s32(t32[4], t32[5]);
  step1[7] = vcombine_s32(t32[6], t32[7]);

  // stage 2
  t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
  t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
  t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
  t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
  t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
  step2[1] = vcombine_s32(t32[2], t32[3]);
  step2[2] = vcombine_s32(t32[4], t32[5]);
  step2[3] = vcombine_s32(t32[6], t32[7]);

  step2[4] = vaddq_s32(step1[4], step1[5]);
  step2[5] = vsubq_s32(step1[4], step1[5]);
  step2[6] = vsubq_s32(step1[7], step1[6]);
  step2[7] = vaddq_s32(step1[7], step1[6]);

  // stage 3
  step1[0] = vaddq_s32(step2[1], step2[3]);
  step1[1] = vaddq_s32(step2[1], step2[2]);
  step1[2] = vsubq_s32(step2[1], step2[2]);
  step1[3] = vsubq_s32(step2[1], step2[3]);

  t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
  t64[0] =
      vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t64[2] =
      vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
  t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                          vget_high_s32(cospis0), 0);
  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
  step1[5] = vcombine_s32(t32[0], t32[1]);
  step1[6] = vcombine_s32(t32[2], t32[3]);

  // stage 4
  *io0 = vaddq_s32(step1[0], step2[7]);
  *io1 = vaddq_s32(step1[1], step1[6]);
  *io2 = vaddq_s32(step1[2], step1[5]);
  *io3 = vaddq_s32(step1[3], step2[4]);
  *io4 = vsubq_s32(step1[3], step2[4]);
  *io5 = vsubq_s32(step1[2], step1[5]);
  *io6 = vsubq_s32(step1[1], step1[6]);
  *io7 = vsubq_s32(step1[0], step2[7]);
}

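// 8x8 inverse transform for blocks whose nonzero coefficients are confined to
// the top-left 4x4 (the "12" eob specialization). For bd == 8 the values fit
// in 16 bits and the 16-bit pass1/pass2 helpers are reused; for bd == 10/12
// the 32-bit half-1-D routines above are called once for the first pass and
// twice (once per 4-column half) for the second pass.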
void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int32x4_t a[16];
  int16x8_t c[8];

  a[0] = vld1q_s32(input);
  a[1] = vld1q_s32(input + 8);
  a[2] = vld1q_s32(input + 16);
  a[3] = vld1q_s32(input + 24);

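  // With 8-bit output the coefficients fit in 16 bits, so they are narrowed
  // and the 16-bit transform helpers are used.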
  if (bd == 8) {
    const int16x8_t cospis = vld1q_s16(kCospi);
    const int16x8_t cospisd = vaddq_s16(cospis, cospis);
    const int16x4_t cospis0 = vget_low_s16(cospis);     // cospi 0, 8, 16, 24
    const int16x4_t cospisd0 = vget_low_s16(cospisd);   // doubled 0, 8, 16, 24
    const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
    int16x4_t b[8];

    b[0] = vmovn_s32(a[0]);
    b[1] = vmovn_s32(a[1]);
    b[2] = vmovn_s32(a[2]);
    b[3] = vmovn_s32(a[3]);

    idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b);
    idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c);
    c[0] = vrshrq_n_s16(c[0], 5);
    c[1] = vrshrq_n_s16(c[1], 5);
    c[2] = vrshrq_n_s16(c[2], 5);
    c[3] = vrshrq_n_s16(c[3], 5);
    c[4] = vrshrq_n_s16(c[4], 5);
    c[5] = vrshrq_n_s16(c[5], 5);
    c[6] = vrshrq_n_s16(c[6], 5);
    c[7] = vrshrq_n_s16(c[7], 5);
  } else {
    const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

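    // 10-bit coefficients keep the products within 32 bits; 12-bit
    // coefficients need the 64-bit variant to avoid overflow.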
    if (bd == 10) {
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[4], &a[5], &a[6], &a[7]);
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[8], &a[9], &a[10], &a[11]);
      idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
                             &a[12], &a[13], &a[14], &a[15]);
    } else {
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[4], &a[5], &a[6], &a[7]);
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[8], &a[9], &a[10], &a[11]);
      idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7],
                             &a[12], &a[13], &a[14], &a[15]);
    }
    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
    c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
    c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
    c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
    c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
    c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
    c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
  }
  highbd_add8x8(c, dest, stride, bd);
}

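// Full 8x8 inverse transform: all 64 coefficients may be nonzero. Each 1-D
// pass of the high-bit-depth path runs the 8-point transform twice, once per
// 4-column half of the block.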
void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int32x4_t a[16];
  int16x8_t c[8];

  a[0] = vld1q_s32(input);
  a[1] = vld1q_s32(input + 4);
  a[2] = vld1q_s32(input + 8);
  a[3] = vld1q_s32(input + 12);
  a[4] = vld1q_s32(input + 16);
  a[5] = vld1q_s32(input + 20);
  a[6] = vld1q_s32(input + 24);
  a[7] = vld1q_s32(input + 28);
  a[8] = vld1q_s32(input + 32);
  a[9] = vld1q_s32(input + 36);
  a[10] = vld1q_s32(input + 40);
  a[11] = vld1q_s32(input + 44);
  a[12] = vld1q_s32(input + 48);
  a[13] = vld1q_s32(input + 52);
  a[14] = vld1q_s32(input + 56);
  a[15] = vld1q_s32(input + 60);

  if (bd == 8) {
    const int16x8_t cospis = vld1q_s16(kCospi);
    const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
    const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
    int16x8_t b[8];

    b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1]));
    b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3]));
    b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5]));
    b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7]));
    b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9]));
    b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11]));
    b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13]));
    b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15]));

    idct8x8_64_1d_bd8(cospis0, cospis1, b);
    idct8x8_64_1d_bd8(cospis0, cospis1, b);

    c[0] = vrshrq_n_s16(b[0], 5);
    c[1] = vrshrq_n_s16(b[1], 5);
    c[2] = vrshrq_n_s16(b[2], 5);
    c[3] = vrshrq_n_s16(b[3], 5);
    c[4] = vrshrq_n_s16(b[4], 5);
    c[5] = vrshrq_n_s16(b[5], 5);
    c[6] = vrshrq_n_s16(b[6], 5);
    c[7] = vrshrq_n_s16(b[7], 5);
  } else {
    const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
    const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28

    if (bd == 10) {
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[4], &a[5], &a[6], &a[7]);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                             &a[12], &a[13], &a[14], &a[15]);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                             &a[2], &a[10], &a[3], &a[11]);
      idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                             &a[6], &a[14], &a[7], &a[15]);
    } else {
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3],
                             &a[4], &a[5], &a[6], &a[7]);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11],
                             &a[12], &a[13], &a[14], &a[15]);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9],
                             &a[2], &a[10], &a[3], &a[11]);
      idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13],
                             &a[6], &a[14], &a[7], &a[15]);
    }
    c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5));
    c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5));
    c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5));
    c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5));
    c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5));
    c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5));
    c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5));
    c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5));
  }
  highbd_add8x8(c, dest, stride, bd);
}