1 /*
2 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16
17 //------------------------------------------------------------------------------
18 // DC 4x4
19
// Sum the four 16-bit samples at ref; the total ends up in every lane.
static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
  const uint16x4_t a = vld1_u16(ref);
  const uint16x4_t pair_sum = vpadd_u16(a, a);
  return vpadd_u16(pair_sum, pair_sum);
}
25
// Broadcast lane 0 of dc across each of the four 4-pixel rows of dst.
static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  const uint16x4_t row = vdup_lane_u16(dc, 0);
  int r;
  for (r = 0; r < 4; ++r) {
    vst1_u16(dst, row);
    dst += stride;
  }
}
34
// DC prediction for 4x4: every output pixel is the rounded average of the
// 4 above and 4 left border pixels.
void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x4_t above_u16 = vld1_u16(above);
  const uint16x4_t left_u16 = vld1_u16(left);
  uint16x4_t total = vadd_u16(above_u16, left_u16);
  uint16x4_t dc;
  (void)bd;
  // Horizontal reduction, then round-shift by 3 (divide by 8 border pixels).
  total = vpadd_u16(total, total);
  total = vpadd_u16(total, total);
  dc = vrshr_n_u16(total, 3);
  dc_store_4x4(dst, stride, dc);
}
49
// DC prediction from the left column only (above row unavailable).
void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16x4_t left_sum = dc_sum_4(left);
  // Round-shift by 2 averages the 4 left-column pixels.
  const uint16x4_t dc = vrshr_n_u16(left_sum, 2);
  (void)above;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}
59
// DC prediction from the above row only (left column unavailable).
void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t above_sum = dc_sum_4(above);
  // Round-shift by 2 averages the 4 above-row pixels.
  const uint16x4_t dc = vrshr_n_u16(above_sum, 2);
  (void)left;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}
69
// DC prediction with no border pixels: use the mid-range value for the
// bit depth, i.e. half the representable range.
void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16((uint16_t)(1 << (bd - 1)));
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, dc);
}
78
79 //------------------------------------------------------------------------------
80 // DC 8x8
81
// Sum the eight 16-bit samples at ref; the total ends up in every lane.
static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
  const uint16x8_t a = vld1q_u16(ref);
  uint16x4_t total = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  total = vpadd_u16(total, total);
  return vpadd_u16(total, total);
}
88
// Broadcast lane 0 of dc across each of the eight 8-pixel rows of dst.
static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  const uint16x8_t row = vdupq_lane_u16(dc, 0);
  int r;
  for (r = 0; r < 8; ++r) {
    vst1q_u16(dst, row);
    dst += stride;
  }
}
97
// DC prediction for 8x8: average of the 8 above and 8 left border pixels.
void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x8_t a = vld1q_u16(above);
  const uint16x8_t l = vld1q_u16(left);
  const uint16x8_t al = vaddq_u16(a, l);
  uint16x4_t total = vadd_u16(vget_low_u16(al), vget_high_u16(al));
  uint16x4_t dc;
  (void)bd;
  total = vpadd_u16(total, total);
  total = vpadd_u16(total, total);
  // 16 border pixels -> round-shift by 4.
  dc = vrshr_n_u16(total, 4);
  dc_store_8x8(dst, stride, dc);
}
112
// DC prediction from the left column only.
void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16x4_t left_sum = dc_sum_8(left);
  // Round-shift by 3 averages the 8 left-column pixels.
  const uint16x4_t dc = vrshr_n_u16(left_sum, 3);
  (void)above;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}
122
// DC prediction from the above row only.
void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t above_sum = dc_sum_8(above);
  // Round-shift by 3 averages the 8 above-row pixels.
  const uint16x4_t dc = vrshr_n_u16(above_sum, 3);
  (void)left;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}
132
// DC prediction with no border pixels: fill with the bit-depth mid-range.
void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16((uint16_t)(1 << (bd - 1)));
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, dc);
}
141
142 //------------------------------------------------------------------------------
143 // DC 16x16
144
// Sum sixteen 16-bit samples at ref; the total ends up in every lane.
static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) {
  // vld2q de-interleaves, but addition is order-independent, so the
  // two-register load is simply a way to fetch 16 values at once.
  const uint16x8x2_t a = vld2q_u16(ref);
  const uint16x8_t partial = vaddq_u16(a.val[0], a.val[1]);
  uint16x4_t total = vadd_u16(vget_low_u16(partial), vget_high_u16(partial));
  total = vpadd_u16(total, total);
  return vpadd_u16(total, total);
}
152
// Broadcast lane 0 of dc across each of the sixteen 16-pixel rows of dst.
static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x4_t dc) {
  uint16x8x2_t row;
  int r;
  row.val[0] = row.val[1] = vdupq_lane_u16(dc, 0);
  for (r = 0; r < 16; ++r) {
    vst2q_u16(dst, row);
    dst += stride;
  }
}
162
// DC prediction for 16x16: average of the 16 above and 16 left pixels.
void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8x2_t a = vld2q_u16(above);
  const uint16x8x2_t l = vld2q_u16(left);
  const uint16x8_t a_sum = vaddq_u16(a.val[0], a.val[1]);
  const uint16x8_t l_sum = vaddq_u16(l.val[0], l.val[1]);
  const uint16x8_t al = vaddq_u16(a_sum, l_sum);
  uint16x4_t partial = vadd_u16(vget_low_u16(al), vget_high_u16(al));
  uint32x2_t total;
  uint16x4_t dc;
  (void)bd;
  partial = vpadd_u16(partial, partial);
  // Widen to 32 bits before the final add: the sum of 32 high-bitdepth
  // pixels can exceed 16 bits.
  total = vpaddl_u16(partial);
  // 32 border pixels -> round-shift by 5.
  dc = vreinterpret_u16_u32(vrshr_n_u32(total, 5));
  dc_store_16x16(dst, stride, dc);
}
180
// DC prediction from the left column only.
void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint16x4_t left_sum = dc_sum_16(left);
  // Round-shift by 4 averages the 16 left-column pixels.
  const uint16x4_t dc = vrshr_n_u16(left_sum, 4);
  (void)above;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}
190
// DC prediction from the above row only.
void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t above_sum = dc_sum_16(above);
  // Round-shift by 4 averages the 16 above-row pixels.
  const uint16x4_t dc = vrshr_n_u16(above_sum, 4);
  (void)left;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}
200
// DC prediction with no border pixels: fill with the bit-depth mid-range.
void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16((uint16_t)(1 << (bd - 1)));
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, dc);
}
209
210 //------------------------------------------------------------------------------
211 // DC 32x32
212
// Sum thirty-two 16-bit samples at ref, widening the final total to
// 32 bits (32 high-bitdepth pixels may exceed a uint16 accumulator).
static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) {
  const uint16x8x4_t a = vld4q_u16(ref);
  const uint16x8_t s01 = vaddq_u16(a.val[0], a.val[1]);
  const uint16x8_t s23 = vaddq_u16(a.val[2], a.val[3]);
  const uint16x8_t s = vaddq_u16(s01, s23);
  uint16x4_t total = vadd_u16(vget_low_u16(s), vget_high_u16(s));
  total = vpadd_u16(total, total);
  return vpaddl_u16(total);
}
222
// Broadcast lane 0 of dc across each of the thirty-two 32-pixel rows.
static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x4_t dc) {
  uint16x8x2_t row;
  int r;
  row.val[0] = row.val[1] = vdupq_lane_u16(dc, 0);
  for (r = 0; r < 32; ++r) {
    // Two 16-wide interleaved stores cover one 32-pixel row.
    vst2q_u16(dst, row);
    vst2q_u16(dst + 16, row);
    dst += stride;
  }
}
236
// DC prediction for 32x32: average of the 32 above and 32 left pixels.
void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8x4_t a = vld4q_u16(above);
  const uint16x8x4_t l = vld4q_u16(left);
  const uint16x8_t a01 = vaddq_u16(a.val[0], a.val[1]);
  const uint16x8_t a23 = vaddq_u16(a.val[2], a.val[3]);
  const uint16x8_t l01 = vaddq_u16(l.val[0], l.val[1]);
  const uint16x8_t l23 = vaddq_u16(l.val[2], l.val[3]);
  const uint16x8_t a_sum = vaddq_u16(a01, a23);
  const uint16x8_t l_sum = vaddq_u16(l01, l23);
  const uint16x8_t al = vaddq_u16(a_sum, l_sum);
  const uint16x4_t partial = vadd_u16(vget_low_u16(al), vget_high_u16(al));
  // Widen to 32 bits: the sum of 64 high-bitdepth pixels overflows uint16.
  uint32x2_t total = vpaddl_u16(partial);
  uint16x4_t dc;
  (void)bd;
  total = vpadd_u32(total, total);
  // 64 border pixels -> round-shift by 6.
  dc = vreinterpret_u16_u32(vrshr_n_u32(total, 6));
  dc_store_32x32(dst, stride, dc);
}
257
// DC prediction from the left column only.
void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint32x2_t left_sum = dc_sum_32(left);
  // Round-shift by 5 averages the 32 left-column pixels.
  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(left_sum, 5));
  (void)above;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}
267
// DC prediction from the above row only.
void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint32x2_t above_sum = dc_sum_32(above);
  // Round-shift by 5 averages the 32 above-row pixels.
  const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(above_sum, 5));
  (void)left;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}
277
// DC prediction with no border pixels: fill with the bit-depth mid-range.
void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16((uint16_t)(1 << (bd - 1)));
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, dc);
}
286
287 // -----------------------------------------------------------------------------
288
// D45 (45-degree, predicted from the above row) for a 4x4 block. Each
// filtered pixel is (above[i] + 2 * above[i + 1] + above[i + 2] + 2) >> 2
// and each successive row is the previous one shifted left by one pixel.
// NOTE(review): vld1q_u16(above + 2) reads 10 samples from above[0];
// assumes callers provide a padded above buffer — confirm at call sites.
void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8_t ABCDEFGH = vld1q_u16(above);
  const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1);
  const uint16x8_t CDEFGH00 = vld1q_u16(above + 2);
  // vhaddq then vrhaddq computes the 3-tap rounded filter exactly,
  // without intermediate overflow.
  const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00);
  const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0);
  const uint16x4_t avg2_low = vget_low_u16(avg2);
  const uint16x4_t avg2_high = vget_high_u16(avg2);
  // Rows 1..3 are 4-wide windows shifted one pixel further each time.
  const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1);
  const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2);
  const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3);
  (void)left;
  (void)bd;
  vst1_u16(dst, avg2_low);
  dst += stride;
  vst1_u16(dst, r1);
  dst += stride;
  vst1_u16(dst, r2);
  dst += stride;
  vst1_u16(dst, r3);
  // Bottom-right pixel is the unfiltered above[7] sample.
  vst1q_lane_u16(dst + 3, ABCDEFGH, 7);
}
313
// Shift the running 8-pixel row left by one (pulling in the replicated
// above-right sample), store it, and advance dst by one row.
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               const uint16x8_t above_right, uint16x8_t *row) {
  const uint16x8_t shifted = vextq_u16(*row, above_right, 1);
  *row = shifted;
  vst1q_u16(*dst, shifted);
  *dst += stride;
}
320
// D45 prediction for 8x8: 3-tap filter the above row once, then emit each
// later row as a one-pixel left shift of the previous row.
void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8_t A0 = vld1q_u16(above);
  // above[7] broadcast to all lanes.
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3);
  const uint16x8_t A1 = vld1q_u16(above + 1);
  const uint16x8_t A2 = vld1q_u16(above + 2);
  const uint16x8_t avg1 = vhaddq_u16(A0, A2);
  uint16x8_t row = vrhaddq_u16(avg1, A1);
  int i;
  (void)left;
  (void)bd;

  vst1q_u16(dst, row);
  dst += stride;
  for (i = 0; i < 6; ++i) {
    d45_store_8(&dst, stride, above_right, &row);
  }
  // The final row is entirely the replicated above-right pixel.
  vst1q_u16(dst, above_right);
}
343
// Shift the 16-pixel row (held in two registers) left by one, store both
// halves, and advance dst by one row.
static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                const uint16x8_t above_right, uint16x8_t *row_0,
                                uint16x8_t *row_1) {
  const uint16x8_t lo = vextq_u16(*row_0, *row_1, 1);
  const uint16x8_t hi = vextq_u16(*row_1, above_right, 1);
  *row_0 = lo;
  *row_1 = hi;
  vst1q_u16(*dst, lo);
  vst1q_u16(*dst + 8, hi);
  *dst += stride;
}
354
// D45 prediction for 16x16: filter the above row into two registers, then
// emit rows 1..14 by shifting the 16-pixel window left one pixel at a time.
void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const uint16x8_t A0_0 = vld1q_u16(above);
  const uint16x8_t A0_1 = vld1q_u16(above + 8);
  // above[15] broadcast to all lanes.
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3);
  const uint16x8_t A1_0 = vld1q_u16(above + 1);
  const uint16x8_t A1_1 = vld1q_u16(above + 9);
  const uint16x8_t A2_0 = vld1q_u16(above + 2);
  const uint16x8_t A2_1 = vld1q_u16(above + 10);
  uint16x8_t row_0 = vrhaddq_u16(vhaddq_u16(A0_0, A2_0), A1_0);
  uint16x8_t row_1 = vrhaddq_u16(vhaddq_u16(A0_1, A2_1), A1_1);
  int i;
  (void)left;
  (void)bd;

  vst1q_u16(dst, row_0);
  vst1q_u16(dst + 8, row_1);
  dst += stride;
  for (i = 0; i < 14; ++i) {
    d45_store_16(&dst, stride, above_right, &row_0, &row_1);
  }
  // The final row is entirely the replicated above-right pixel.
  vst1q_u16(dst, above_right);
  vst1q_u16(dst + 8, above_right);
}
392
// D45 prediction for a 32x32 block. The 32 filtered pixels of row 0 are
// held in row_0..row_3; each later row is the previous one shifted left
// by one pixel, with the replicated above-right sample fed in from the
// right end of the pipeline.
void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const uint16x8_t A0_0 = vld1q_u16(above);
  const uint16x8_t A0_1 = vld1q_u16(above + 8);
  const uint16x8_t A0_2 = vld1q_u16(above + 16);
  const uint16x8_t A0_3 = vld1q_u16(above + 24);
  // above[31] broadcast to all lanes.
  const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3);
  const uint16x8_t A1_0 = vld1q_u16(above + 1);
  const uint16x8_t A1_1 = vld1q_u16(above + 9);
  const uint16x8_t A1_2 = vld1q_u16(above + 17);
  const uint16x8_t A1_3 = vld1q_u16(above + 25);
  const uint16x8_t A2_0 = vld1q_u16(above + 2);
  const uint16x8_t A2_1 = vld1q_u16(above + 10);
  const uint16x8_t A2_2 = vld1q_u16(above + 18);
  const uint16x8_t A2_3 = vld1q_u16(above + 26);
  // vhaddq + vrhaddq together compute the 3-tap rounded filter
  // (A0 + 2 * A1 + A2 + 2) >> 2 without intermediate overflow.
  const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0);
  const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1);
  const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2);
  const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3);
  uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0);
  uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1);
  uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2);
  uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3);
  int i;
  (void)left;
  (void)bd;

  // Row 0: the filtered pixels as-is.
  vst1q_u16(dst, row_0);
  dst += 8;
  vst1q_u16(dst, row_1);
  dst += 8;
  vst1q_u16(dst, row_2);
  dst += 8;
  vst1q_u16(dst, row_3);
  dst += stride - 24;

  // Rows 1..30: shift the 32-pixel window left by one pixel per row.
  for (i = 0; i < 30; ++i) {
    row_0 = vextq_u16(row_0, row_1, 1);
    row_1 = vextq_u16(row_1, row_2, 1);
    row_2 = vextq_u16(row_2, row_3, 1);
    row_3 = vextq_u16(row_3, above_right, 1);
    vst1q_u16(dst, row_0);
    dst += 8;
    vst1q_u16(dst, row_1);
    dst += 8;
    vst1q_u16(dst, row_2);
    dst += 8;
    vst1q_u16(dst, row_3);
    dst += stride - 24;
  }

  // Row 31 is entirely the replicated above-right pixel.
  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
  dst += 8;
  vst1q_u16(dst, above_right);
}
453
454 // -----------------------------------------------------------------------------
455
// D135 (down-left diagonal) prediction for 4x4. The border pixels
// left[3..0], top-left X and above[0..2] are concatenated into one
// filtered vector; each output row is a 4-wide window into it, sliding
// one pixel toward the left column per row (so rows share diagonals).
void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t XA0123___ = vld1q_u16(above - 1);
  const uint16x4_t L0123 = vld1_u16(left);
  const uint16x4_t L3210 = vrev64_u16(L0123);
  const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
  const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
  // Shifted copies of the concatenated border for the 3-tap filter taps.
  const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
  const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
  // (p0 + 2 * p1 + p2 + 2) >> 2, computed without intermediate overflow.
  const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
  const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
  const uint16x4_t row_0 = vget_low_u16(avg2);
  const uint16x4_t row_1 = vget_high_u16(avg2);
  const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
  const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
  const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
  (void)bd;
  // Windows are emitted widest-offset first; the last row is the window
  // closest to the left column.
  vst1_u16(dst, r0);
  dst += stride;
  vst1_u16(dst, r1);
  dst += stride;
  vst1_u16(dst, r2);
  dst += stride;
  vst1_u16(dst, row_0);
}
482
// D135 prediction for 8x8. row_0 holds the filtered, reversed left column
// (ending at the top-left X); row_1 holds the filtered above row. Each
// output row is an 8-wide window across the row_0|row_1 concatenation,
// sliding one pixel toward the left column per output row.
void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t XA0123456 = vld1q_u16(above - 1);
  const uint16x8_t A01234567 = vld1q_u16(above);
  const uint16x8_t A1234567_ = vld1q_u16(above + 1);
  const uint16x8_t L01234567 = vld1q_u16(left);
  // Reverse the left column so it reads toward the top-left corner.
  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
  // 3-tap rounded filter (p0 + 2 * p1 + p2 + 2) >> 2 on both segments.
  const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
  const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
  const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
  const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
  // Sliding 8-wide windows, widest offset (nearest the above row) first.
  const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
  const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
  const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
  const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
  const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
  const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
  const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
  (void)bd;
  vst1q_u16(dst, r0);
  dst += stride;
  vst1q_u16(dst, r1);
  dst += stride;
  vst1q_u16(dst, r2);
  dst += stride;
  vst1q_u16(dst, r3);
  dst += stride;
  vst1q_u16(dst, r4);
  dst += stride;
  vst1q_u16(dst, r5);
  dst += stride;
  vst1q_u16(dst, r6);
  dst += stride;
  vst1q_u16(dst, row_0);
}
523
// Store one 16-pixel row from two registers and advance dst by one row.
static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
                                 const uint16x8_t row_0,
                                 const uint16x8_t row_1) {
  vst1q_u16(*dst, row_0);
  vst1q_u16(*dst + 8, row_1);
  *dst += stride;
}
532
// D135 prediction for 16x16. The filtered border is held in four
// registers: row_0 (reversed lower half of the left column), row_1
// (reversed upper half of the left column ending at X), row_2 and row_3
// (the above row). Output rows are 16-wide windows sliding one pixel
// toward the left column per row; rows 8..15 reuse the windows of rows
// 0..7 shifted down into the row_0/row_1 pair.
void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x8_t L01234567 = vld1q_u16(left);
  const uint16x8_t L89abcdef = vld1q_u16(left + 8);
  // Reverse the left column so it reads toward the top-left corner.
  const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
  const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
  const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
  const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
  const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
  const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
  const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
  const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
  // 3-tap rounded filter (p0 + 2 * p1 + p2 + 2) >> 2 on each segment.
  const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
  const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);

  const uint16x8_t XA0123456 = vld1q_u16(above - 1);
  const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
  const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
  const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
  const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);

  const uint16x8_t A01234567 = vld1q_u16(above);
  const uint16x8_t A12345678 = vld1q_u16(above + 1);
  const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
  const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);

  const uint16x8_t A789abcde = vld1q_u16(above + 7);
  const uint16x8_t A89abcdef = vld1q_u16(above + 8);
  const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
  const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
  const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);

  // Sliding windows across (row_1|row_2|row_3) for the top 8 output rows,
  // and across (row_0|row_1) for the left halves of the bottom 8 rows.
  const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
  const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
  const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
  const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
  const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
  const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
  const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
  const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
  const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
  const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
  const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
  const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
  const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
  const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
  const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
  const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
  const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
  const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
  const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
  const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
  const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
  (void)bd;

  d135_store_16(&dst, stride, r0_0, r0_1);
  d135_store_16(&dst, stride, r1_0, r1_1);
  d135_store_16(&dst, stride, r2_0, r2_1);
  d135_store_16(&dst, stride, r3_0, r3_1);
  d135_store_16(&dst, stride, r4_0, r4_1);
  d135_store_16(&dst, stride, r5_0, r5_1);
  d135_store_16(&dst, stride, r6_0, r6_1);
  d135_store_16(&dst, stride, row_1, row_2);
  // Bottom 8 rows: right halves reuse the top rows' left-half windows.
  d135_store_16(&dst, stride, r8_0, r0_0);
  d135_store_16(&dst, stride, r9_0, r1_0);
  d135_store_16(&dst, stride, ra_0, r2_0);
  d135_store_16(&dst, stride, rb_0, r3_0);
  d135_store_16(&dst, stride, rc_0, r4_0);
  d135_store_16(&dst, stride, rd_0, r5_0);
  d135_store_16(&dst, stride, re_0, r6_0);
  vst1q_u16(dst, row_0);
  dst += 8;
  vst1q_u16(dst, row_1);
}
608
// D135 prediction for 32x32. The 64 filtered border pixels (reversed left
// column, top-left X, above row) are held in the eight registers
// row_0..row_7. Rows are written bottom-up (dst starts at row 31); each
// iteration stores the 32-pixel window row_0..row_3, then shifts the
// whole pipeline left by one pixel so the next (higher) output row sees
// the window one step closer to the above row.
void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // LL = lower 16 left pixels, reversed and 3-tap filtered.
  const uint16x8_t LL01234567 = vld1q_u16(left + 16);
  const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
  const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
  const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
  const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
  const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
  const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
  const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
  const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
  const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
  const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
  uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);

  // LU = upper 16 left pixels, reversed and filtered, continuing into LL.
  const uint16x8_t LU01234567 = vld1q_u16(left);
  const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
  const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
  const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
  const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
  const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
  const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
  const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
  const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
  const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
  const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
  uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);

  const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
  const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
  const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
  uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);

  // Transition across the top-left pixel X into the above row.
  const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
  const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
  const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
  const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
  uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);

  // AL/AR = left and right halves of the 32-pixel above row, filtered.
  const uint16x8_t AL01234567 = vld1q_u16(above);
  const uint16x8_t AL12345678 = vld1q_u16(above + 1);
  const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
  uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);

  const uint16x8_t AL789abcde = vld1q_u16(above + 7);
  const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
  const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
  const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
  uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);

  const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
  const uint16x8_t AR01234567 = vld1q_u16(above + 16);
  const uint16x8_t AR12345678 = vld1q_u16(above + 17);
  const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
  uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);

  const uint16x8_t AR789abcde = vld1q_u16(above + 23);
  const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
  const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
  const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
  uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
  int i, j;
  (void)bd;

  // Start at the bottom row and work upward.
  dst += 31 * stride;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 8; ++j) {
      vst1q_u16(dst, row_0);
      dst += 8;
      vst1q_u16(dst, row_1);
      dst += 8;
      vst1q_u16(dst, row_2);
      dst += 8;
      vst1q_u16(dst, row_3);
      dst -= stride + 24;
      row_0 = vextq_u16(row_0, row_1, 1);
      row_1 = vextq_u16(row_1, row_2, 1);
      row_2 = vextq_u16(row_2, row_3, 1);
      row_3 = vextq_u16(row_3, row_4, 1);
      // Rotate row_4 so its next element is always in lane 0; after the
      // 8 inner shifts it is fully consumed and replaced below.
      row_4 = vextq_u16(row_4, row_4, 1);
    }
    row_4 = row_5;
    row_5 = row_6;
    row_6 = row_7;
  }
}
696
697 //------------------------------------------------------------------------------
698
// Vertical prediction for 4x4: replicate the above row into every row.
void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x4_t above_u16 = vld1_u16(above);
  int r;
  (void)left;
  (void)bd;

  for (r = 0; r < 4; ++r) {
    vst1_u16(dst, above_u16);
    dst += stride;
  }
}
711
// Vertical prediction for 8x8: replicate the above row into every row.
void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x8_t above_u16 = vld1q_u16(above);
  int r;
  (void)left;
  (void)bd;

  for (r = 0; r < 8; ++r) {
    vst1q_u16(dst, above_u16);
    dst += stride;
  }
}
724
// Vertical prediction for 16x16: replicate the above row into every row.
// The vld2q/vst2q pair de-interleaves and re-interleaves, so the row is
// copied unchanged.
void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8x2_t above_u16 = vld2q_u16(above);
  int r;
  (void)left;
  (void)bd;

  for (r = 0; r < 16; ++r) {
    vst2q_u16(dst, above_u16);
    dst += stride;
  }
}
737
// Vertical prediction for 32x32: replicate the 32-pixel above row into
// every row, 16 pixels per interleaved store.
void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const uint16x8x2_t above_lo = vld2q_u16(above);
  const uint16x8x2_t above_hi = vld2q_u16(above + 16);
  int r;
  (void)left;
  (void)bd;

  for (r = 0; r < 32; ++r) {
    vst2q_u16(dst, above_lo);
    vst2q_u16(dst + 16, above_hi);
    dst += stride;
  }
}
754
755 // -----------------------------------------------------------------------------
756
// Horizontal prediction for 4x4: each row is one left-column pixel
// broadcast across the row. vdup_lane_u16 needs a compile-time lane
// index, hence the unrolled stores.
void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const uint16x4_t left_u16 = vld1_u16(left);
  (void)above;
  (void)bd;

  vst1_u16(dst, vdup_lane_u16(left_u16, 0));
  dst += stride;
  vst1_u16(dst, vdup_lane_u16(left_u16, 1));
  dst += stride;
  vst1_u16(dst, vdup_lane_u16(left_u16, 2));
  dst += stride;
  vst1_u16(dst, vdup_lane_u16(left_u16, 3));
}
777
void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H prediction: row r is left[r] replicated across all 8 columns.
  // vdupq_lane_u16 needs a compile-time lane index, hence the unrolled body.
  const uint16x8_t left_u16 = vld1q_u16(left);
  const uint16x4_t left_lo = vget_low_u16(left_u16);
  const uint16x4_t left_hi = vget_high_u16(left_u16);
  (void)above;
  (void)bd;

  vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left_lo, 0));
  vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left_lo, 1));
  vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left_lo, 2));
  vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left_lo, 3));
  vst1q_u16(dst + 4 * stride, vdupq_lane_u16(left_hi, 0));
  vst1q_u16(dst + 5 * stride, vdupq_lane_u16(left_hi, 1));
  vst1q_u16(dst + 6 * stride, vdupq_lane_u16(left_hi, 2));
  vst1q_u16(dst + 7 * stride, vdupq_lane_u16(left_hi, 3));
}
812
// Store one 16-wide row (the same 8 lanes twice) and advance *dst one row.
static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
                              const uint16x8_t row) {
  // Note: vst1q is faster than vst2q
  vst1q_u16(*dst, row);
  vst1q_u16(*dst + 8, row);
  *dst += stride;
}
821
void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H prediction: row r is left[r] replicated across all 16 columns.
  // Process 8 left pixels per outer iteration; lane indices must be
  // compile-time constants, hence the unrolled helper calls.
  int half;
  (void)above;
  (void)bd;

  for (half = 0; half < 2; ++half, left += 8) {
    const uint16x8_t l = vld1q_u16(left);
    const uint16x4_t l_lo = vget_low_u16(l);
    const uint16x4_t l_hi = vget_high_u16(l);

    h_store_16(&dst, stride, vdupq_lane_u16(l_lo, 0));
    h_store_16(&dst, stride, vdupq_lane_u16(l_lo, 1));
    h_store_16(&dst, stride, vdupq_lane_u16(l_lo, 2));
    h_store_16(&dst, stride, vdupq_lane_u16(l_lo, 3));
    h_store_16(&dst, stride, vdupq_lane_u16(l_hi, 0));
    h_store_16(&dst, stride, vdupq_lane_u16(l_hi, 1));
    h_store_16(&dst, stride, vdupq_lane_u16(l_hi, 2));
    h_store_16(&dst, stride, vdupq_lane_u16(l_hi, 3));
  }
}
853
// Store one 32-wide row (the same 8 lanes four times) and advance *dst one row.
static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
                              const uint16x8_t row) {
  // Note: vst1q is faster than vst2q
  vst1q_u16(*dst, row);
  vst1q_u16(*dst + 8, row);
  vst1q_u16(*dst + 16, row);
  vst1q_u16(*dst + 24, row);
  *dst += stride;
}
866
void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // H prediction: row r is left[r] replicated across all 32 columns.
  // Process 8 left pixels per outer iteration; lane indices must be
  // compile-time constants, hence the unrolled helper calls.
  int quarter;
  (void)above;
  (void)bd;

  for (quarter = 0; quarter < 4; ++quarter, left += 8) {
    const uint16x8_t l = vld1q_u16(left);
    const uint16x4_t l_lo = vget_low_u16(l);
    const uint16x4_t l_hi = vget_high_u16(l);

    h_store_32(&dst, stride, vdupq_lane_u16(l_lo, 0));
    h_store_32(&dst, stride, vdupq_lane_u16(l_lo, 1));
    h_store_32(&dst, stride, vdupq_lane_u16(l_lo, 2));
    h_store_32(&dst, stride, vdupq_lane_u16(l_lo, 3));
    h_store_32(&dst, stride, vdupq_lane_u16(l_hi, 0));
    h_store_32(&dst, stride, vdupq_lane_u16(l_hi, 1));
    h_store_32(&dst, stride, vdupq_lane_u16(l_hi, 2));
    h_store_32(&dst, stride, vdupq_lane_u16(l_hi, 3));
  }
}
898
899 // -----------------------------------------------------------------------------
900
void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // TM prediction: pred(r, c) = clip(left[r] + above[c] - above[-1],
  //                                  0, (1 << bd) - 1).
  // Two 4-wide rows are packed into each 8-lane vector.
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
  const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
  const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
  const int16x8_t sub = vsubq_s16(above_s16, top_left);
  const int16x8_t sum01 = vminq_s16(
      vaddq_s16(vcombine_s16(vdup_lane_s16(left_s16, 0),
                             vdup_lane_s16(left_s16, 1)),
                sub),
      max);
  const int16x8_t sum23 = vminq_s16(
      vaddq_s16(vcombine_s16(vdup_lane_s16(left_s16, 2),
                             vdup_lane_s16(left_s16, 3)),
                sub),
      max);
  // vqshluq_n_s16(x, 0) saturates negative lanes to 0 while converting
  // to unsigned, completing the low clamp.
  const uint16x8_t row01 = vqshluq_n_s16(sum01, 0);
  const uint16x8_t row23 = vqshluq_n_s16(sum23, 0);

  vst1_u16(dst + 0 * stride, vget_low_u16(row01));
  vst1_u16(dst + 1 * stride, vget_high_u16(row01));
  vst1_u16(dst + 2 * stride, vget_low_u16(row23));
  vst1_u16(dst + 3 * stride, vget_high_u16(row23));
}
930
// Emit one 8-wide TM row: clip(left_dup + sub) to [0, max], store, advance.
static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
                               const int16x8_t left_dup, const int16x8_t sub,
                               const int16x8_t max) {
  // vminq caps the high end; vqshluq_n_s16(x, 0) saturates negative
  // lanes to 0 while converting to unsigned.
  const int16x8_t clipped = vminq_s16(vaddq_s16(left_dup, sub), max);
  vst1q_u16(*dst, vqshluq_n_s16(clipped, 0));
  *dst += stride;
}
941
void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // TM prediction: pred(r, c) = clip(left[r] + above[c] - above[-1],
  //                                  0, (1 << bd) - 1).
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
  const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
  const int16x8_t sub = vsubq_s16(above_s16, top_left);
  const int16x4_t left_lo = vget_low_s16(left_s16);
  const int16x4_t left_hi = vget_high_s16(left_s16);

  // Lane indices must be compile-time constants, hence the unrolled calls.
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_lo, 0), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_lo, 1), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_lo, 2), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_lo, 3), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_hi, 0), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_hi, 1), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_hi, 2), sub, max);
  tm_8_kernel(&dst, stride, vdupq_lane_s16(left_hi, 3), sub, max);
}
970
// Emit one 16-wide TM row (two 8-lane halves), clipped to [0, max].
static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t max) {
  // vminq caps the high end; vqshluq_n_s16(x, 0) saturates negative
  // lanes to 0 while converting to unsigned.
  const int16x8_t clip0 = vminq_s16(vaddq_s16(left_dup, sub0), max);
  const int16x8_t clip1 = vminq_s16(vaddq_s16(left_dup, sub1), max);
  vst1q_u16(*dst, vqshluq_n_s16(clip0, 0));
  vst1q_u16(*dst + 8, vqshluq_n_s16(clip1, 0));
  *dst += stride;
}
986
void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // TM prediction: pred(r, c) = clip(left[r] + above[c] - above[-1],
  //                                  0, (1 << bd) - 1).
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
  const int16x8_t sub0 = vsubq_s16(above0, top_left);
  const int16x8_t sub1 = vsubq_s16(above1, top_left);
  int half;

  // Process 8 left pixels per iteration; lane indices must be
  // compile-time constants, hence the unrolled kernel calls.
  for (half = 0; half < 2; ++half, left += 8) {
    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
    const int16x4_t l_lo = vget_low_s16(left_s16q);
    const int16x4_t l_hi = vget_high_s16(left_s16q);

    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_lo, 0), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_lo, 1), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_lo, 2), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_lo, 3), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_hi, 0), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_hi, 1), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_hi, 2), sub0, sub1, max);
    tm_16_kernel(&dst, stride, vdupq_lane_s16(l_hi, 3), sub0, sub1, max);
  }
}
1017
// Emit one 32-wide TM row (four 8-lane quarters), clipped to [0, max].
static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t sub2,
                                const int16x8_t sub3, const int16x8_t max) {
  // vminq caps the high end; vqshluq_n_s16(x, 0) saturates negative
  // lanes to 0 while converting to unsigned.
  const int16x8_t clip0 = vminq_s16(vaddq_s16(left_dup, sub0), max);
  const int16x8_t clip1 = vminq_s16(vaddq_s16(left_dup, sub1), max);
  const int16x8_t clip2 = vminq_s16(vaddq_s16(left_dup, sub2), max);
  const int16x8_t clip3 = vminq_s16(vaddq_s16(left_dup, sub3), max);
  vst1q_u16(*dst, vqshluq_n_s16(clip0, 0));
  vst1q_u16(*dst + 8, vqshluq_n_s16(clip1, 0));
  vst1q_u16(*dst + 16, vqshluq_n_s16(clip2, 0));
  vst1q_u16(*dst + 24, vqshluq_n_s16(clip3, 0));
  *dst += stride;
}
1044
void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // TM prediction: pred(r, c) = clip(left[r] + above[c] - above[-1],
  //                                  0, (1 << bd) - 1).
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
  const int16x8_t sub0 = vsubq_s16(above0, top_left);
  const int16x8_t sub1 = vsubq_s16(above1, top_left);
  const int16x8_t sub2 = vsubq_s16(above2, top_left);
  const int16x8_t sub3 = vsubq_s16(above3, top_left);
  int quarter;

  // Process 8 left pixels per iteration; lane indices must be
  // compile-time constants, hence the unrolled kernel calls.
  for (quarter = 0; quarter < 4; ++quarter, left += 8) {
    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
    const int16x4_t l_lo = vget_low_s16(left_s16q);
    const int16x4_t l_hi = vget_high_s16(left_s16q);

    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_lo, 0), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_lo, 1), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_lo, 2), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_lo, 3), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_hi, 0), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_hi, 1), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_hi, 2), sub0, sub1, sub2,
                 sub3, max);
    tm_32_kernel(&dst, stride, vdupq_lane_s16(l_hi, 3), sub0, sub1, sub2,
                 sub3, max);
  }
}
1079