/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"

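// Blend the 16-bit compound sample vectors 'a' and 'b' using mask values 'm'
// in [0, AOM_BLEND_A64_MAX_ALPHA] as the weights m and
// AOM_BLEND_A64_MAX_ALPHA - m, then remove the compound round offset and
// narrow the result back to 8-bit pixels.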
uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
                                    uint16x8_t round_offset) {
  const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);

  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));

  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
  blend_u32_hi =
      vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));

  uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
  uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);

  uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);

  res = vqsubq_u16(res, round_offset);

  return vqrshrn_n_u16(res,
                       2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
}

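// Blend two blocks of 16-bit intermediate convolve output (CONV_BUF_TYPE)
// into 8-bit pixels under control of an alpha mask. subw/subh indicate
// whether the mask is stored at twice the block resolution horizontally
// and/or vertically.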
void aom_lowbd_blend_a64_d16_mask_neon(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  (void)conv_params;

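  // round_offset is the constant offset carried by the compound
  // (CONV_BUF_TYPE) inputs; it is passed to alpha_blend_a64_d16_u16x8, which
  // subtracts it before narrowing the blended result back to 8 bits.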
  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t offset_vec = vdupq_n_u16(round_offset);

  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (subw == 0 && subh == 0) {
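    // Mask is at the same resolution as the output block: use it directly.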
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 1) {
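    // Mask is stored at twice the block resolution both horizontally and
    // vertically: average each 2x2 group of mask values before blending.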
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg =
              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));

          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 0) {
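    // Mask is stored at twice the block width only: average horizontally
    // adjacent pairs of mask values before blending.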
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 2 * i);
          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else {
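    // Remaining case (subw == 0, subh == 1): mask is stored at twice the block
    // height only; average vertically adjacent pairs of mask rows before
    // blending.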
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0_2 =
            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
        uint8x8_t m1_3 =
            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  }
}

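// Blend two blocks of 8-bit pixels under control of an alpha mask with values
// in [0, AOM_BLEND_A64_MAX_ALPHA]. subw/subh indicate whether the mask is
// stored at twice the block resolution horizontally and/or vertically.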
void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
                             const uint8_t *src0, uint32_t src0_stride,
                             const uint8_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride, int w,
                             int h, int subw, int subh) {
  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if ((subw | subh) == 0) {
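    // Mask is at the same resolution as the output block: use it directly.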
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + i);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);

          vst1q_u8(dst + i, blend);
          i += 16;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);

        vst1_u8(dst, blend);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if ((subw & subh) == 1) {
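    // Mask is stored at twice the block resolution both horizontally and
    // vertically: average each 2x2 group of mask values before blending.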
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
          uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
          uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
        uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 0) {
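    // Mask is stored at twice the block width only: average horizontally
    // adjacent pairs of mask values before blending.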
    if (w > 8) {
      do {
        int i = 0;

        do {
          uint8x16_t m0 = vld1q_u8(mask + 2 * i);
          uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask);
        uint8x8_t m1 = vld1_u8(mask + 8);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else {
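    // Remaining case (subw == 0, subh == 1): mask is stored at twice the block
    // height only; average vertically adjacent pairs of mask rows before
    // blending.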
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0_2 =
            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
        uint8x8_t m1_3 =
            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  }
}