/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/blend_neon.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/blend.h"

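// Blend two vectors of 16-bit intermediate samples, a and b, according to the
// 64-weight alpha mask m, then subtract the rounding offset and narrow the
// result back to 8-bit pixels with rounding.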
uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
                                    uint16x8_t round_offset) {
  const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);

  uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
  uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));

  blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
  blend_u32_hi =
      vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));

  uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
  uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);

  uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);

  res = vqsubq_u16(res, round_offset);

  return vqrshrn_n_u16(res,
                       2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
}

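// Blend the 16-bit intermediate buffers src0 and src1 into 8-bit dst using an
// 8-bit alpha mask. subw/subh indicate that the mask is twice the width and/or
// height of the output and must be averaged down before blending.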
void aom_lowbd_blend_a64_d16_mask_neon(
    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
    ConvolveParams *conv_params) {
  (void)conv_params;

  const int bd = 8;
  const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
  const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
                           (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
  const uint16x8_t offset_vec = vdupq_n_u16(round_offset);

  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));

  assert(h >= 4);
  assert(w >= 4);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if (subw == 0 && subh == 0) {
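    // The mask is at the same resolution as the output: blend directly.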
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 1) {
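    // The mask is twice the width and height of the output: average each 2x2
    // block of mask values before blending.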
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
          uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
          uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg =
              vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));

          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 0) {
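    // The mask is twice the width of the output: average horizontally adjacent
    // pairs of mask values before blending.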
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 2 * i);
          uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else {
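    // subw == 0 && subh == 1: the mask is twice the height of the output, so
    // average vertically adjacent pairs of mask rows before blending.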
    if (w >= 8) {
      do {
        int i = 0;
        do {
          uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
          uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
          uint16x8_t s0 = vld1q_u16(src0 + i);
          uint16x8_t s1 = vld1q_u16(src1 + i);

          uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
          uint8x8_t blend =
              alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

          vst1_u8(dst + i, blend);
          i += 8;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0_2 =
            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
        uint8x8_t m1_3 =
            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
        uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
        uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);

        uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
        uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  }
}

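// Blend the 8-bit source buffers src0 and src1 into dst using an 8-bit alpha
// mask, with the same subw/subh mask-downsampling convention as above.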
void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
                             const uint8_t *src0, uint32_t src0_stride,
                             const uint8_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride, int w,
                             int h, int subw, int subh) {
  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  if ((subw | subh) == 0) {
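    // The mask is at the same resolution as the output: blend directly.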
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + i);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);

          vst1q_u8(dst + i, blend);
          i += 16;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);

        vst1_u8(dst, blend);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if ((subw & subh) == 1) {
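    // The mask is twice the width and height of the output: average each 2x2
    // block of mask values before blending.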
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
          uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
          uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
        uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
        uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else if (subw == 1 && subh == 0) {
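    // The mask is twice the width of the output: average horizontally adjacent
    // pairs of mask values before blending.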
    if (w > 8) {
      do {
        int i = 0;

        do {
          uint8x16_t m0 = vld1q_u8(mask + 2 * i);
          uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask);
        uint8x8_t m1 = vld1_u8(mask + 8);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 2 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  } else {
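    // subw == 0 && subh == 1: the mask is twice the height of the output, so
    // average vertically adjacent pairs of mask rows before blending.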
    if (w > 8) {
      do {
        int i = 0;
        do {
          uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
          uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
          uint8x16_t s0 = vld1q_u8(src0 + i);
          uint8x16_t s1 = vld1q_u8(src1 + i);

          uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
          uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);

          vst1q_u8(dst + i, blend);

          i += 16;
        } while (i < w);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else if (w == 8) {
      do {
        uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
        uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
        uint8x8_t s0 = vld1_u8(src0);
        uint8x8_t s1 = vld1_u8(src1);

        uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        vst1_u8(dst, blend);

        mask += 2 * mask_stride;
        src0 += src0_stride;
        src1 += src1_stride;
        dst += dst_stride;
      } while (--h != 0);
    } else {
      do {
        uint8x8_t m0_2 =
            load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
        uint8x8_t m1_3 =
            load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
        uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
        uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);

        uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
        uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);

        store_u8x4_strided_x2(dst, dst_stride, blend);

        mask += 4 * mask_stride;
        src0 += 2 * src0_stride;
        src1 += 2 * src1_stride;
        dst += 2 * dst_stride;
        h -= 2;
      } while (h != 0);
    }
  }
}