/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/blockd.h"
#include "config/av1_rtcd.h"

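/* Builds the DIFFWTD compound mask directly from two d16 (CONV_BUF_TYPE)
 * predictions: the per-pixel absolute difference is rounded back to pixel
 * precision, scaled down by DIFF_FACTOR, offset by 38 and clamped to
 * AOM_BLEND_A64_MAX_ALPHA; DIFFWTD_38_INV stores the complement instead. */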
void av1_build_compound_diffwtd_mask_d16_neon(
    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    ConvolveParams *conv_params, int bd) {
  assert(h >= 4);
  assert(w >= 4);
  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
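  /* Number of bits needed to bring a d16 difference back down to pixel
   * precision: it undoes the two convolution rounding stages and adds a
   * bit-depth offset so the mask thresholds behave the same at 8, 10 and
   * 12 bits. Applied below as a rounding right shift (vrshlq with a
   * negative shift amount). */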
  const int round =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  uint16x8_t diff_q, tmp0, tmp1;
  uint8x8_t diff_d, diff_select;
  const CONV_BUF_TYPE *src0_1, *src1_1;
  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
  const uint8x8_t dup_38 = vdup_n_u8(38);
  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
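  /* All-ones lanes make the vbsl below keep the DIFFWTD_38 mask; all-zeros
   * lanes select the 64 - mask complement for DIFFWTD_38_INV. */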
  if (mask_type == DIFFWTD_38) {
    diff_select = vdup_n_u8(255);
  } else {
    diff_select = vdup_n_u8(0);
  }
  if (w >= 8) {
    for (int i = 0; i < h; ++i) {
      src0_1 = src0;
      src1_1 = src1;
      for (int j = 0; j < w; j += 8) {
        __builtin_prefetch(src0_1);
        __builtin_prefetch(src1_1);
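        /* |src0 - src1| at d16 precision, rounding-shifted down by 'round',
         * then narrowed to u8 with a further shift by DIFF_FACTOR_LOG2. */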
        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
        diff_q = vrshlq_u16(diff_q, dup_round);
        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
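        /* mask = min(diff + 38, 64); the unsigned arithmetic makes the
         * lower clamp to zero implicit. DIFFWTD_38_INV takes 64 - mask. */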
        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
        vst1_u8(mask, diff_d);
        src0_1 += 8;
        src1_1 += 8;
        mask += 8;
      }
      src0 += src0_stride;
      src1 += src1_stride;
    }
  } else if (w == 4) {
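    /* 4-wide blocks: pack two 4-lane rows into one 8-lane vector so the
     * same 8-lane mask computation as the wide path applies, processing
     * two rows per iteration. */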
    for (int i = 0; i < h; i += 2) {
      src0_1 = src0;
      src1_1 = src1;
      __builtin_prefetch(src0_1 + 0 * src0_stride);
      __builtin_prefetch(src0_1 + 1 * src0_stride);
      __builtin_prefetch(src1_1 + 0 * src1_stride);
      __builtin_prefetch(src1_1 + 1 * src1_stride);
      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
                          vld1_u16(src0_1 + (1 * src0_stride)));
      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
                          vld1_u16(src1_1 + (1 * src1_stride)));
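      /* Same rounding, scaling and clamping as the 8-wide path. */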
      diff_q = vabdq_u16(tmp0, tmp1);
      diff_q = vrshlq_u16(diff_q, dup_round);
      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
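      /* A single 8-byte store covers both 4-wide mask rows. */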
      vst1_u8(mask, diff_d);
      src0 += src0_stride * 2;
      src1 += src1_stride * 2;
      mask += w * 2;
    }
  }
}