/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

// Load 4 consecutive rows of 8 bytes each from s (row pitch p bytes) into
// the vectors *s0..*s3.
static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}
30
// Load 8 consecutive rows of 8 bytes each from s (row pitch p bytes) into
// the vectors *s0..*s7.
static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}
52
// Load 8 consecutive rows of 16 bytes each from s (row pitch p bytes) into
// the vectors *s0..*s7.
static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}
74
// Apply an 8-tap filter across 8 source vectors of 4 lanes each, returning
// the unshifted (still scaled by the filter sum) 4-lane result.
// Taps 3 and 4 are passed in pre-splatted (filter3/filter4) and accumulated
// last with saturating adds; the remaining taps use multiply-accumulate by
// lane. NOTE(review): presumably the saturating adds are ordered last
// because taps 3/4 carry the largest magnitudes — confirm against the
// filter coefficient tables.
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters,
                                    const int16x4_t filter3,
                                    const int16x4_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, filters_lo, 0);
  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
  sum = vqadd_s16(sum, vmul_s16(s3, filter3));
  sum = vqadd_s16(sum, vmul_s16(s4, filter4));
  return sum;
}
96
// Apply an 8-tap filter across 8 source vectors of 8 lanes each, then
// round-shift right by 7 (the filter scaling) and saturate-narrow to an
// unsigned 8-lane byte result. Taps 3 and 4 are passed in pre-splatted
// (filter3/filter4) and accumulated last with saturating adds; the
// remaining taps use multiply-accumulate by lane.
static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                    const int16x8_t s2, const int16x8_t s3,
                                    const int16x8_t s4, const int16x8_t s5,
                                    const int16x8_t s6, const int16x8_t s7,
                                    const int16x8_t filters,
                                    const int16x8_t filter3,
                                    const int16x8_t filter4) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x8_t sum;

  sum = vmulq_lane_s16(s0, filters_lo, 0);
  sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
  sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
  sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
  sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
  sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
  sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
  sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
  return vqrshrun_n_s16(sum, 7);
}
118
// Filter one output row: widen the 8 unsigned-byte source vectors s[0..7]
// to signed 16-bit and run the 8-tap convolve8_8 on them. Taps 3 and 4 are
// splatted from the filters vector here so convolve8_8 can apply them with
// full-vector saturating multiplies.
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
                                       const int16x8_t filters) {
  const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
  const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
  int16x8_t ss[8];

  ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
  ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
  ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
  ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
  ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
  ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
  ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
  ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));

  return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
                     filters, filter3, filter4);
}
137
#endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_