• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 
load_u8_8x4(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2,uint8x8_t * const s3)16 static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
17                                uint8x8_t *const s0, uint8x8_t *const s1,
18                                uint8x8_t *const s2, uint8x8_t *const s3) {
19   *s0 = vld1_u8(s);
20   s += p;
21   *s1 = vld1_u8(s);
22   s += p;
23   *s2 = vld1_u8(s);
24   s += p;
25   *s3 = vld1_u8(s);
26 }
27 
load_u8_8x8(const uint8_t * s,const ptrdiff_t p,uint8x8_t * const s0,uint8x8_t * const s1,uint8x8_t * const s2,uint8x8_t * const s3,uint8x8_t * const s4,uint8x8_t * const s5,uint8x8_t * const s6,uint8x8_t * const s7)28 static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
29                                uint8x8_t *const s0, uint8x8_t *const s1,
30                                uint8x8_t *const s2, uint8x8_t *const s3,
31                                uint8x8_t *const s4, uint8x8_t *const s5,
32                                uint8x8_t *const s6, uint8x8_t *const s7) {
33   *s0 = vld1_u8(s);
34   s += p;
35   *s1 = vld1_u8(s);
36   s += p;
37   *s2 = vld1_u8(s);
38   s += p;
39   *s3 = vld1_u8(s);
40   s += p;
41   *s4 = vld1_u8(s);
42   s += p;
43   *s5 = vld1_u8(s);
44   s += p;
45   *s6 = vld1_u8(s);
46   s += p;
47   *s7 = vld1_u8(s);
48 }
49 
load_u8_16x8(const uint8_t * s,const ptrdiff_t p,uint8x16_t * const s0,uint8x16_t * const s1,uint8x16_t * const s2,uint8x16_t * const s3,uint8x16_t * const s4,uint8x16_t * const s5,uint8x16_t * const s6,uint8x16_t * const s7)50 static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
51                                 uint8x16_t *const s0, uint8x16_t *const s1,
52                                 uint8x16_t *const s2, uint8x16_t *const s3,
53                                 uint8x16_t *const s4, uint8x16_t *const s5,
54                                 uint8x16_t *const s6, uint8x16_t *const s7) {
55   *s0 = vld1q_u8(s);
56   s += p;
57   *s1 = vld1q_u8(s);
58   s += p;
59   *s2 = vld1q_u8(s);
60   s += p;
61   *s3 = vld1q_u8(s);
62   s += p;
63   *s4 = vld1q_u8(s);
64   s += p;
65   *s5 = vld1q_u8(s);
66   s += p;
67   *s6 = vld1q_u8(s);
68   s += p;
69   *s7 = vld1q_u8(s);
70 }
71 
convolve8_4(const int16x4_t s0,const int16x4_t s1,const int16x4_t s2,const int16x4_t s3,const int16x4_t s4,const int16x4_t s5,const int16x4_t s6,const int16x4_t s7,const int16x8_t filters,const int16x4_t filter3,const int16x4_t filter4)72 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
73                                     const int16x4_t s2, const int16x4_t s3,
74                                     const int16x4_t s4, const int16x4_t s5,
75                                     const int16x4_t s6, const int16x4_t s7,
76                                     const int16x8_t filters,
77                                     const int16x4_t filter3,
78                                     const int16x4_t filter4) {
79   const int16x4_t filters_lo = vget_low_s16(filters);
80   const int16x4_t filters_hi = vget_high_s16(filters);
81   int16x4_t sum;
82 
83   sum = vmul_lane_s16(s0, filters_lo, 0);
84   sum = vmla_lane_s16(sum, s1, filters_lo, 1);
85   sum = vmla_lane_s16(sum, s2, filters_lo, 2);
86   sum = vmla_lane_s16(sum, s5, filters_hi, 1);
87   sum = vmla_lane_s16(sum, s6, filters_hi, 2);
88   sum = vmla_lane_s16(sum, s7, filters_hi, 3);
89   sum = vqadd_s16(sum, vmul_s16(s3, filter3));
90   sum = vqadd_s16(sum, vmul_s16(s4, filter4));
91   return sum;
92 }
93 
convolve8_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,const int16x8_t s6,const int16x8_t s7,const int16x8_t filters,const int16x8_t filter3,const int16x8_t filter4)94 static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
95                                     const int16x8_t s2, const int16x8_t s3,
96                                     const int16x8_t s4, const int16x8_t s5,
97                                     const int16x8_t s6, const int16x8_t s7,
98                                     const int16x8_t filters,
99                                     const int16x8_t filter3,
100                                     const int16x8_t filter4) {
101   const int16x4_t filters_lo = vget_low_s16(filters);
102   const int16x4_t filters_hi = vget_high_s16(filters);
103   int16x8_t sum;
104 
105   sum = vmulq_lane_s16(s0, filters_lo, 0);
106   sum = vmlaq_lane_s16(sum, s1, filters_lo, 1);
107   sum = vmlaq_lane_s16(sum, s2, filters_lo, 2);
108   sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
109   sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
110   sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
111   sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
112   sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
113   return vqrshrun_n_s16(sum, 7);
114 }
115 
scale_filter_8(const uint8x8_t * const s,const int16x8_t filters)116 static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
117                                        const int16x8_t filters) {
118   const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
119   const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
120   int16x8_t ss[8];
121 
122   ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
123   ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
124   ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
125   ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
126   ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4]));
127   ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5]));
128   ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6]));
129   ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
130 
131   return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
132                      filters, filter3, filter4);
133 }
134