• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "./vpx_config.h"
14 
15 #include "vpx/vpx_integer.h"
16 
17 #include "vpx_dsp/variance.h"
18 #include "vpx_dsp/arm/mem_neon.h"
19 
// First/second tap weights for the bilinear subpel filter, indexed by the
// 1/8-pel offset (0..7). Each pair sums to 128, so the filtered value is
// renormalized with a rounding narrowing shift by FILTER_BITS
// (presumably FILTER_BITS == 7, i.e. 1 << 7 == 128 — confirm in
// vpx_dsp/vpx_filter.h).
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};
24 
// Bilinearly filter a block exactly 4 pixels wide. output_height must be a
// multiple of 2: each iteration packs two 4-pixel rows into one 8-lane
// vector and filters them together, emitting 8 output bytes at a time.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t tap0 = vdup_n_u8(filter[0]);
  const uint8x8_t tap1 = vdup_n_u8(filter[1]);
  unsigned int row;

  for (row = 0; row < output_height; row += 2) {
    // Gather two 4-wide rows from each of the two filter input positions.
    const uint8x8_t s0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
    const uint8x8_t s1 =
        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
    // out = round((s0 * tap0 + s1 * tap1) >> FILTER_BITS)
    uint16x8_t sum = vmull_u8(s0, tap0);
    sum = vmlal_u8(sum, s1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));

    src_ptr += 2 * src_pixels_per_line;
    output_ptr += 8;
  }
}
47 
// Bilinearly filter a block exactly 8 pixels wide; any height. One 8-pixel
// row is produced per iteration.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t tap0 = vdup_n_u8(filter[0]);
  const uint8x8_t tap1 = vdup_n_u8(filter[1]);
  unsigned int row;

  for (row = 0; row < output_height; ++row) {
    const uint8x8_t s0 = vld1_u8(src_ptr);
    const uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    // out = round((s0 * tap0 + s1 * tap1) >> FILTER_BITS)
    uint16x8_t sum = vmull_u8(s0, tap0);
    sum = vmlal_u8(sum, s1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));

    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}
69 
// Bilinearly filter a block whose width is a multiple of 16; any height.
// Each inner iteration filters 16 pixels, split into low/high 8-lane halves
// for the widening multiply-accumulate.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t tap0 = vdup_n_u8(filter[0]);
  const uint8x8_t tap1 = vdup_n_u8(filter[1]);
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t s0 = vld1q_u8(src_ptr + col);
      const uint8x16_t s1 = vld1q_u8(src_ptr + col + pixel_step);
      // out = round((s0 * tap0 + s1 * tap1) >> FILTER_BITS), per half.
      uint16x8_t sum_lo = vmull_u8(vget_low_u8(s0), tap0);
      uint16x8_t sum_hi = vmull_u8(vget_high_u8(s0), tap0);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(s1), tap1);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(s1), tap1);
      vst1q_u8(output_ptr + col,
               vcombine_u8(vrshrn_n_u16(sum_lo, FILTER_BITS),
                           vrshrn_n_u16(sum_hi, FILTER_BITS)));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
97 
// 4xM filter writes an extra row to fdata because it processes two rows at a
// time.
//
// Defines vpx_sub_pixel_variance<n>x<m>_neon(): horizontally filters the
// source at 1/8-pel xoffset into temp0, vertically filters temp0 at yoffset
// into temp1 (pixel_step n = one row of the intermediate), then returns the
// variance of temp1 against reference b.
// temp0 needs m+1 rows for the vertical pass (m+2 when n == 4, because the
// 4-wide first pass emits two rows per iteration — see note above).
#define sub_pixel_varianceNxM(n, m)                                 \
  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
    return vpx_variance##n##x##m(temp1, n, b, b_stride, sse);       \
  }
125 
// Instantiate vpx_sub_pixel_variance{W}x{H}_neon for every block size the
// RTCD table dispatches to.
sub_pixel_varianceNxM(4, 4);
sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
sub_pixel_varianceNxM(16, 8);
sub_pixel_varianceNxM(16, 16);
sub_pixel_varianceNxM(16, 32);
sub_pixel_varianceNxM(32, 16);
sub_pixel_varianceNxM(32, 32);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);
139 
// 4xM filter writes an extra row to fdata because it processes two rows at a
// time.
//
// Defines vpx_sub_pixel_avg_variance<n>x<m>_neon(): same two-pass bilinear
// filtering as the non-avg variant (horizontal into temp0, vertical into
// temp1), but the filtered block is then averaged with second_pred before
// the variance is computed. temp0 is reused as the destination of
// vpx_comp_avg_pred — it is at least n*(m+1) bytes, so the n*m averaged
// block fits.
#define sub_pixel_avg_varianceNxM(n, m)                             \
  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(              \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse,                \
      const uint8_t *second_pred) {                                 \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
                                                                    \
    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);          \
                                                                    \
    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);       \
  }
171 
// Instantiate vpx_sub_pixel_avg_variance{W}x{H}_neon for every block size
// the RTCD table dispatches to.
sub_pixel_avg_varianceNxM(4, 4);
sub_pixel_avg_varianceNxM(4, 8);
sub_pixel_avg_varianceNxM(8, 4);
sub_pixel_avg_varianceNxM(8, 8);
sub_pixel_avg_varianceNxM(8, 16);
sub_pixel_avg_varianceNxM(16, 8);
sub_pixel_avg_varianceNxM(16, 16);
sub_pixel_avg_varianceNxM(16, 32);
sub_pixel_avg_varianceNxM(32, 16);
sub_pixel_avg_varianceNxM(32, 32);
sub_pixel_avg_varianceNxM(32, 64);
sub_pixel_avg_varianceNxM(64, 32);
sub_pixel_avg_varianceNxM(64, 64);
185