/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"

static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
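
// Each filter pair { f0, f1 } sums to 128 (1 << FILTER_BITS) and is indexed
// by the sub-pel offset in eighth-pel units, i.e. { 128 - 16 * k, 16 * k }
// for offset k. Per pixel, the NEON kernels below compute the equivalent of
// this scalar expression (a sketch of the arithmetic, not code used here):
//
//   out = (src[0] * f0 + src[pixel_step] * f1 + 64) >> FILTER_BITS;
//
// where the rounding term 64 is supplied by vrshrn_n_u16.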

// Process a block exactly 4 wide and a multiple of 2 high.
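// Two rows are filtered per iteration: load_unaligned_u8() packs the
// 4-pixel row at src_ptr and the row at src_ptr + src_pixels_per_line into
// one 8-lane vector, so a single widening multiply-accumulate covers both.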
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; i += 2) {
    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
    const uint8x8_t src_1 =
        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(output_ptr, out);
    src_ptr += 2 * src_pixels_per_line;
    output_ptr += 8;
  }
}

// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(output_ptr, out);
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}

// Process a block which is a multiple of 16 wide and any height.
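// vmull_u8 widens only 8 lanes at a time, so each 16-wide vector is split
// into low and high halves, filtered separately, and recombined with
// vcombine_u8 before the store.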
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

// The 4xM filter writes an extra row to temp0 because it processes two rows
// at a time.
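// Each generated function runs two passes: the horizontal filter first
// (pixel_step 1 over the source stride) into temp0, then the vertical
// filter (pixel_step n over the packed n-wide temp0) into temp1, whose
// variance against the reference block b is then computed.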
#define sub_pixel_varianceNxM(n, m)                                 \
  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
    return vpx_variance##n##x##m(temp1, n, b, b_stride, sse);       \
  }

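// Usage sketch with hypothetical buffers and strides: variance of an 8x8
// block sampled at eighth-pel offset (2, 3) from pred, against src:
//
//   uint32_t sse;
//   const uint32_t var = vpx_sub_pixel_variance8x8_neon(
//       pred, pred_stride, /*xoffset=*/2, /*yoffset=*/3, src, src_stride,
//       &sse);
//
// xoffset and yoffset index bilinear_filters and must be in [0, 7].
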
sub_pixel_varianceNxM(4, 4);
sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
sub_pixel_varianceNxM(16, 8);
sub_pixel_varianceNxM(16, 16);
sub_pixel_varianceNxM(16, 32);
sub_pixel_varianceNxM(32, 16);
sub_pixel_varianceNxM(32, 32);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);

// The 4xM filter writes an extra row to temp0 because it processes two rows
// at a time.
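// The avg variants additionally round-average the filtered block in temp1
// with second_pred (an n x m buffer) via vpx_comp_avg_pred(), writing the
// result into temp0, and compute the variance of that combined prediction.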
#define sub_pixel_avg_varianceNxM(n, m)                             \
  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(              \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse,                \
      const uint8_t *second_pred) {                                 \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
                                                                    \
    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);          \
                                                                    \
    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);       \
  }

sub_pixel_avg_varianceNxM(4, 4);
sub_pixel_avg_varianceNxM(4, 8);
sub_pixel_avg_varianceNxM(8, 4);
sub_pixel_avg_varianceNxM(8, 8);
sub_pixel_avg_varianceNxM(8, 16);
sub_pixel_avg_varianceNxM(16, 8);
sub_pixel_avg_varianceNxM(16, 16);
sub_pixel_avg_varianceNxM(16, 32);
sub_pixel_avg_varianceNxM(32, 16);
sub_pixel_avg_varianceNxM(32, 32);
sub_pixel_avg_varianceNxM(32, 64);
sub_pixel_avg_varianceNxM(64, 32);
sub_pixel_avg_varianceNxM(64, 64);