1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15
16 #include "vpx/vpx_integer.h"
17 #include "vp9/common/vp9_reconinter.h"
18 #include "vp9/encoder/vp9_context_tree.h"
19 #include "vp9/encoder/vp9_denoiser.h"
20 #include "vpx_mem/vpx_mem.h"
21
// Reduces the 16 signed 8-bit lanes of v_sum_diff_total to one int.
//
// The lanes are summed with pairwise widening additions (8 -> 16 -> 32 -> 64
// bits) and the two remaining 64-bit halves are then added together. Callers
// in this file use the result as the total pixel difference accumulated for a
// block (or a band of a block).
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
  const int16x8_t pair_sums_s16 = vpaddlq_s8(v_sum_diff_total);
  const int32x4_t quad_sums_s32 = vpaddlq_s16(pair_sums_s16);
  const int64x2_t half_sums_s64 = vpaddlq_s32(quad_sums_s32);
  const int64x1_t total_s64 =
      vqadd_s64(vget_high_s64(half_sums_s64), vget_low_s64(half_sums_s64));
  // Sixteen int8 values sum to far less than 32 bits of range, so lane 0 of
  // the 32-bit view of the total is returned as the result.
  return vget_lane_s32(vreinterpret_s32_s64(total_s64), 0);
}
32
// Denoise a 16x1 vector.
//
// Loads 16 pixels from sig and from mc_running_avg_y, moves every sig pixel
// towards the matching mc_running_avg_y pixel and stores the 16 results to
// running_avg_y (which is only written here, never read).
//
// Per lane, with d = |sig - mc_running_avg_y|:
//   - d below v_level1_threshold: the adjustment is d itself;
//   - otherwise: v_level1_adjustment, plus v_delta_level_1_and_2 when d
//     reaches v_level2_threshold, plus v_delta_level_2_and_3 when d reaches
//     v_level3_threshold.
//
// Returns v_sum_diff_total with the signed per-lane adjustments added. That
// add is a plain (wrapping) 8-bit add, so the caller is responsible for
// reducing the accumulator before a lane can overflow.
static INLINE int8x16_t denoiser_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
    const uint8x16_t v_delta_level_1_and_2,
    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
  const uint8x16_t src = vld1q_u8(sig);
  const uint8x16_t mc_avg = vld1q_u8(mc_running_avg_y);

  // Size of the gap, and in which direction sig has to move to close it:
  // upwards where sig < mc_avg, downwards where sig > mc_avg.
  const uint8x16_t gap = vabdq_u8(src, mc_avg);
  const uint8x16_t move_up_mask = vcltq_u8(src, mc_avg);
  const uint8x16_t move_down_mask = vcgtq_u8(src, mc_avg);

  // Lanes whose gap has reached each of the three thresholds.
  const uint8x16_t reached_level1 = vcleq_u8(v_level1_threshold, gap);
  const uint8x16_t reached_level2 = vcleq_u8(v_level2_threshold, gap);
  const uint8x16_t reached_level3 = vcleq_u8(v_level3_threshold, gap);

  // Adjustment used once level 1 is reached: the level 1 base value plus the
  // extra deltas of the higher levels that were reached as well.
  const uint8x16_t level12_step = vaddq_u8(
      v_level1_adjustment, vandq_u8(reached_level2, v_delta_level_1_and_2));
  const uint8x16_t level123_step = vaddq_u8(
      level12_step, vandq_u8(reached_level3, v_delta_level_2_and_3));

  // Below level 1 the gap is closed completely; from level 1 on the stepped
  // value is used instead.
  const uint8x16_t step = vbslq_u8(reached_level1, level123_step, gap);

  // Split the step by direction; for every lane at most one of the two is
  // non-zero.
  const uint8x16_t step_up = vandq_u8(move_up_mask, step);
  const uint8x16_t step_down = vandq_u8(move_down_mask, step);

  const uint8x16_t filtered =
      vqsubq_u8(vqaddq_u8(src, step_up), step_down);
  const int8x16_t signed_step = vqsubq_s8(vreinterpretq_s8_u8(step_up),
                                          vreinterpretq_s8_u8(step_down));

  vst1q_u8(running_avg_y, filtered);

  // Fold this row's signed adjustments into the running per-lane totals.
  return vaddq_s8(v_sum_diff_total, signed_step);
}
95
// Second-pass correction of 16 already denoised pixels.
//
// Reads the current contents of running_avg_y and moves every pixel back
// towards sig by min(|sig - mc_running_avg_y|, k_delta): pixels are lowered
// where sig < mc_running_avg_y and raised where sig > mc_running_avg_y, i.e.
// in the opposite direction of denoiser_16x1_neon. The result is stored back
// to running_avg_y.
//
// Returns v_sum_diff_total with the signed per-lane corrections added (plain
// wrapping 8-bit add).
static INLINE int8x16_t denoiser_adjust_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
  const uint8x16_t current = vld1q_u8(running_avg_y);
  const uint8x16_t src = vld1q_u8(sig);
  const uint8x16_t mc_avg = vld1q_u8(mc_running_avg_y);

  const uint8x16_t sig_below_mask = vcltq_u8(src, mc_avg);
  const uint8x16_t sig_above_mask = vcgtq_u8(src, mc_avg);

  // The correction never exceeds k_delta nor the original gap.
  const uint8x16_t step = vminq_u8(vabdq_u8(src, mc_avg), k_delta);

  const uint8x16_t step_down = vandq_u8(sig_below_mask, step);
  const uint8x16_t step_up = vandq_u8(sig_above_mask, step);

  const uint8x16_t corrected =
      vqaddq_u8(vqsubq_u8(current, step_down), step_up);
  const int8x16_t signed_step = vqsubq_s8(vreinterpretq_s8_u8(step_up),
                                          vreinterpretq_s8_u8(step_down));

  vst1q_u8(running_avg_y, corrected);

  return vaddq_s8(v_sum_diff_total, signed_step);
}
129
// Denoise 8x8 and 8x16 blocks.
//
// Every loop iteration packs two consecutive rows of `width` pixels into one
// 16-byte row of a local buffer so the 16x1 kernels can be reused, then splits
// the 16 result bytes into two 8-byte stores to running_avg_y. Because of the
// fixed 8-byte stores and 16-byte buffer rows, the routine only works for
// width == 8, which is the value the dispatcher in this file passes.
//
// sig / mc_running_avg_y are the source and motion-compensated running
// average, running_avg_y receives the denoised pixels; the *_stride arguments
// are the row pitches of the respective planes.
//
// Returns FILTER_BLOCK when the summed adjustment stays within
// total_adj_strong_thresh(bs, increase_denoising), either directly or after a
// weaker second pass, and COPY_BLOCK otherwise.
static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude,
                                 int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];

  const uint8x16_t v_level1_adjustment = vdupq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  // Number of row PAIRS in the block: each iteration consumes two rows.
  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;

  int8x16_t v_sum_diff_total = vdupq_n_s8(0);

  for (r = 0; r < b_height; ++r) {
    // Pack two input rows side by side. The packed rows are kept so that the
    // optional second pass below can reuse them.
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    // running_buffer[r] is deliberately not pre-loaded: denoiser_16x1_neon
    // only stores to it (all 16 bytes) and never reads its previous contents.
    v_sum_diff_total = denoiser_16x1_neon(
        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
        v_level1_threshold, v_level2_threshold, v_level3_threshold,
        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
        v_sum_diff_total);
    {
      // Unpack: low half is the first row, high half the second one.
      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
      vst1_u8(running_avg_y, vget_low_u8(v_running_buffer));
      vst1_u8(running_avg_y + avg_y_stride, vget_high_u8(v_running_buffer));
    }
    // Update pointers for next iteration.
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }

  sum_diff = horizontal_add_s8x16(v_sum_diff_total);
  sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
  if (abs(sum_diff) > sum_diff_thresh) {
    // Before returning to copy the block (i.e., apply no denoising),
    // check if we can still apply some (weaker) temporal filtering to
    // this block, that would otherwise not be denoised at all. Simplest
    // is to apply an additional adjustment to running_avg_y to bring it
    // closer to sig. The adjustment is capped by a maximum delta, and
    // chosen such that in most cases the resulting sum_diff will be
    // within the acceptable range given by sum_diff_thresh.

    // The delta is set by the excess of absolute pixel diff over the
    // threshold.
    const int delta =
        ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
    // Only apply the adjustment for max delta up to 3.
    if (delta >= 4) {
      return COPY_BLOCK;
    }
    {
      const uint8x16_t k_delta = vdupq_n_u8(delta);
      // Rewind the output pointer to the first row of the block.
      running_avg_y -= avg_y_stride * (b_height << 1);
      for (r = 0; r < b_height; ++r) {
        // running_buffer[r] still holds the first-pass result for this row
        // pair; it is corrected in place and written out again.
        v_sum_diff_total = denoiser_adjust_16x1_neon(
            sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
            v_sum_diff_total);
        {
          const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
          vst1_u8(running_avg_y, vget_low_u8(v_running_buffer));
          vst1_u8(running_avg_y + avg_y_stride,
                  vget_high_u8(v_running_buffer));
        }
        // Update pointers for next iteration.
        running_avg_y += (avg_y_stride << 1);
      }
    }
    // v_sum_diff_total now carries both passes' contributions.
    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
    if (abs(sum_diff) > sum_diff_thresh) {
      return COPY_BLOCK;
    }
  }

  return FILTER_BLOCK;
}
230
// Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks (the
// dispatcher in this file also routes 16x8 here, which is why that size is
// special-cased below).
//
// The block is processed row by row in 16-pixel columns. Signed adjustments
// are accumulated in one 8-bit vector per (column, band of 16 rows) and are
// reduced into sum_diff at the end of each band, or after row 7 for 16x8.
//
// Returns FILTER_BLOCK when |sum_diff| stays within
// total_adj_strong_thresh(bs, increase_denoising), either directly or after a
// weaker second pass over the block, and COPY_BLOCK otherwise.
static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude) {
  const int low_motion = motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD;
  const int shift_inc = (increase_denoising && low_motion) ? 1 : 0;

  const uint8x16_t level1_adjust = vdupq_n_u8(low_motion ? 4 + shift_inc : 3);
  const uint8x16_t delta_1_to_2 = vdupq_n_u8(1);
  const uint8x16_t delta_2_to_3 = vdupq_n_u8(2);
  const uint8x16_t level1_thresh = vdupq_n_u8(4 + shift_inc);
  const uint8x16_t level2_thresh = vdupq_n_u8(8);
  const uint8x16_t level3_thresh = vdupq_n_u8(16);

  const int b_width = (4 << b_width_log2_lookup[bs]);
  const int b_height = (4 << b_height_log2_lookup[bs]);
  const int num_cols16 = b_width >> 4;

  // acc[column][band]: per-lane adjustment totals of one 16-wide column over
  // one band of 16 rows.
  int8x16_t acc[4][4];
  int row, col, sum_diff = 0;

  for (col = 0; col < num_cols16; ++col) {
    for (row = 0; row < 4; ++row) {
      acc[col][row] = vdupq_n_s8(0);
    }
  }

  for (row = 0; row < b_height; ++row) {
    const int band = row >> 4;

    for (col = 0; col < num_cols16; ++col) {
      const int x = col << 4;
      acc[col][band] = denoiser_16x1_neon(
          sig + x, mc_running_avg_y + x, running_avg_y + x, level1_thresh,
          level2_thresh, level3_thresh, level1_adjust, delta_1_to_2,
          delta_2_to_3, acc[col][band]);
    }

    // Reduce the accumulators once a band is complete.
    if ((row & 0xf) == 0xf || (bs == BLOCK_16X8 && row == 7)) {
      for (col = 0; col < num_cols16; ++col) {
        sum_diff += horizontal_add_s8x16(acc[col][band]);
      }
    }

    // Step to the next row.
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // The first pass changed the block too much. Try a weaker correction
      // whose per-pixel size grows with the excess over the threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta >= 4) {
        return COPY_BLOCK;
      }
      {
        const uint8x16_t k_delta = vdupq_n_u8(delta);

        // Rewind all three pointers to the first row of the block.
        sig -= sig_stride * b_height;
        mc_running_avg_y -= mc_avg_y_stride * b_height;
        running_avg_y -= avg_y_stride * b_height;

        // The vector accumulators are intentionally not cleared, so the new
        // sum_diff covers the first pass plus the correction.
        sum_diff = 0;

        for (row = 0; row < b_height; ++row) {
          const int band = row >> 4;

          for (col = 0; col < num_cols16; ++col) {
            const int x = col << 4;
            acc[col][band] = denoiser_adjust_16x1_neon(
                sig + x, mc_running_avg_y + x, running_avg_y + x, k_delta,
                acc[col][band]);
          }

          if ((row & 0xf) == 0xf || (bs == BLOCK_16X8 && row == 7)) {
            for (col = 0; col < num_cols16; ++col) {
              sum_diff += horizontal_add_s8x16(acc[col][band]);
            }
          }

          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
      }

      if (abs(sum_diff) > sum_diff_thresh) {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}
333
// NEON entry point of the denoiser filter.
//
// Dispatches on the block size: blocks that are at least 16 pixels wide
// (including 16x8) go to vp9_denoiser_NxM_neon, 8x8 and 8x16 go to
// vp9_denoiser_8xN_neon with a width of 8. Every other block size is not
// filtered and COPY_BLOCK is returned.
//
// Returns the result of the selected kernel (FILTER_BLOCK or COPY_BLOCK).
int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
                             const uint8_t *mc_avg, int mc_avg_stride,
                             uint8_t *avg, int avg_stride,
                             int increase_denoising, BLOCK_SIZE bs,
                             int motion_magnitude) {
  switch (bs) {
    case BLOCK_16X16:
    case BLOCK_32X32:
    case BLOCK_64X64:
    case BLOCK_16X32:
    case BLOCK_16X8:
    case BLOCK_32X16:
    case BLOCK_32X64:
    case BLOCK_64X32:
      return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                   avg_stride, increase_denoising, bs,
                                   motion_magnitude);
    case BLOCK_8X8:
    case BLOCK_8X16:
      return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                   avg_stride, increase_denoising, bs,
                                   motion_magnitude, 8);
    default:
      // Unsupported block size: leave the block undenoised.
      return COPY_BLOCK;
  }
}
353