/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/temporal_filter.h"
#include "aom_dsp/mathutils.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"

// For the squared error buffer, add padding for 4 samples.
#define SSE_STRIDE (BW + 4)

// When using vld1q_u16_x4 compilers may insert an alignment hint of 256 bits.
DECLARE_ALIGNED(32, static const uint16_t, kSlidingWindowMask[]) = {
  0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0x0000,
  0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
  0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
  0x0000, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF
};

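// Each 8-lane row of kSlidingWindowMask selects a 5-sample window starting at
// lane 0, 1, 2 and 3 respectively, so four overlapping 5-wide horizontal sums
// can be formed from a single vector of squared errors.

// Compute the per-pixel squared error between two blocks. Results are stored
// into frame_sse at a +2 column offset, leaving two columns of padding on
// each side of every row for the 5x5 sliding-window sums computed later.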
static inline void get_squared_error(
    const uint8_t *frame1, const uint32_t stride1, const uint8_t *frame2,
    const uint32_t stride2, const uint32_t block_width,
    const uint32_t block_height, uint16_t *frame_sse,
    const unsigned int dst_stride) {
  uint16_t *dst = frame_sse;

  uint32_t i = 0;
  do {
    uint32_t j = 0;
    do {
      uint8x16_t s = vld1q_u8(frame1 + i * stride1 + j);
      uint8x16_t r = vld1q_u8(frame2 + i * stride2 + j);

      uint8x16_t abs_diff = vabdq_u8(s, r);
      uint16x8_t sse_lo =
          vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
      uint16x8_t sse_hi =
          vmull_u8(vget_high_u8(abs_diff), vget_high_u8(abs_diff));

      vst1q_u16(dst + j + 2, sse_lo);
      vst1q_u16(dst + j + 10, sse_hi);

      j += 16;
    } while (j < block_width);

    dst += dst_stride;
  } while (++i < block_height);
}

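// Load 8 consecutive squared-error values and, for the first and last two
// columns of the block, replicate the edge sample into the padding lanes so
// that window sums at the block borders reuse the nearest valid column.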
static inline uint16x8_t load_and_pad(const uint16_t *src, const uint32_t col,
                                      const uint32_t block_width) {
  uint16x8_t s = vld1q_u16(src);

  if (col == 0) {
    const uint16_t lane2 = vgetq_lane_u16(s, 2);
    s = vsetq_lane_u16(lane2, s, 0);
    s = vsetq_lane_u16(lane2, s, 1);
  } else if (col >= block_width - 4) {
    const uint16_t lane5 = vgetq_lane_u16(s, 5);
    s = vsetq_lane_u16(lane5, s, 6);
    s = vsetq_lane_u16(lane5, s, 7);
  }
  return s;
}

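// For each pixel, sum the squared errors over a 5x5 window centred on it
// (plus the co-located luma errors for chroma planes), then turn that
// windowed error into a non-local-means style filter weight which is added
// to the per-pixel accumulator and count buffers.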
static void apply_temporal_filter(
    const uint8_t *frame, const unsigned int stride, const uint32_t block_width,
    const uint32_t block_height, const int *subblock_mses,
    unsigned int *accumulator, uint16_t *count, const uint16_t *frame_sse,
    const uint32_t *luma_sse_sum, const double inv_num_ref_pixels,
    const double decay_factor, const double inv_factor,
    const double weight_factor, const double *d_factor, int tf_wgt_calc_lvl) {
  assert(((block_width == 16) || (block_width == 32)) &&
         ((block_height == 16) || (block_height == 32)));

  uint32_t diff_sse[BH][BW];
  const uint16x8x4_t vmask = vld1q_u16_x4(kSlidingWindowMask);

  // Traverse 4 columns at a time - first and last two columns need padding.
  for (uint32_t col = 0; col < block_width; col += 4) {
    uint16x8_t vsrc[5];
    const uint16_t *src = frame_sse + col;

    // Load and pad (for first and last two columns) 3 rows from the top.
    for (int i = 2; i < 5; i++) {
      vsrc[i] = load_and_pad(src, col, block_width);
      src += SSE_STRIDE;
    }

    // Pad the top 2 rows.
    vsrc[0] = vsrc[2];
    vsrc[1] = vsrc[2];

    uint32x4_t vsum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                           vdupq_n_u32(0) };

    for (int i = 0; i < 4; i++) {
      vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[0], vmask.val[i]));
      vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[1], vmask.val[i]));
      vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[2], vmask.val[i]));
      vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[3], vmask.val[i]));
      vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[4], vmask.val[i]));
    }

    for (unsigned int row = 0; row < block_height; row++) {
      uint32x4_t sum_luma = vld1q_u32(luma_sse_sum + row * BW + col);
      uint32x4_t sum_src = horizontal_add_4d_u32x4(vsum);

      vst1q_u32(&diff_sse[row][col], vaddq_u32(sum_src, sum_luma));

      for (int i = 0; i < 4; i++) {
        uint32x4_t vsum_0 = vpaddlq_u16(vandq_u16(vsrc[0], vmask.val[i]));
        vsum[i] = vsubq_u32(vsum[i], vsum_0);
      }

      // Push all rows in the sliding window up one.
      for (int i = 0; i < 4; i++) {
        vsrc[i] = vsrc[i + 1];
      }

      if (row <= block_height - 4) {
        // Load next row into the bottom of the sliding window.
        vsrc[4] = load_and_pad(src, col, block_width);
        src += SSE_STRIDE;
      } else {
        // Pad the bottom 2 rows.
        vsrc[4] = vsrc[3];
      }

      for (int i = 0; i < 4; i++) {
        vsum[i] = vpadalq_u16(vsum[i], vandq_u16(vsrc[4], vmask.val[i]));
      }
    }
  }

  // Perform filtering.
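  // tf_wgt_calc_lvl == 0 uses the exact exp() in double precision; any other
  // value uses the faster approx_exp() single-precision approximation and
  // rounds the result with iroundpf().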
  if (tf_wgt_calc_lvl == 0) {
    for (unsigned int i = 0, k = 0; i < block_height; i++) {
      for (unsigned int j = 0; j < block_width; j++, k++) {
        const int pixel_value = frame[i * stride + j];

        const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
        const int subblock_idx =
            (i >= block_height / 2) * 2 + (j >= block_width / 2);
        const double block_error = (double)subblock_mses[subblock_idx];
        const double combined_error =
            weight_factor * window_error + block_error * inv_factor;
        // Compute filter weight.
        double scaled_error =
            combined_error * d_factor[subblock_idx] * decay_factor;
        scaled_error = AOMMIN(scaled_error, 7);
        const int weight = (int)(exp(-scaled_error) * TF_WEIGHT_SCALE);
        accumulator[k] += weight * pixel_value;
        count[k] += weight;
      }
    }
  } else {
    for (unsigned int i = 0, k = 0; i < block_height; i++) {
      for (unsigned int j = 0; j < block_width; j++, k++) {
        const int pixel_value = frame[i * stride + j];

        const double window_error = diff_sse[i][j] * inv_num_ref_pixels;
        const int subblock_idx =
            (i >= block_height / 2) * 2 + (j >= block_width / 2);
        const double block_error = (double)subblock_mses[subblock_idx];
        const double combined_error =
            weight_factor * window_error + block_error * inv_factor;
        // Compute filter weight.
        double scaled_error =
            combined_error * d_factor[subblock_idx] * decay_factor;
        scaled_error = AOMMIN(scaled_error, 7);
        const float fweight =
            approx_exp((float)-scaled_error) * TF_WEIGHT_SCALE;
        const int weight = iroundpf(fweight);
        accumulator[k] += weight * pixel_value;
        count[k] += weight;
      }
    }
  }
}

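// Apply temporal filtering to one low bit-depth 32x32 block. For each plane
// the squared error against the prediction is computed, the Y-plane errors
// are additionally aggregated into luma_sse_sum for the chroma planes, and
// the resulting filter weights are accumulated into accum and count.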
void av1_apply_temporal_filter_neon(
    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
    const int num_planes, const double *noise_levels, const MV *subblock_mvs,
    const int *subblock_mses, const int q_factor, const int filter_strength,
    int tf_wgt_calc_lvl, const uint8_t *pred, uint32_t *accum,
    uint16_t *count) {
  const int is_high_bitdepth = frame_to_filter->flags & YV12_FLAG_HIGHBITDEPTH;
  assert(block_size == BLOCK_32X32 && "Only support 32x32 block with Neon!");
  assert(TF_WINDOW_LENGTH == 5 && "Only support window length 5 with Neon!");
  assert(!is_high_bitdepth && "Only support low bit-depth with Neon!");
  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
  (void)is_high_bitdepth;

  // Block information.
  const int mb_height = block_size_high[block_size];
  const int mb_width = block_size_wide[block_size];
  // Frame information.
  const int frame_height = frame_to_filter->y_crop_height;
  const int frame_width = frame_to_filter->y_crop_width;
  const int min_frame_size = AOMMIN(frame_height, frame_width);
  // Variables to simplify combined error calculation.
  const double inv_factor = 1.0 / ((TF_WINDOW_BLOCK_BALANCE_WEIGHT + 1) *
                                   TF_SEARCH_ERROR_NORM_WEIGHT);
  const double weight_factor =
      (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
  // Adjust filtering based on q.
  // Larger q -> stronger filtering -> larger weight.
  // Smaller q -> weaker filtering -> smaller weight.
  double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
  q_decay = CLIP(q_decay, 1e-5, 1);
  if (q_factor >= TF_QINDEX_CUTOFF) {
    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
    // We do not need a clip here.
    q_decay = 0.5 * pow((double)q_factor / 64, 2);
  }
  // Smaller strength -> smaller filtering weight.
  double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
  s_decay = CLIP(s_decay, 1e-5, 1);
  double d_factor[4] = { 0 };
  uint16_t frame_sse[SSE_STRIDE * BH] = { 0 };
  uint32_t luma_sse_sum[BW * BH] = { 0 };

  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
    // Larger motion vector -> smaller filtering weight.
    const MV mv = subblock_mvs[subblock_idx];
    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
    distance_threshold = AOMMAX(distance_threshold, 1);
    d_factor[subblock_idx] = distance / distance_threshold;
    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
  }

  // Handle planes in sequence.
  int plane_offset = 0;
  for (int plane = 0; plane < num_planes; ++plane) {
    const uint32_t plane_h = mb_height >> mbd->plane[plane].subsampling_y;
    const uint32_t plane_w = mb_width >> mbd->plane[plane].subsampling_x;
    const uint32_t frame_stride =
        frame_to_filter->strides[plane == AOM_PLANE_Y ? 0 : 1];
    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;

    const uint8_t *ref = frame_to_filter->buffers[plane] + frame_offset;
    const int ss_x_shift =
        mbd->plane[plane].subsampling_x - mbd->plane[AOM_PLANE_Y].subsampling_x;
    const int ss_y_shift =
        mbd->plane[plane].subsampling_y - mbd->plane[AOM_PLANE_Y].subsampling_y;
    const int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH +
                               ((plane) ? (1 << (ss_x_shift + ss_y_shift)) : 0);
    const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
    // Larger noise -> larger filtering weight.
    const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
    // Decay factors for non-local mean approach.
    const double decay_factor = 1 / (n_decay * q_decay * s_decay);

    // Filter U-plane and V-plane using Y-plane. This is because motion
    // search is only done on Y-plane, so the information from Y-plane
    // will be more accurate. The luma sse sum is reused in both chroma
    // planes.
    if (plane == AOM_PLANE_U) {
      for (unsigned int i = 0; i < plane_h; i++) {
        for (unsigned int j = 0; j < plane_w; j++) {
          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
              luma_sse_sum[i * BW + j] += frame_sse[yy * SSE_STRIDE + xx + 2];
            }
          }
        }
      }
    }

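    // Note: the luma_sse_sum accumulation above reads the Y-plane errors left
    // in frame_sse by the previous loop iteration; frame_sse is only
    // overwritten with the current plane's errors by the call below.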
    get_squared_error(ref, frame_stride, pred + plane_offset, plane_w, plane_w,
                      plane_h, frame_sse, SSE_STRIDE);

    apply_temporal_filter(pred + plane_offset, plane_w, plane_w, plane_h,
                          subblock_mses, accum + plane_offset,
                          count + plane_offset, frame_sse, luma_sse_sum,
                          inv_num_ref_pixels, decay_factor, inv_factor,
                          weight_factor, d_factor, tf_wgt_calc_lvl);

    plane_offset += plane_h * plane_w;
  }
}

// TODO(aomedia:349450845): enable for armv7 after SIGBUS is fixed.
#if AOM_ARCH_AARCH64
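// Estimate the noise level of a single plane: for every interior pixel whose
// Sobel gradient magnitude is below edge_thresh, accumulate the absolute
// response of the 3x3 Laplacian operator. The returned estimate is
// SQRT_PI_BY_2 * mean(|Laplacian|) / 6, or -1.0 if fewer than 16 such pixels
// were found.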
double av1_estimate_noise_from_single_plane_neon(const uint8_t *src, int height,
                                                 int width, int stride,
                                                 int edge_thresh) {
  uint16x8_t thresh = vdupq_n_u16(edge_thresh);
  uint32x4_t acc = vdupq_n_u32(0);
  // Count is in theory positive as it counts the number of times we're under
  // the threshold, but it will be counted negatively in order to make best use
  // of the vclt instruction, which sets every bit of a lane to 1 when the
  // condition is true.
  int32x4_t count = vdupq_n_s32(0);
  int final_count = 0;
  int64_t final_acc = 0;
  const uint8_t *src_start = src + stride + 1;
  int h = 1;

  do {
    int w = 1;
    const uint8_t *src_ptr = src_start;

    while (w <= (width - 1) - 16) {
      uint8x16_t mat[3][3];
      mat[0][0] = vld1q_u8(src_ptr - stride - 1);
      mat[0][1] = vld1q_u8(src_ptr - stride);
      mat[0][2] = vld1q_u8(src_ptr - stride + 1);
      mat[1][0] = vld1q_u8(src_ptr - 1);
      mat[1][1] = vld1q_u8(src_ptr);
      mat[1][2] = vld1q_u8(src_ptr + 1);
      mat[2][0] = vld1q_u8(src_ptr + stride - 1);
      mat[2][1] = vld1q_u8(src_ptr + stride);
      mat[2][2] = vld1q_u8(src_ptr + stride + 1);

      // Compute Sobel gradients.
      uint16x8_t gxa_lo =
          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[2][0]));
      uint16x8_t gxa_hi =
          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[2][0]));
      uint16x8_t gxb_lo =
          vaddl_u8(vget_low_u8(mat[0][2]), vget_low_u8(mat[2][2]));
      uint16x8_t gxb_hi =
          vaddl_u8(vget_high_u8(mat[0][2]), vget_high_u8(mat[2][2]));
      gxa_lo = vaddq_u16(
          gxa_lo, vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][0])));
      gxa_hi = vaddq_u16(
          gxa_hi, vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][0])));
      gxb_lo = vaddq_u16(
          gxb_lo, vaddl_u8(vget_low_u8(mat[1][2]), vget_low_u8(mat[1][2])));
      gxb_hi = vaddq_u16(
          gxb_hi, vaddl_u8(vget_high_u8(mat[1][2]), vget_high_u8(mat[1][2])));

      uint16x8_t gya_lo =
          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
      uint16x8_t gya_hi =
          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
      uint16x8_t gyb_lo =
          vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
      uint16x8_t gyb_hi =
          vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
      gya_lo = vaddq_u16(
          gya_lo, vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[0][1])));
      gya_hi = vaddq_u16(
          gya_hi, vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[0][1])));
      gyb_lo = vaddq_u16(
          gyb_lo, vaddl_u8(vget_low_u8(mat[2][1]), vget_low_u8(mat[2][1])));
      gyb_hi = vaddq_u16(
          gyb_hi, vaddl_u8(vget_high_u8(mat[2][1]), vget_high_u8(mat[2][1])));

      uint16x8_t ga_lo = vabaq_u16(vabdq_u16(gxa_lo, gxb_lo), gya_lo, gyb_lo);
      uint16x8_t ga_hi = vabaq_u16(vabdq_u16(gxa_hi, gxb_hi), gya_hi, gyb_hi);

      // Check which vector elements are under the threshold. The Laplacian is
      // then unconditionally computed and we accumulate zeros if we're not
      // under the threshold. This is much faster than using an if statement.
      uint16x8_t thresh_u16_lo = vcltq_u16(ga_lo, thresh);
      uint16x8_t thresh_u16_hi = vcltq_u16(ga_hi, thresh);

      uint16x8_t center_lo = vshll_n_u8(vget_low_u8(mat[1][1]), 2);
      uint16x8_t center_hi = vshll_n_u8(vget_high_u8(mat[1][1]), 2);

      uint16x8_t adj0_lo =
          vaddl_u8(vget_low_u8(mat[0][1]), vget_low_u8(mat[2][1]));
      uint16x8_t adj0_hi =
          vaddl_u8(vget_high_u8(mat[0][1]), vget_high_u8(mat[2][1]));
      uint16x8_t adj1_lo =
          vaddl_u8(vget_low_u8(mat[1][0]), vget_low_u8(mat[1][2]));
      uint16x8_t adj1_hi =
          vaddl_u8(vget_high_u8(mat[1][0]), vget_high_u8(mat[1][2]));
      uint16x8_t adj_lo = vaddq_u16(adj0_lo, adj1_lo);
      adj_lo = vaddq_u16(adj_lo, adj_lo);
      uint16x8_t adj_hi = vaddq_u16(adj0_hi, adj1_hi);
      adj_hi = vaddq_u16(adj_hi, adj_hi);

      uint16x8_t diag0_lo =
          vaddl_u8(vget_low_u8(mat[0][0]), vget_low_u8(mat[0][2]));
      uint16x8_t diag0_hi =
          vaddl_u8(vget_high_u8(mat[0][0]), vget_high_u8(mat[0][2]));
      uint16x8_t diag1_lo =
          vaddl_u8(vget_low_u8(mat[2][0]), vget_low_u8(mat[2][2]));
      uint16x8_t diag1_hi =
          vaddl_u8(vget_high_u8(mat[2][0]), vget_high_u8(mat[2][2]));
      uint16x8_t diag_lo = vaddq_u16(diag0_lo, diag1_lo);
      uint16x8_t diag_hi = vaddq_u16(diag0_hi, diag1_hi);

      uint16x8_t v_lo = vaddq_u16(center_lo, diag_lo);
      v_lo = vabdq_u16(v_lo, adj_lo);
      uint16x8_t v_hi = vaddq_u16(center_hi, diag_hi);
      v_hi = vabdq_u16(v_hi, adj_hi);

      acc = vpadalq_u16(acc, vandq_u16(v_lo, thresh_u16_lo));
      acc = vpadalq_u16(acc, vandq_u16(v_hi, thresh_u16_hi));

      // Add -1 for each lane where the gradient is under the threshold.
      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_lo));
      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16_hi));

      w += 16;
      src_ptr += 16;
    }

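    // Process the remaining columns 8 at a time with the same computation on
    // single 8-lane vectors.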
    if (w <= (width - 1) - 8) {
      uint8x8_t mat[3][3];
      mat[0][0] = vld1_u8(src_ptr - stride - 1);
      mat[0][1] = vld1_u8(src_ptr - stride);
      mat[0][2] = vld1_u8(src_ptr - stride + 1);
      mat[1][0] = vld1_u8(src_ptr - 1);
      mat[1][1] = vld1_u8(src_ptr);
      mat[1][2] = vld1_u8(src_ptr + 1);
      mat[2][0] = vld1_u8(src_ptr + stride - 1);
      mat[2][1] = vld1_u8(src_ptr + stride);
      mat[2][2] = vld1_u8(src_ptr + stride + 1);

      // Compute Sobel gradients.
      uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
      uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
      gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
      gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));

      uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
      uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
      gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
      gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));

      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);

      // Check which vector elements are under the threshold. The Laplacian is
      // then unconditionally computed and we accumulate zeros if we're not
      // under the threshold. This is much faster than using an if statement.
      uint16x8_t thresh_u16 = vcltq_u16(ga, thresh);

      uint16x8_t center = vshll_n_u8(mat[1][1], 2);

      uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
      uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
      uint16x8_t adj = vaddq_u16(adj0, adj1);
      adj = vaddq_u16(adj, adj);

      uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
      uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
      uint16x8_t diag = vaddq_u16(diag0, diag1);

      uint16x8_t v = vaddq_u16(center, diag);
      v = vabdq_u16(v, adj);

      acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
      // Add -1 for each lane where the gradient is under the threshold.
      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));

      w += 8;
      src_ptr += 8;
    }

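    // Process a final group of 4 columns. The loads still fill 8-lane
    // vectors, so the upper four lanes are masked off before accumulating.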
    if (w <= (width - 1) - 4) {
      uint16x8_t mask = vcombine_u16(vdup_n_u16(65535), vdup_n_u16(0));
      uint8x8_t mat[3][3];
      mat[0][0] = load_unaligned_u8_4x1(src_ptr - stride - 1);
      mat[0][1] = load_unaligned_u8_4x1(src_ptr - stride);
      mat[0][2] = load_unaligned_u8_4x1(src_ptr - stride + 1);
      mat[1][0] = load_unaligned_u8_4x1(src_ptr - 1);
      mat[1][1] = load_unaligned_u8_4x1(src_ptr);
      mat[1][2] = load_unaligned_u8_4x1(src_ptr + 1);
      mat[2][0] = load_unaligned_u8_4x1(src_ptr + stride - 1);
      mat[2][1] = load_unaligned_u8_4x1(src_ptr + stride);
      mat[2][2] = load_unaligned_u8_4x1(src_ptr + stride + 1);

      // Compute Sobel gradients.
      uint16x8_t gxa = vaddl_u8(mat[0][0], mat[2][0]);
      uint16x8_t gxb = vaddl_u8(mat[0][2], mat[2][2]);
      gxa = vaddq_u16(gxa, vaddl_u8(mat[1][0], mat[1][0]));
      gxb = vaddq_u16(gxb, vaddl_u8(mat[1][2], mat[1][2]));

      uint16x8_t gya = vaddl_u8(mat[0][0], mat[0][2]);
      uint16x8_t gyb = vaddl_u8(mat[2][0], mat[2][2]);
      gya = vaddq_u16(gya, vaddl_u8(mat[0][1], mat[0][1]));
      gyb = vaddq_u16(gyb, vaddl_u8(mat[2][1], mat[2][1]));

      uint16x8_t ga = vabaq_u16(vabdq_u16(gxa, gxb), gya, gyb);

      // Check which vector elements are under the threshold. The Laplacian is
      // then unconditionally computed and we accumulate zeros if we're not
      // under the threshold. This is much faster than using an if statement.
      uint16x8_t thresh_u16 = vandq_u16(vcltq_u16(ga, thresh), mask);

      uint16x8_t center = vshll_n_u8(mat[1][1], 2);

      uint16x8_t adj0 = vaddl_u8(mat[0][1], mat[2][1]);
      uint16x8_t adj1 = vaddl_u8(mat[1][0], mat[1][2]);
      uint16x8_t adj = vaddq_u16(adj0, adj1);
      adj = vaddq_u16(adj, adj);

      uint16x8_t diag0 = vaddl_u8(mat[0][0], mat[0][2]);
      uint16x8_t diag1 = vaddl_u8(mat[2][0], mat[2][2]);
      uint16x8_t diag = vaddq_u16(diag0, diag1);

      uint16x8_t v = vaddq_u16(center, diag);
      v = vabdq_u16(v, adj);

      acc = vpadalq_u16(acc, vandq_u16(v, thresh_u16));
      // Add -1 for each lane where the gradient is under the threshold.
      count = vpadalq_s16(count, vreinterpretq_s16_u16(thresh_u16));

      w += 4;
      src_ptr += 4;
    }

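    // Handle any remaining columns with scalar code.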
    while (w < width - 1) {
      int mat[3][3];
      mat[0][0] = *(src_ptr - stride - 1);
      mat[0][1] = *(src_ptr - stride);
      mat[0][2] = *(src_ptr - stride + 1);
      mat[1][0] = *(src_ptr - 1);
      mat[1][1] = *(src_ptr);
      mat[1][2] = *(src_ptr + 1);
      mat[2][0] = *(src_ptr + stride - 1);
      mat[2][1] = *(src_ptr + stride);
      mat[2][2] = *(src_ptr + stride + 1);

      // Compute Sobel gradients.
      const int gx = (mat[0][0] - mat[0][2]) + (mat[2][0] - mat[2][2]) +
                     2 * (mat[1][0] - mat[1][2]);
      const int gy = (mat[0][0] - mat[2][0]) + (mat[0][2] - mat[2][2]) +
                     2 * (mat[0][1] - mat[2][1]);
      const int ga = abs(gx) + abs(gy);

      // Accumulate Laplacian.
      const int is_under = ga < edge_thresh;
      const int v = 4 * mat[1][1] -
                    2 * (mat[0][1] + mat[2][1] + mat[1][0] + mat[1][2]) +
                    (mat[0][0] + mat[0][2] + mat[2][0] + mat[2][2]);
      final_acc += abs(v) * is_under;
      final_count += is_under;

      src_ptr++;
      w++;
    }
    src_start += stride;
  } while (++h < height - 1);

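  // Reduce the vector accumulators and combine them with the totals gathered
  // by the scalar tail before computing the final estimate.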
  // We counted negatively, so subtract to get the final value.
  final_count -= horizontal_add_s32x4(count);
  final_acc += horizontal_long_add_u32x4(acc);
  return (final_count < 16)
             ? -1.0
             : (double)final_acc / (6 * final_count) * SQRT_PI_BY_2;
}
#endif  // AOM_ARCH_AARCH64