1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <arm_neon.h>
13
14 #include "vpx_dsp/vpx_dsp_common.h"
15 #include "vp9/encoder/vp9_encoder.h"
16 #include "vpx_ports/mem.h"
17
// Branch-prediction hints for conditions.
// The operand is collapsed to 0 or 1 with '!!' before it is handed to
// __builtin_expect, so the hint stays correct for any truthy value (not only
// the literal 1). Both macros evaluate to that 0/1 truth value, and the
// fallback is written the same way so the result does not depend on the
// compiler.
#ifdef __GNUC__
#define LIKELY(v) __builtin_expect(!!(v), 1)
#define UNLIKELY(v) __builtin_expect(!!(v), 0)
#else
#define LIKELY(v) (!!(v))
#define UNLIKELY(v) (!!(v))
#endif
25
pack_int_mv(int16_t row,int16_t col)26 static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
27 int_mv result;
28 result.as_mv.row = row;
29 result.as_mv.col = col;
30 return result;
31 }
32
// Returns the largest of the four 32-bit lanes of 'v'.
static INLINE uint32_t dsad_horizontal_max_u32(uint32x4_t v) {
#if VPX_ARCH_AARCH64
  return vmaxvq_u32(v);
#else
  const uint32x2_t halves = vmax_u32(vget_low_u32(v), vget_high_u32(v));
  return vget_lane_u32(vpmax_u32(halves, halves), 0);
#endif
}

// Returns the smallest of the four 32-bit lanes of 'v'.
static INLINE uint32_t dsad_horizontal_min_u32(uint32x4_t v) {
#if VPX_ARCH_AARCH64
  return vminvq_u32(v);
#else
  const uint32x2_t halves = vmin_u32(vget_low_u32(v), vget_high_u32(v));
  return vget_lane_u32(vpmin_u32(halves, halves), 0);
#endif
}

/*
 * NEON diamond search: evaluates four candidate positions per iteration and
 * keeps the one with the lowest (SAD + motion vector cost).
 *
 * The implementation relies on 3 properties of the cost lookup tables that
 * are constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in
 * vp9_encoder.c.
 * For the joint cost:
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]
 * For the component costs:
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]
 *       (both components share the same costs)
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]
 *       (the cost function is even)
 * If any of these stops holding, this function must be modified before it
 * can be used; until then revert to the C implementation, which does not
 * depend on them.
 *
 * x            - supplies the MV limits, the cost tables and the source and
 *                reference buffers of plane 0.
 * cfg          - search site table (MV steps and matching buffer offsets).
 * ref_mv       - starting position; its row/col are used directly as a pixel
 *                offset into the reference buffer. Only read here.
 * start_mv_sad - score the starting position is credited with; candidates
 *                must beat it to be selected.
 * best_mv      - receives the position selected by the search.
 * search_param - number of leading steps of 'cfg' to skip.
 * sad_per_bit  - multiplier applied to the table cost of a candidate.
 * num00        - receives the number of steps after which the best position
 *                was still the starting position.
 * sad_fn_ptr   - provides sdx4df, which computes four SADs in one call.
 * center_mv    - MV the cost is measured against; shifted right by 3 before
 *                use.
 *
 * Returns the best score found (start_mv_sad if nothing improved on it).
 */
int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
                                const search_site_config *cfg, MV *ref_mv,
                                uint32_t start_mv_sad, MV *best_mv,
                                int search_param, int sad_per_bit, int *num00,
                                const vp9_sad_fn_ptr_t *sad_fn_ptr,
                                const MV *center_mv) {
  // Lane numbers, used to recover which lane produced the minimum score.
  static const uint32_t lane_numbers[4] = { 0, 1, 2, 3 };
  const uint32x4_t v_lane_numbers = vld1q_u32(lane_numbers);

  // Loop-invariant constants.
  const int32x4_t v_zero = vdupq_n_s32(0);
  const uint32x4_t v_all_ones = vdupq_n_u32(0xffffffff);
  const int32x4_t v_round = vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1));

  // Search bounds, one packed (row, col) pair replicated in every 32-bit lane.
  const int_mv upper_mv =
      pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
  const int16x8_t v_upper_mv =
      vreinterpretq_s16_s32(vdupq_n_s32(upper_mv.as_int));
  const int_mv lower_mv =
      pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
  const int16x8_t v_lower_mv =
      vreinterpretq_s16_s32(vdupq_n_s32(lower_mv.as_int));

  const int32x4_t v_sad_per_bit = vdupq_n_s32(sad_per_bit);

  // Only two distinct joint costs are needed: index 0 and "anything else"
  // (see the table properties in the function comment).
  const int32x4_t v_joint_cost_zero = vdupq_n_s32(x->nmvjointsadcost[0]);
  const int32x4_t v_joint_cost_nonzero = vdupq_n_s32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the
  // number of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  // MV that candidate costs are measured against.
  const int_mv cost_center =
      pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
  const int16x8_t v_cost_center =
      vreinterpretq_s16_s32(vdupq_n_s32(cost_center.as_int));

  const int ref_row = ref_mv->row;
  const int ref_col = ref_mv->col;

  // Best position so far, and the best position seen during the current step.
  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  int16x8_t v_bmv = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  // Address of the starting position inside the reference buffer.
  const uint8_t *const in_what =
      x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;

  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
  // The best address is kept replicated in pointer-sized lanes so that the
  // site offsets can be added to it with a single vector add.
#if VPX_ARCH_AARCH64
  int64x2_t v_best_addr = vdupq_n_s64((intptr_t)best_address);
#else
  int32x4_t v_best_addr = vdupq_n_s32((intptr_t)best_address);
#endif
  unsigned int best_sad = start_mv_sad;
  int site, n, step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  *num00 = 0;

  // 'site' indexes the site table and keeps advancing across steps.
  for (site = 0, step = 0; step < tot_steps; step++) {
    // Sites are consumed four at a time.
    for (n = 0; n < cfg->searches_per_step; n += 4, site += 4) {
      int16x8_t v_abs_diff;
      uint32x4_t v_inside, v_outside;
      int32x4_t v_cost, v_sad;
#if VPX_ARCH_AARCH64
      int64x2_t v_cand_addr[2];
#else
      int32x4_t v_cand_addr[1];
#endif
      uint32_t local_best_sad;

      // Four candidate MVs: the current best plus four site steps.
      const int16x8_t v_site_mv = vld1q_s16((const int16_t *)&ss_mv[site]);
      const int16x8_t v_cand_mv = vaddq_s16(v_bmv, v_site_mv);
      // The same candidates clamped to the search bounds.
      const int16x8_t v_cand_clamped =
          vmaxq_s16(vminq_s16(v_cand_mv, v_upper_mv), v_lower_mv);

      // A candidate is inside the search area exactly when clamping left both
      // of its components untouched.
      v_inside = vceqq_s32(vreinterpretq_s32_s16(v_cand_clamped),
                           vreinterpretq_s32_s16(v_cand_mv));

      // Nothing to evaluate when all four candidates are outside.
      if (LIKELY(dsad_horizontal_max_u32(v_inside) == 0)) {
        continue;
      }

      // 0x7fffffff for outside lanes, 0 for inside lanes. The sign bit is
      // kept clear; the value is or'ed into the score further down.
      v_outside = vshrq_n_u32(vmvnq_u32(v_inside), 1);

      // The cost tables are even, so only the magnitude of the residual MV is
      // needed. This keeps the table indexes non-negative and touches only
      // half of each table.
      v_abs_diff = vabsq_s16(vsubq_s16(v_cand_clamped, v_cost_center));

      // Candidate block addresses. The offset of an outside candidate is
      // masked to zero, so its address is the current best address.
      {
#if VPX_ARCH_AARCH64  // sizeof(intptr_t) == 8
        const int32x4_t v_inside_s32 = vreinterpretq_s32_u32(v_inside);
        int64x2_t v_offset_lo = vld1q_s64((const int64_t *)&ss_os[site + 0]);
        int64x2_t v_offset_hi = vld1q_s64((const int64_t *)&ss_os[site + 2]);
        // Sign-extend the 32-bit lane masks to 64 bits before masking.
        v_offset_lo =
            vandq_s64(v_offset_lo, vmovl_s32(vget_low_s32(v_inside_s32)));
        v_offset_hi =
            vandq_s64(v_offset_hi, vmovl_s32(vget_high_s32(v_inside_s32)));
        v_cand_addr[0] = vaddq_s64(v_best_addr, v_offset_lo);
        v_cand_addr[1] = vaddq_s64(v_best_addr, v_offset_hi);
#else  // sizeof(intptr_t) == 4
        int32x4_t v_offset = vld1q_s32((const int32_t *)&ss_os[site]);
        v_offset = vandq_s32(v_offset, vreinterpretq_s32_u32(v_inside));
        v_cand_addr[0] = vaddq_s32(v_best_addr, v_offset);
#endif
      }

      // Four SADs in one call; the results are written straight into v_sad.
      sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_cand_addr[0],
                         in_what_stride, (uint32_t *)&v_sad);

      // Component cost of each residual MV: row cost plus column cost, both
      // read from table 0.
      {
        uint32_t cost[4];
        int lane;
        DECLARE_ALIGNED(16, int16_t, rowcol[8]);
        vst1q_s16(rowcol, v_abs_diff);

        // Note: This is a use case for gather instruction
        for (lane = 0; lane < 4; ++lane) {
          cost[lane] = x->nmvsadcost[0][rowcol[2 * lane]] +
                       x->nmvsadcost[0][rowcol[2 * lane + 1]];
        }

        v_cost = vld1q_s32((int32_t *)cost);
      }

      // Joint cost: entry 0 when both residual components are zero, entry 1
      // otherwise.
      {
        const uint32x4_t v_is_zero_mv =
            vceqq_s32(vreinterpretq_s32_s16(v_abs_diff), v_zero);
        v_cost = vaddq_s32(
            v_cost,
            vbslq_s32(v_is_zero_mv, v_joint_cost_zero, v_joint_cost_nonzero));
      }

      // ROUND_POWER_OF_TWO(cost * sad_per_bit, VP9_PROB_COST_SHIFT)
      v_cost = vmulq_s32(v_cost, v_sad_per_bit);
      v_cost = vshrq_n_s32(vaddq_s32(v_cost, v_round), VP9_PROB_COST_SHIFT);

      // Total score of each candidate.
      v_sad = vaddq_s32(v_sad, v_cost);

      // Or'ing in the outside mask gives out-of-range candidates a maximal
      // score, so the minimum search below will not pick them.
      v_sad = vorrq_s32(v_sad, vreinterpretq_s32_u32(v_outside));

      local_best_sad = dsad_horizontal_min_u32(vreinterpretq_u32_s32(v_sad));

      // Update the global minimum if the local minimum is smaller
      if (LIKELY(local_best_sad < best_sad)) {
#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
        uint32_t local_best_idx;
        // Lanes holding the minimum keep their lane number, every other lane
        // becomes 0xffffffff; the smallest value is then the first lane that
        // holds the minimum.
        const uint32x4_t v_is_best =
            vceqq_s32(v_sad, vdupq_n_s32(local_best_sad));
        const uint32x4_t v_best_lanes =
            vbslq_u32(v_is_best, v_lane_numbers, v_all_ones);

        local_best_idx = dsad_horizontal_min_u32(v_best_lanes);

        new_bmv = ((const int_mv *)&v_cand_mv)[local_best_idx];
#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
        new_best_address = ((const uint8_t **)v_cand_addr)[local_best_idx];

        best_sad = local_best_sad;
      }
    }

    // Commit the winner of this step; the next step searches around it.
    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
#if VPX_ARCH_AARCH64
    v_best_addr = vdupq_n_s64((intptr_t)best_address);
#else
    v_best_addr = vdupq_n_s32((intptr_t)best_address);
#endif

    // Count the steps after which the best position is still the start.
    if (UNLIKELY(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
297