/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <math.h>

#include <arm_neon.h>

#include "av1/encoder/rdopt.h"
#include "config/av1_rtcd.h"

// Process horizontal and vertical correlations in a 4x4 block of pixels.
// We actually use the 4x4 pixels to calculate correlations corresponding to
// the top-left 3x3 pixels, so this function must be called with 1x1 overlap,
// moving the window along/down by 3 pixels at a time.
static INLINE void horver_correlation_4x4(const int16_t *diff, int stride,
                                          int32x4_t *xy_sum_32,
                                          int32x4_t *xz_sum_32,
                                          int32x4_t *x_sum_32,
                                          int32x4_t *x2_sum_32) {
  // Pixels in this 4x4   [ a b c d ]
  // are referred to as:  [ e f g h ]
  //                      [ i j k l ]
  //                      [ m n o p ]

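  // Each row is loaded, then shifted left by 16 bits within a 64-bit
  // reinterpretation. On a little-endian target this moves every 16-bit
  // element up one lane and zeroes lane 0, so [ a b c d ] becomes
  // [ 0 a b c ] in the "_sli" copy, dropping the fourth column.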
  const int16x4_t pixelsa_2_lo = vld1_s16(diff + (0 * stride));
  const int16x4_t pixelsa_2_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_2_lo), 16));
  const int16x4_t pixelsb_2_lo = vld1_s16(diff + (1 * stride));
  const int16x4_t pixelsb_2_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_2_lo), 16));
  const int16x4_t pixelsa_1_lo = vld1_s16(diff + (2 * stride));
  const int16x4_t pixelsa_1_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsa_1_lo), 16));
  const int16x4_t pixelsb_1_lo = vld1_s16(diff + (3 * stride));
  const int16x4_t pixelsb_1_sli =
      vreinterpret_s16_s64(vshl_n_s64(vreinterpret_s64_s16(pixelsb_1_lo), 16));

  const int16x8_t slli_a = vcombine_s16(pixelsa_1_sli, pixelsa_2_sli);

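  // Horizontal correlations: multiplying a row by its shifted copy pairs
  // each pixel with its right neighbour, e.g. [ a b c d ] * [ 0 a b c ]
  // sums to ab + bc + cd. Only the top three rows are needed.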
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_1_lo, pixelsa_1_sli);
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsa_2_lo, pixelsa_2_sli);
  *xy_sum_32 = vmlal_s16(*xy_sum_32, pixelsb_2_lo, pixelsb_2_sli);

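  // Vertical correlations: multiplying the shifted copies of two adjacent
  // rows pairs each pixel with the one below it, for the first three columns
  // only. The three row pairs (0,1), (1,2) and (2,3) are all covered.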
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_1_sli);
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_2_sli, pixelsb_2_sli);
  *xz_sum_32 = vmlal_s16(*xz_sum_32, pixelsa_1_sli, pixelsb_2_sli);

  // Now calculate the straight sums, x_sum += a+b+c+e+f+g+i+j+k
  // (sum up every element in slli_a and pixelsb_2_sli)
  *x_sum_32 = vpadalq_s16(*x_sum_32, slli_a);
  *x_sum_32 = vaddw_s16(*x_sum_32, pixelsb_2_sli);

  // Also sum their squares
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_1_sli, pixelsa_1_sli);
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsa_2_sli, pixelsa_2_sli);
  *x2_sum_32 = vmlal_s16(*x2_sum_32, pixelsb_2_sli, pixelsb_2_sli);
}

void av1_get_horver_correlation_full_neon(const int16_t *diff, int stride,
                                          int width, int height, float *hcorr,
                                          float *vcorr) {
  // The following notation is used:
  // x - current pixel
  // y - right neighbour pixel
  // z - below neighbour pixel
  // w - down-right neighbour pixel
  int64_t xy_sum = 0, xz_sum = 0;
  int64_t x_sum = 0, x2_sum = 0;
  int32x4_t zero = vdupq_n_s32(0);
  int64x2_t v_x_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_xy_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_xz_sum = vreinterpretq_s64_s32(zero);
  int64x2_t v_x2_sum = vreinterpretq_s64_s32(zero);
  // Process horizontal and vertical correlations through the body in 4x4
  // blocks. This excludes the final row and column and possibly one extra
  // column depending on how 3 divides into width and height.

  for (int i = 0; i <= height - 4; i += 3) {
    int32x4_t xy_sum_32 = zero;
    int32x4_t xz_sum_32 = zero;
    int32x4_t x_sum_32 = zero;
    int32x4_t x2_sum_32 = zero;
    for (int j = 0; j <= width - 4; j += 3) {
      horver_correlation_4x4(&diff[i * stride + j], stride, &xy_sum_32,
                             &xz_sum_32, &x_sum_32, &x2_sum_32);
    }
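    // Widen each strip's 32-bit partial sums into the 64-bit running
    // accumulators once per strip of rows, before the 32-bit lanes get a
    // chance to overflow.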
    v_xy_sum = vpadalq_s32(v_xy_sum, xy_sum_32);
    v_xz_sum = vpadalq_s32(v_xz_sum, xz_sum_32);
    v_x_sum = vpadalq_s32(v_x_sum, x_sum_32);
    v_x2_sum = vpadalq_s32(v_x2_sum, x2_sum_32);
  }
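  // Reduce the 64x2 vector accumulators to scalars. AArch64 has vaddvq_s64
  // for this; on 32-bit Arm the two halves are added manually.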
#if defined(__aarch64__)
  xy_sum = vaddvq_s64(v_xy_sum);
  xz_sum = vaddvq_s64(v_xz_sum);
  x2_sum = vaddvq_s64(v_x2_sum);
  x_sum = vaddvq_s64(v_x_sum);
#else
  xy_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_xy_sum), vget_high_s64(v_xy_sum)), 0);
  xz_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_xz_sum), vget_high_s64(v_xz_sum)), 0);
  x2_sum = vget_lane_s64(
      vadd_s64(vget_low_s64(v_x2_sum), vget_high_s64(v_x2_sum)), 0);
  x_sum =
      vget_lane_s64(vadd_s64(vget_low_s64(v_x_sum), vget_high_s64(v_x_sum)), 0);
#endif
  // x_sum now covers every pixel except the final 1-2 rows and 1-2 cols
  int64_t x_finalrow = 0, x_finalcol = 0, x2_finalrow = 0, x2_finalcol = 0;

  // Do we have 2 rows remaining or just the one? Note that width and height
  // are powers of 2, so each modulo 3 must be 1 or 2.
  if (height % 3 == 1) {  // Just horiz corrs on the final row
    const int16_t x0 = diff[(height - 1) * stride];
    x_sum += x0;
    x_finalrow += x0;
    x2_sum += x0 * x0;
    x2_finalrow += x0 * x0;
    if (width >= 8) {
      int32x4_t v_y_sum = zero;
      int32x4_t v_y2_sum = zero;
      int32x4_t v_xy_sum_a = zero;
      int k = width - 1;
      int j = 0;
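      // There are width - 1 (x, y) pairs in the final row. Consume eight at
      // a time; width is a power of 2, so seven pairs are always left over
      // for the masked tail pass below.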
      while ((k - 8) > 0) {
        const int16x8_t v_x = vld1q_s16(&diff[(height - 1) * stride + j]);
        const int16x8_t v_y = vld1q_s16(&diff[(height - 1) * stride + j + 1]);
        const int16x4_t v_x_lo = vget_low_s16(v_x);
        const int16x4_t v_x_hi = vget_high_s16(v_x);
        const int16x4_t v_y_lo = vget_low_s16(v_y);
        const int16x4_t v_y_hi = vget_high_s16(v_y);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
        v_y_sum = vpadalq_s16(v_y_sum, v_y);
        k -= 8;
        j += 8;
      }

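      // Masked tail: load the final eight pixels of the row once and use
      // vextq_s16 to form x = [ l0 .. l6 0 ] and y = [ l1 .. l7 0 ], so the
      // lane past the row end contributes nothing to any sum.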
      const int16x8_t v_l = vld1q_s16(&diff[(height - 1) * stride + j]);
      const int16x8_t v_x =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
      const int16x4_t v_x_lo = vget_low_s16(v_x);
      const int16x4_t v_x_hi = vget_high_s16(v_x);
      const int16x4_t v_y_lo = vget_low_s16(v_y);
      const int16x4_t v_y_hi = vget_high_s16(v_y);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);
      const int32x4_t v_y_sum_a = vpadalq_s16(v_y_sum, v_y);
      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
#if defined(__aarch64__)
      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
      xy_sum += vaddvq_s64(v_xy_sum2);
      const int32_t y = vaddvq_s32(v_y_sum_a);
      const int64_t y2 = vaddvq_s64(v_y2_sum_a);
#else
      xy_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
      const int64x2_t v_y_a = vpaddlq_s32(v_y_sum_a);
      const int64_t y =
          vget_lane_s64(vadd_s64(vget_low_s64(v_y_a), vget_high_s64(v_y_a)), 0);
      const int64x2_t v_y2_sum_b = vpaddlq_s32(v_y2_sum);
      int64_t y2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_y2_sum_b), vget_high_s64(v_y2_sum_b)), 0);
#endif
      x_sum += y;
      x2_sum += y2;
      x_finalrow += y;
      x2_finalrow += y2;
    } else {
      for (int j = 0; j < width - 1; ++j) {
        const int16_t x = diff[(height - 1) * stride + j];
        const int16_t y = diff[(height - 1) * stride + j + 1];
        xy_sum += x * y;
        x_sum += y;
        x2_sum += y * y;
        x_finalrow += y;
        x2_finalrow += y * y;
      }
    }
  } else {  // Two rows remaining to do
    const int16_t x0 = diff[(height - 2) * stride];
    const int16_t z0 = diff[(height - 1) * stride];
    x_sum += x0 + z0;
    x2_sum += x0 * x0 + z0 * z0;
    x_finalrow += z0;
    x2_finalrow += z0 * z0;
    if (width >= 8) {
      int32x4_t v_y2_sum = zero;
      int32x4_t v_w2_sum = zero;
      int32x4_t v_xy_sum_a = zero;
      int32x4_t v_xz_sum_a = zero;
      int32x4_t v_x_sum_a = zero;
      int32x4_t v_w_sum = zero;
      int k = width - 1;
      int j = 0;
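      // Walk both remaining rows eight pixels at a time, accumulating
      // horizontal products within each row and vertical products between
      // them.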
      while ((k - 8) > 0) {
        const int16x8_t v_x = vld1q_s16(&diff[(height - 2) * stride + j]);
        const int16x8_t v_y = vld1q_s16(&diff[(height - 2) * stride + j + 1]);
        const int16x8_t v_z = vld1q_s16(&diff[(height - 1) * stride + j]);
        const int16x8_t v_w = vld1q_s16(&diff[(height - 1) * stride + j + 1]);

        const int16x4_t v_x_lo = vget_low_s16(v_x);
        const int16x4_t v_y_lo = vget_low_s16(v_y);
        const int16x4_t v_z_lo = vget_low_s16(v_z);
        const int16x4_t v_w_lo = vget_low_s16(v_w);
        const int16x4_t v_x_hi = vget_high_s16(v_x);
        const int16x4_t v_y_hi = vget_high_s16(v_y);
        const int16x4_t v_z_hi = vget_high_s16(v_z);
        const int16x4_t v_w_hi = vget_high_s16(v_w);

        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
        v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);

        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
        v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);

        v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
        v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
        v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);

        v_w_sum = vpadalq_s16(v_w_sum, v_w);
        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
        v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);

        k -= 8;
        j += 8;
      }
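      // Masked tail for both rows, as in the single-row case above:
      // vextq_s16 zeroes the lane past the row end so it contributes nothing.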
      const int16x8_t v_l = vld1q_s16(&diff[(height - 2) * stride + j]);
      const int16x8_t v_x =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_y = vextq_s16(v_l, vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_l_2 = vld1q_s16(&diff[(height - 1) * stride + j]);
      const int16x8_t v_z =
          vextq_s16(vextq_s16(vreinterpretq_s16_s32(zero), v_l_2, 7),
                    vreinterpretq_s16_s32(zero), 1);
      const int16x8_t v_w = vextq_s16(v_l_2, vreinterpretq_s16_s32(zero), 1);

      const int16x4_t v_x_lo = vget_low_s16(v_x);
      const int16x4_t v_y_lo = vget_low_s16(v_y);
      const int16x4_t v_z_lo = vget_low_s16(v_z);
      const int16x4_t v_w_lo = vget_low_s16(v_w);
      const int16x4_t v_x_hi = vget_high_s16(v_x);
      const int16x4_t v_y_hi = vget_high_s16(v_y);
      const int16x4_t v_z_hi = vget_high_s16(v_z);
      const int16x4_t v_w_hi = vget_high_s16(v_w);

      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_lo, v_y_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_x_hi, v_y_hi);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_lo, v_w_lo);
      v_xy_sum_a = vmlal_s16(v_xy_sum_a, v_z_hi, v_w_hi);

      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_lo, v_z_lo);
      v_xz_sum_a = vmlal_s16(v_xz_sum_a, v_x_hi, v_z_hi);

      v_w2_sum = vmlal_s16(v_w2_sum, v_w_lo, v_w_lo);
      v_w2_sum = vmlal_s16(v_w2_sum, v_w_hi, v_w_hi);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_lo, v_y_lo);
      v_y2_sum = vmlal_s16(v_y2_sum, v_y_hi, v_y_hi);

      v_w_sum = vpadalq_s16(v_w_sum, v_w);
      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_y);
      v_x_sum_a = vpadalq_s16(v_x_sum_a, v_w);

#if defined(__aarch64__)
      xy_sum += vaddvq_s64(vpaddlq_s32(v_xy_sum_a));
      xz_sum += vaddvq_s64(vpaddlq_s32(v_xz_sum_a));
      x_sum += vaddvq_s32(v_x_sum_a);
      x_finalrow += vaddvq_s32(v_w_sum);
      int64_t y2 = vaddvq_s64(vpaddlq_s32(v_y2_sum));
      int64_t w2 = vaddvq_s64(vpaddlq_s32(v_w2_sum));
#else
      const int64x2_t v_xy_sum2 = vpaddlq_s32(v_xy_sum_a);
      xy_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xy_sum2), vget_high_s64(v_xy_sum2)), 0);
      const int64x2_t v_xz_sum2 = vpaddlq_s32(v_xz_sum_a);
      xz_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_xz_sum2), vget_high_s64(v_xz_sum2)), 0);
      const int64x2_t v_x_sum2 = vpaddlq_s32(v_x_sum_a);
      x_sum += vget_lane_s64(
          vadd_s64(vget_low_s64(v_x_sum2), vget_high_s64(v_x_sum2)), 0);
      const int64x2_t v_w_sum_a = vpaddlq_s32(v_w_sum);
      x_finalrow += vget_lane_s64(
          vadd_s64(vget_low_s64(v_w_sum_a), vget_high_s64(v_w_sum_a)), 0);
      const int64x2_t v_y2_sum_a = vpaddlq_s32(v_y2_sum);
      int64_t y2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_y2_sum_a), vget_high_s64(v_y2_sum_a)), 0);
      const int64x2_t v_w2_sum_a = vpaddlq_s32(v_w2_sum);
      int64_t w2 = vget_lane_s64(
          vadd_s64(vget_low_s64(v_w2_sum_a), vget_high_s64(v_w2_sum_a)), 0);
#endif
      x2_sum += y2 + w2;
      x2_finalrow += w2;
    } else {
      for (int j = 0; j < width - 1; ++j) {
        const int16_t x = diff[(height - 2) * stride + j];
        const int16_t y = diff[(height - 2) * stride + j + 1];
        const int16_t z = diff[(height - 1) * stride + j];
        const int16_t w = diff[(height - 1) * stride + j + 1];

        // Horizontal and vertical correlations for the penultimate row:
        xy_sum += x * y;
        xz_sum += x * z;

        // Now just horizontal correlations for the final row:
        xy_sum += z * w;

        x_sum += y + w;
        x2_sum += y * y + w * w;
        x_finalrow += w;
        x2_finalrow += w * w;
      }
    }
  }

  // Do we have 2 columns remaining or just the one?
  if (width % 3 == 1) {  // Just vert corrs on the final col
    const int16_t x0 = diff[width - 1];
    x_sum += x0;
    x_finalcol += x0;
    x2_sum += x0 * x0;
    x2_finalcol += x0 * x0;
    for (int i = 0; i < height - 1; ++i) {
      const int16_t x = diff[i * stride + width - 1];
      const int16_t z = diff[(i + 1) * stride + width - 1];
      xz_sum += x * z;
      x_finalcol += z;
      x2_finalcol += z * z;
      // So the bottom-right elements don't get counted twice:
      if (i < height - (height % 3 == 1 ? 2 : 3)) {
        x_sum += z;
        x2_sum += z * z;
      }
    }
  } else {  // Two cols remaining
    const int16_t x0 = diff[width - 2];
    const int16_t y0 = diff[width - 1];
    x_sum += x0 + y0;
    x2_sum += x0 * x0 + y0 * y0;
    x_finalcol += y0;
    x2_finalcol += y0 * y0;
    for (int i = 0; i < height - 1; ++i) {
      const int16_t x = diff[i * stride + width - 2];
      const int16_t y = diff[i * stride + width - 1];
      const int16_t z = diff[(i + 1) * stride + width - 2];
      const int16_t w = diff[(i + 1) * stride + width - 1];

      // Horizontal and vertical correlations for the penultimate col:
      // Skip these on the last iteration of this loop if we also had two
      // rows remaining, otherwise the final horizontal and vertical
      // correlations get erroneously processed twice.
      if (i < height - 2 || height % 3 == 1) {
        xy_sum += x * y;
        xz_sum += x * z;
      }

      x_finalcol += w;
      x2_finalcol += w * w;
      // So the bottom-right elements don't get counted twice:
      if (i < height - (height % 3 == 1 ? 2 : 3)) {
        x_sum += z + w;
        x2_sum += z * z + w * w;
      }

      // Now just vertical correlations for the final column:
      xz_sum += y * w;
    }
  }

  // Calculate the simple sums and squared-sums
  int64_t x_firstrow = 0, x_firstcol = 0;
  int64_t x2_firstrow = 0, x2_firstcol = 0;

  if (width >= 8) {
    int32x4_t v_x_firstrow = zero;
    int32x4_t v_x2_firstrow = zero;
    for (int j = 0; j < width; j += 8) {
      const int16x8_t v_diff = vld1q_s16(diff + j);
      const int16x4_t v_diff_lo = vget_low_s16(v_diff);
      const int16x4_t v_diff_hi = vget_high_s16(v_diff);
      v_x_firstrow = vpadalq_s16(v_x_firstrow, v_diff);
      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_lo, v_diff_lo);
      v_x2_firstrow = vmlal_s16(v_x2_firstrow, v_diff_hi, v_diff_hi);
    }
#if defined(__aarch64__)
    x_firstrow += vaddvq_s32(v_x_firstrow);
    x2_firstrow += vaddvq_s32(v_x2_firstrow);
#else
    const int64x2_t v_x_firstrow_64 = vpaddlq_s32(v_x_firstrow);
    x_firstrow += vget_lane_s64(
        vadd_s64(vget_low_s64(v_x_firstrow_64), vget_high_s64(v_x_firstrow_64)),
        0);
    const int64x2_t v_x2_firstrow_64 = vpaddlq_s32(v_x2_firstrow);
    x2_firstrow += vget_lane_s64(vadd_s64(vget_low_s64(v_x2_firstrow_64),
                                          vget_high_s64(v_x2_firstrow_64)),
                                 0);
#endif
  } else {
    for (int j = 0; j < width; ++j) {
      x_firstrow += diff[j];
      x2_firstrow += diff[j] * diff[j];
    }
  }
  for (int i = 0; i < height; ++i) {
    x_firstcol += diff[i * stride];
    x2_firstcol += diff[i * stride] * diff[i * stride];
  }

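  // For hcorr every pixel x is paired with its right neighbour y, so x ranges
  // over everything but the final column and y over everything but the first;
  // vcorr is analogous with the final/first row. Build those restricted sums
  // by subtracting the excluded row or column totals from the full sums.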
  int64_t xhor_sum = x_sum - x_finalcol;
  int64_t xver_sum = x_sum - x_finalrow;
  int64_t y_sum = x_sum - x_firstcol;
  int64_t z_sum = x_sum - x_firstrow;
  int64_t x2hor_sum = x2_sum - x2_finalcol;
  int64_t x2ver_sum = x2_sum - x2_finalrow;
  int64_t y2_sum = x2_sum - x2_firstcol;
  int64_t z2_sum = x2_sum - x2_firstrow;

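  // Pearson correlation over n pairs, with every term scaled by n:
  //   corr = (sum(xy) - sum(x) * sum(y) / n) /
  //          sqrt((sum(x2) - sum(x)^2 / n) * (sum(y2) - sum(y)^2 / n))
  // The *_var_n values below are the (co)variances scaled by n.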
  const float num_hor = (float)(height * (width - 1));
  const float num_ver = (float)((height - 1) * width);

  const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
  const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;

  const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
  const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;

  const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
  const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;

  if (xhor_var_n > 0 && y_var_n > 0) {
    *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
    *hcorr = *hcorr < 0 ? 0 : *hcorr;
  } else {
    *hcorr = 1.0;
  }
  if (xver_var_n > 0 && z_var_n > 0) {
    *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
    *vcorr = *vcorr < 0 ? 0 : *vcorr;
  } else {
    *vcorr = 1.0;
  }
}