1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12
13 #include "vp9/encoder/vp9_variance.h"
14 #include "vp9/common/vp9_pragmas.h"
15 #include "vpx_ports/mem.h"
16
/* Fixed-size variance cores implemented in assembly.  Each fills *SSE with
 * the sum of squared differences and *Sum with the raw signed sum of
 * differences for one block of the indicated size; callers in this file do
 * not inspect the return value. */
extern unsigned int vp9_get4x4var_mmx
(
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);

unsigned int vp9_get16x16var_sse2
(
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
unsigned int vp9_get8x8var_sse2
(
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
/* Half-pel variance helpers (assembly).  Judging by their use in the
 * halfpixvar16x16 wrappers below, each accumulates *sum and *sumsquared of
 * the difference between a half-pel-interpolated block and the second
 * block, over Height rows.  NOTE(review): parameter names here swap the
 * usual src/ref roles relative to the callers — confirm against the asm. */
void vp9_half_horiz_vert_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_vert_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_horiz_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_vert_variance8x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);
void vp9_half_vert_variance16x_h_sse2
(
  const unsigned char *ref_ptr,
  int ref_pixels_per_line,
  const unsigned char *src_ptr,
  int src_pixels_per_line,
  unsigned int Height,
  int *sum,
  unsigned int *sumsquared
);

/* Function-pointer type matching the fixed-size variance cores above,
 * used by variance_sse2() to tile larger blocks. */
typedef unsigned int (*get_var_sse2) (
  const unsigned char *src_ptr,
  int source_stride,
  const unsigned char *ref_ptr,
  int recon_stride,
  unsigned int *SSE,
  int *Sum
);
114
/* Accumulate variance statistics over a w x h region by tiling it with
 * block_size x block_size sub-blocks and summing the per-block SSE and
 * signed pixel-difference sums produced by var_fn into *sse and *sum. */
static void variance_sse2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_sse2 var_fn, int block_size) {
  int row, col;

  *sse = 0;
  *sum = 0;

  for (row = 0; row < h; row += block_size) {
    const unsigned char *src_row = src_ptr + source_stride * row;
    const unsigned char *ref_row = ref_ptr + recon_stride * row;
    for (col = 0; col < w; col += block_size) {
      unsigned int block_sse;
      int block_sum;
      var_fn(src_row + col, source_stride, ref_row + col, recon_stride,
             &block_sse, &block_sum);
      *sse += block_sse;
      *sum += block_sum;
    }
  }
}
135
vp9_variance4x4_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)136 unsigned int vp9_variance4x4_sse2(
137 const unsigned char *src_ptr,
138 int source_stride,
139 const unsigned char *ref_ptr,
140 int recon_stride,
141 unsigned int *sse) {
142 unsigned int var;
143 int avg;
144
145 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
146 &var, &avg, vp9_get4x4var_mmx, 4);
147 *sse = var;
148 return (var - (((unsigned int)avg * avg) >> 4));
149 }
150
vp9_variance8x4_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)151 unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
152 int source_stride,
153 const uint8_t *ref_ptr,
154 int recon_stride,
155 unsigned int *sse) {
156 unsigned int var;
157 int avg;
158
159 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
160 &var, &avg, vp9_get4x4var_mmx, 4);
161 *sse = var;
162 return (var - (((unsigned int)avg * avg) >> 5));
163 }
164
vp9_variance4x8_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)165 unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
166 int source_stride,
167 const uint8_t *ref_ptr,
168 int recon_stride,
169 unsigned int *sse) {
170 unsigned int var;
171 int avg;
172
173 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
174 &var, &avg, vp9_get4x4var_mmx, 4);
175 *sse = var;
176 return (var - (((unsigned int)avg * avg) >> 5));
177 }
178
vp9_variance8x8_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)179 unsigned int vp9_variance8x8_sse2
180 (
181 const unsigned char *src_ptr,
182 int source_stride,
183 const unsigned char *ref_ptr,
184 int recon_stride,
185 unsigned int *sse) {
186 unsigned int var;
187 int avg;
188
189 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
190 &var, &avg, vp9_get8x8var_sse2, 8);
191 *sse = var;
192 return (var - (((unsigned int)avg * avg) >> 6));
193 }
194
vp9_variance16x8_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)195 unsigned int vp9_variance16x8_sse2
196 (
197 const unsigned char *src_ptr,
198 int source_stride,
199 const unsigned char *ref_ptr,
200 int recon_stride,
201 unsigned int *sse) {
202 unsigned int var;
203 int avg;
204
205 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
206 &var, &avg, vp9_get8x8var_sse2, 8);
207 *sse = var;
208 return (var - (((unsigned int)avg * avg) >> 7));
209 }
210
vp9_variance8x16_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)211 unsigned int vp9_variance8x16_sse2
212 (
213 const unsigned char *src_ptr,
214 int source_stride,
215 const unsigned char *ref_ptr,
216 int recon_stride,
217 unsigned int *sse) {
218 unsigned int var;
219 int avg;
220
221 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
222 &var, &avg, vp9_get8x8var_sse2, 8);
223 *sse = var;
224 return (var - (((unsigned int)avg * avg) >> 7));
225 }
226
vp9_variance16x16_sse2(const unsigned char * src_ptr,int source_stride,const unsigned char * ref_ptr,int recon_stride,unsigned int * sse)227 unsigned int vp9_variance16x16_sse2
228 (
229 const unsigned char *src_ptr,
230 int source_stride,
231 const unsigned char *ref_ptr,
232 int recon_stride,
233 unsigned int *sse) {
234 unsigned int var;
235 int avg;
236
237 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
238 &var, &avg, vp9_get16x16var_sse2, 16);
239 *sse = var;
240 return (var - (((unsigned int)avg * avg) >> 8));
241 }
242
/* 16x16 mean squared error: the raw SSE with no mean correction; the
 * core's sum output is computed but deliberately unused. */
unsigned int vp9_mse16x16_sse2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int block_sum;

  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &block_sum);
  *sse = block_sse;
  return block_sse;
}
256
vp9_variance32x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)257 unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
258 int source_stride,
259 const uint8_t *ref_ptr,
260 int recon_stride,
261 unsigned int *sse) {
262 unsigned int var;
263 int avg;
264
265 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
266 &var, &avg, vp9_get16x16var_sse2, 16);
267 *sse = var;
268 return (var - (((int64_t)avg * avg) >> 10));
269 }
270
vp9_variance32x16_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)271 unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
272 int source_stride,
273 const uint8_t *ref_ptr,
274 int recon_stride,
275 unsigned int *sse) {
276 unsigned int var;
277 int avg;
278
279 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
280 &var, &avg, vp9_get16x16var_sse2, 16);
281 *sse = var;
282 return (var - (((int64_t)avg * avg) >> 9));
283 }
284
vp9_variance16x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)285 unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
286 int source_stride,
287 const uint8_t *ref_ptr,
288 int recon_stride,
289 unsigned int *sse) {
290 unsigned int var;
291 int avg;
292
293 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
294 &var, &avg, vp9_get16x16var_sse2, 16);
295 *sse = var;
296 return (var - (((int64_t)avg * avg) >> 9));
297 }
298
vp9_variance64x64_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)299 unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
300 int source_stride,
301 const uint8_t *ref_ptr,
302 int recon_stride,
303 unsigned int *sse) {
304 unsigned int var;
305 int avg;
306
307 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
308 &var, &avg, vp9_get16x16var_sse2, 16);
309 *sse = var;
310 return (var - (((int64_t)avg * avg) >> 12));
311 }
312
vp9_variance64x32_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)313 unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
314 int source_stride,
315 const uint8_t *ref_ptr,
316 int recon_stride,
317 unsigned int *sse) {
318 unsigned int var;
319 int avg;
320
321 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
322 &var, &avg, vp9_get16x16var_sse2, 16);
323 *sse = var;
324 return (var - (((int64_t)avg * avg) >> 11));
325 }
326
vp9_variance32x64_sse2(const uint8_t * src_ptr,int source_stride,const uint8_t * ref_ptr,int recon_stride,unsigned int * sse)327 unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
328 int source_stride,
329 const uint8_t *ref_ptr,
330 int recon_stride,
331 unsigned int *sse) {
332 unsigned int var;
333 int avg;
334
335 variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
336 &var, &avg, vp9_get16x16var_sse2, 16);
337 *sse = var;
338 return (var - (((int64_t)avg * avg) >> 11));
339 }
340
/* Declare the width-specialized sub-pixel variance column kernels
 * (implemented in assembly).  Each processes a wf-pixel-wide strip of
 * `height` rows.  The 4-wide kernel uses the opt2 (lower) instruction
 * set; the 8- and 16-wide kernels use opt1. */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
357
/* Build a full w x h sub-pixel variance function from the wf-wide column
 * kernel: the kernel runs once per 16-pixel column strip (offsets 0, 16,
 * 32, 48 as needed), per-strip sums and SSEs are accumulated, and the
 * variance is SSE - sum^2 / (w*h), the division done as a shift by
 * wlog2 + hlog2.  `cast` widens the sum^2 product to int64_t for blocks
 * of 32x32 pixels and larger, where a 32-bit product could overflow. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

/* Instantiate every VP9 block size for one SIMD flavor; opt2 is only
 * used for the 4-wide sizes (see DECLS above). */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
417
/* Declare the width-specialized sub-pixel *average* variance column
 * kernels (assembly): like the plain kernels above, but the filtered
 * prediction is averaged with a second predictor `sec` before the
 * difference is taken. */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
436
/* Build a full w x h sub-pixel avg-variance function from the wf-wide
 * column kernel; same strip decomposition and variance formula as the
 * non-avg FN above.  Note the second predictor `sec` advances by the
 * same 16/32/48 column offsets and its stride is the full block width w. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

/* Instantiate every VP9 block size for one SIMD flavor (avg variant). */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \
FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \
FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \
FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \
FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \
FN(4, 4, 4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
497
/* 16x16 variance at the horizontal half-pel position: the asm helper
 * accumulates sum and sum-of-squares of the half-pel-filtered difference
 * over 16 rows; the variance is then SSE - sum^2 / 256. */
unsigned int vp9_variance_halfpixvar16x16_h_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int diff_sum;
  unsigned int diff_sq_sum;

  vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pixels_per_line, 16,
                                    &diff_sum, &diff_sq_sum);

  *sse = diff_sq_sum;
  return diff_sq_sum - (((unsigned int)diff_sum * diff_sum) >> 8);
}
515
516
/* 16x16 variance at the vertical half-pel position; see the horizontal
 * variant above for the formula. */
unsigned int vp9_variance_halfpixvar16x16_v_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int diff_sum;
  unsigned int diff_sq_sum;

  vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                   dst_ptr, dst_pixels_per_line, 16,
                                   &diff_sum, &diff_sq_sum);

  *sse = diff_sq_sum;
  return diff_sq_sum - (((unsigned int)diff_sum * diff_sum) >> 8);
}
533
534
/* 16x16 variance at the diagonal (horizontal + vertical) half-pel
 * position; see the horizontal variant above for the formula. */
unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse) {
  int diff_sum;
  unsigned int diff_sq_sum;

  vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 16,
                                         &diff_sum, &diff_sq_sum);

  *sse = diff_sq_sum;
  return diff_sq_sum - (((unsigned int)diff_sum * diff_sum) >> 8);
}
552