1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include "vp8/encoder/variance.h"
13 #include "vp8/common/pragmas.h"
14 #include "vpx_ports/mem.h"
15
/*
 * Filter routines defined outside this translation unit.  Nothing in the
 * visible part of this file calls them.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);
extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);
20
/*
 * Helpers for the 4x4 block size, defined outside this file.
 *
 * vp8_filter_block2d_bil4x4_var_mmx() is handed two rows of the bilinear
 * filter table (HFilter, VFilter) by vp8_sub_pixel_variance4x4_wmt() and
 * reports its results through sum and sumsquared.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

/*
 * Reports its results through SSE and Sum.  The callers in this file ignore
 * the return value.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
42
/*
 * Whole-pixel SSE2 helpers, defined outside this file.
 *
 * The ...var_sse2() routines report their results through the SSE and Sum
 * output pointers; the wrappers in this file ignore their return values.
 */
unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

unsigned int vp8_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);

unsigned int vp8_get16x16pred_error_sse2(const unsigned char *src_ptr,
                                         int src_stride,
                                         const unsigned char *ref_ptr,
                                         int ref_stride);

unsigned int vp8_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);
/*
 * General sub-pixel helper, defined outside this file.  The wrappers below
 * fall back to it whenever (xoffset, yoffset) is not one of the three
 * half-pixel combinations they special-case.  Results are written through
 * sum and sumsquared.
 */
void vp8_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int xoffset,
                                     int yoffset,
                                     int *sum,
                                     unsigned int *sumsquared);
/*
 * Half-pixel helpers, defined outside this file.  All six share one
 * signature: two pointer/stride pairs, a row count (Height) and two output
 * pointers (sum, sumsquared).
 *
 * In this file they are selected as follows:
 *   xoffset == 4, yoffset == 4  ->  half_horiz_vert
 *   xoffset == 4, yoffset == 0  ->  half_horiz
 *   xoffset == 0, yoffset == 4  ->  half_vert
 * with the 8x / 16x variant chosen by the block width of the caller.
 */
void vp8_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);

void vp8_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);

void vp8_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);

void vp8_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);

void vp8_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);

void vp8_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
144
145 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
146
/*
 * 4x4 block variance.
 *
 * Takes the two values vp8_get4x4var_mmx() writes through its SSE and Sum
 * outputs and returns  sse - (sum * sum) / 16.
 */
unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride)
{
    unsigned int sse_total;
    int diff_sum;
    int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &sse_total, &diff_sum);

    /* >> 4 divides the squared sum by 16 */
    mean_term = (diff_sum * diff_sum) >> 4;

    return sse_total - mean_term;
}
160
161
162
/*
 * 8x8 block variance.
 *
 * Takes the two values vp8_get8x8var_sse2() writes through its SSE and Sum
 * outputs and returns  sse - (sum * sum) / 64.
 */
unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride)
{
    unsigned int sse_total;
    int diff_sum;
    int mean_term;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_total, &diff_sum);

    /* >> 6 divides the squared sum by 64 */
    mean_term = (diff_sum * diff_sum) >> 6;

    return sse_total - mean_term;
}
178
179
/*
 * 16x16 block variance.
 *
 * vp8_get16x16var_sse2() supplies a squared-error total and a difference
 * sum through its SSE and Sum outputs.  The squared-error total is stored in
 * *sse, and the function returns  sse - (sum * sum) / 256.
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming the helper adds up
     * 256 differences of 8-bit pixels, |sum0| can reach 65280, and
     * 65280 * 65280 = 4261478400 is larger than a 32-bit INT_MAX, so the
     * signed product "sum0 * sum0" would overflow (undefined behaviour).
     * The value still fits in a 32-bit unsigned int, and unsigned
     * multiplication of a negated operand wraps to the same square.
     */
    return sse0 - (((unsigned int)sum0 * (unsigned int)sum0) >> 8);
}
/*
 * 16x16 squared error.
 *
 * Stores the value vp8_get16x16var_sse2() writes through its SSE output in
 * *sse and also returns it.  The Sum output is requested only because the
 * helper's signature requires it.
 */
unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse)
{
    unsigned int total_sse;
    int ignored_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride,
                         ref_ptr, recon_stride,
                         &total_sse, &ignored_sum);

    *sse = total_sse;
    return total_sse;
}
211
212
/*
 * 16x8 block variance, built from two vp8_get8x8var_sse2() calls: one at the
 * given pointers and one with both pointers advanced by 8 bytes.
 *
 * The two squared-error totals are added and stored in *sse; the two sums
 * are added and the function returns  sse - (sum * sum) / 128.
 */
unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int sse_left, sse_right, sse_total;
    int sum_left, sum_right, sum_total;

    /* left half */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_left, &sum_left);

    /* right half: same rows, 8 bytes further along */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sse_right, &sum_right);

    sse_total = sse_left + sse_right;
    sum_total = sum_left + sum_right;

    *sse = sse_total;
    return sse_total - ((sum_total * sum_total) >> 7);
}
233
/*
 * 8x16 block variance, built from two vp8_get8x8var_sse2() calls: one at the
 * given pointers and one with each pointer advanced by 8 of its own strides.
 *
 * The two squared-error totals are added and stored in *sse; the two sums
 * are added and the function returns  sse - (sum * sum) / 128.
 */
unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int sse_upper, sse_lower, sse_total;
    int sum_upper, sum_lower, sum_total;

    /* upper half */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_upper, &sum_upper);

    /* lower half: 8 rows down */
    vp8_get8x8var_sse2(src_lower, source_stride,
                       ref_lower, recon_stride,
                       &sse_lower, &sum_lower);

    sse_total = sse_upper + sse_lower;
    sum_total = sum_upper + sum_lower;

    *sse = sse_total;
    return sse_total - ((sum_total * sum_total) >> 7);
}
254
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)255 unsigned int vp8_sub_pixel_variance4x4_wmt
256 (
257 const unsigned char *src_ptr,
258 int src_pixels_per_line,
259 int xoffset,
260 int yoffset,
261 const unsigned char *dst_ptr,
262 int dst_pixels_per_line,
263 unsigned int *sse
264 )
265 {
266 int xsum;
267 unsigned int xxsum;
268 vp8_filter_block2d_bil4x4_var_mmx(
269 src_ptr, src_pixels_per_line,
270 dst_ptr, dst_pixels_per_line,
271 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
272 &xsum, &xxsum
273 );
274 *sse = xxsum;
275 return (xxsum - ((xsum * xsum) >> 4));
276 }
277
278
/*
 * 8x8 sub-pixel variance.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2
 * each called with a height of 8.  The helper's sum-of-squares output is
 * stored in *sse and the function returns  sumsquared - (sum * sum) / 64.
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           int xoffset,
                                           int yoffset,
                                           const unsigned char *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse)
{
    int diff_sum;
    unsigned int sq_sum;

    if (xoffset == 4 && (yoffset == 0 || yoffset == 4))
    {
        if (yoffset == 0)
        {
            vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                             dst_ptr, dst_pixels_per_line,
                                             8, &diff_sum, &sq_sum);
        }
        else
        {
            vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                                  dst_ptr, dst_pixels_per_line,
                                                  8, &diff_sum, &sq_sum);
        }
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &diff_sum, &sq_sum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 6);
}
326
/*
 * 16x16 sub-pixel variance.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called twice (the second
 *             time with both pointers advanced by 8 bytes) and the two
 *             results added together
 * each called with a height of 16.  The combined sum-of-squares is stored
 * in *sse and the function returns  sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    /*
     * Note: these tests could be avoided if the calling function invoked
     * the appropriate specialised routine directly.
     */
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        /* first call at the given pointers ... */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0);

        /* ... second call 8 bytes further along each row */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum1, &xxsum1);

        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming xsum0 accumulates
     * 256 differences of 8-bit values, |xsum0| can reach 65280, and
     * 65280 * 65280 = 4261478400 is larger than a 32-bit INT_MAX, so the
     * signed product "xsum0 * xsum0" would overflow (undefined behaviour).
     * The value still fits in a 32-bit unsigned int.
     */
    return xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8);
}
387
/*
 * 16x16 sub-pixel squared error.
 *
 * Runs vp8_sub_pixel_variance16x16_wmt() for its side effect of filling in
 * *sse, discards the variance it returns, and returns *sse instead.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
401
/*
 * 16x8 sub-pixel variance.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called twice (the second
 *             time with both pointers advanced by 8 bytes) and the two
 *             results added together
 * each called with a height of 8.  The combined sum-of-squares is stored in
 * *sse and the function returns  sumsquared - (sum * sum) / 128.
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_sum, &sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &diff_sum, &sq_sum);
    }
    else if (y_is_half && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &diff_sum, &sq_sum);
    }
    else
    {
        int diff_sum_hi;
        unsigned int sq_sum_hi;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum, &sq_sum);

        /* same rows, 8 bytes further along */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &diff_sum_hi, &sq_sum_hi);

        diff_sum += diff_sum_hi;
        sq_sum += sq_sum_hi;
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 7);
}
458
/*
 * 8x16 sub-pixel variance.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2
 * each called with a height of 16.  The helper's sum-of-squares output is
 * stored in *sse and the function returns  sumsquared - (sum * sum) / 128.
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse)
{
    /* non-zero only for the three offset pairs that have a dedicated helper */
    const int has_half_pel_helper = (xoffset == 4 && yoffset == 0) ||
                                    (xoffset == 0 && yoffset == 4) ||
                                    (xoffset == 4 && yoffset == 4);
    int diff_sum;
    unsigned int sq_sum;

    if (!has_half_pel_helper)
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }
    else if (yoffset == 0)
    {
        /* (4, 0) */
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &diff_sum, &sq_sum);
    }
    else if (xoffset == 0)
    {
        /* (0, 4) */
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &diff_sum, &sq_sum);
    }
    else
    {
        /* (4, 4) */
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_sum, &sq_sum);
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 7);
}
506
507
/*
 * 16x16 variance through vp8_half_horiz_variance16x_h_sse2() with a height
 * of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns  sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming xsum0 accumulates
     * 256 differences of 8-bit values, |xsum0| can reach 65280, and
     * 65280 * 65280 = 4261478400 is larger than a 32-bit INT_MAX, so the
     * signed product "xsum0 * xsum0" would overflow (undefined behaviour).
     * The value still fits in a 32-bit unsigned int.
     */
    return xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8);
}
526
527
/*
 * 16x16 variance through vp8_half_vert_variance16x_h_sse2() with a height
 * of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns  sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming xsum0 accumulates
     * 256 differences of 8-bit values, |xsum0| can reach 65280, and
     * 65280 * 65280 = 4261478400 is larger than a 32-bit INT_MAX, so the
     * signed product "xsum0 * xsum0" would overflow (undefined behaviour).
     * The value still fits in a 32-bit unsigned int.
     */
    return xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8);
}
545
546
/*
 * 16x16 variance through vp8_half_horiz_vert_variance16x_h_sse2() with a
 * height of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns  sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming xsum0 accumulates
     * 256 differences of 8-bit values, |xsum0| can reach 65280, and
     * 65280 * 65280 = 4261478400 is larger than a 32-bit INT_MAX, so the
     * signed product "xsum0 * xsum0" would overflow (undefined behaviour).
     * The value still fits in a 32-bit unsigned int.
     */
    return xxsum0 - (((unsigned int)xsum0 * (unsigned int)xsum0) >> 8);
}
565