1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include "variance.h"
13 #include "pragmas.h"
14 #include "vpx_ports/mem.h"
15
/*
 * Six-tap filter primitives that are defined outside this file.
 * NOTE(review): none of the functions visible in this section call them;
 * confirm whether the declarations are still needed here.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);
extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);
20
/*
 * 4x4 helpers defined outside this file.  Both report their results through
 * the trailing out-parameters (a sum and a sum of squares / SSE).
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
42
43 unsigned int vp8_get_mb_ss_sse2
44 (
45 const short *src_ptr
46 );
47 unsigned int vp8_get16x16var_sse2
48 (
49 const unsigned char *src_ptr,
50 int source_stride,
51 const unsigned char *ref_ptr,
52 int recon_stride,
53 unsigned int *SSE,
54 int *Sum
55 );
56 unsigned int vp8_get16x16pred_error_sse2
57 (
58 const unsigned char *src_ptr,
59 int src_stride,
60 const unsigned char *ref_ptr,
61 int ref_stride
62 );
63 unsigned int vp8_get8x8var_sse2
64 (
65 const unsigned char *src_ptr,
66 int source_stride,
67 const unsigned char *ref_ptr,
68 int recon_stride,
69 unsigned int *SSE,
70 int *Sum
71 );
72 void vp8_filter_block2d_bil_var_sse2
73 (
74 const unsigned char *ref_ptr,
75 int ref_pixels_per_line,
76 const unsigned char *src_ptr,
77 int src_pixels_per_line,
78 unsigned int Height,
79 const short *HFilter,
80 const short *VFilter,
81 int *sum,
82 unsigned int *sumsquared
83 );
84 void vp8_half_horiz_vert_variance16x_h_sse2
85 (
86 const unsigned char *ref_ptr,
87 int ref_pixels_per_line,
88 const unsigned char *src_ptr,
89 int src_pixels_per_line,
90 unsigned int Height,
91 int *sum,
92 unsigned int *sumsquared
93 );
94 void vp8_half_horiz_variance16x_h_sse2
95 (
96 const unsigned char *ref_ptr,
97 int ref_pixels_per_line,
98 const unsigned char *src_ptr,
99 int src_pixels_per_line,
100 unsigned int Height,
101 int *sum,
102 unsigned int *sumsquared
103 );
104 void vp8_half_vert_variance16x_h_sse2
105 (
106 const unsigned char *ref_ptr,
107 int ref_pixels_per_line,
108 const unsigned char *src_ptr,
109 int src_pixels_per_line,
110 unsigned int Height,
111 int *sum,
112 unsigned int *sumsquared
113 );
114
115 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
116
/*
 * Variance of a 4x4 block.
 *
 * Obtains the SSE and Sum values from vp8_get4x4var_mmx() and returns
 * SSE - Sum^2 / 16.  Unlike the larger block sizes, the SSE is not handed
 * back to the caller.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride)
{
    unsigned int sq_sum;
    int diff_sum;
    unsigned int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_sum, &diff_sum);

    /* 16 pixels in the block, hence the shift by 4. */
    mean_term = (diff_sum * diff_sum) >> 4;

    return sq_sum - mean_term;
}
130
131
132
/*
 * Variance of an 8x8 block.
 *
 * Obtains the SSE and Sum values from vp8_get8x8var_sse2() and returns
 * SSE - Sum^2 / 64.  The SSE itself is not handed back to the caller.
 */
unsigned int vp8_variance8x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride)
{
    unsigned int sq_sum;
    int diff_sum;
    unsigned int mean_term;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_sum, &diff_sum);

    /* 64 pixels in the block, hence the shift by 6. */
    mean_term = (diff_sum * diff_sum) >> 6;

    return sq_sum - mean_term;
}
148
149
/*
 * Variance of a 16x16 block.
 *
 * Obtains the SSE and Sum values from vp8_get16x16var_sse2(), stores the SSE
 * through *sse and returns SSE - Sum^2 / 256.
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sse0, &sum0);
    *sse = sse0;

    /*
     * Over 256 8-bit pixels |sum0| can reach 255 * 256 = 65280.  Its square
     * exceeds INT_MAX (signed overflow is undefined) but still fits in a
     * 32-bit unsigned int, so the product is formed in unsigned arithmetic.
     */
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
/*
 * Squared error of a 16x16 block.
 *
 * Runs the same helper as the 16x16 variance but returns the raw SSE (also
 * stored through *sse) without subtracting the mean term.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_sum;
    int diff_sum;   /* filled in by the helper; not used for the result */

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_sum, &diff_sum);

    *sse = sq_sum;
    return sq_sum;
}
181
182
/*
 * Variance of a 16-wide, 8-high block, built from two side-by-side 8x8
 * measurements.  Stores the combined SSE through *sse and returns
 * SSE - Sum^2 / 128.
 */
unsigned int vp8_variance16x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_left, sq_right, sq_total;
    int sum_left, sum_right, sum_total;

    /* Left half: columns 0..7. */
    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_left, &sum_left);

    /* Right half: columns 8..15. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
                       &sq_right, &sum_right);

    sq_total  = sq_left + sq_right;
    sum_total = sum_left + sum_right;

    *sse = sq_total;
    return sq_total - ((sum_total * sum_total) >> 7);
}
203
/*
 * Variance of an 8-wide, 16-high block, built from two stacked 8x8
 * measurements.  Stores the combined SSE through *sse and returns
 * SSE - Sum^2 / 128.
 */
unsigned int vp8_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_top, sq_bottom, sq_total;
    int sum_top, sum_bottom, sum_total;

    /* The lower 8x8 starts eight rows below the upper one. */
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_top, &sum_top);
    vp8_get8x8var_sse2(src_lower, source_stride, ref_lower, recon_stride,
                       &sq_bottom, &sum_bottom);

    sq_total  = sq_top + sq_bottom;
    sum_total = sum_top + sum_bottom;

    *sse = sq_total;
    return sq_total - ((sum_total * sum_total) >> 7);
}
224
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// in one pass                                                           //
///////////////////////////////////////////////////////////////////////////
229 DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
230 {
231 { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
232 { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
233 { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
234 { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
235 { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
236 { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
237 { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
238 { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
239 };
vp8_sub_pixel_variance4x4_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)240 unsigned int vp8_sub_pixel_variance4x4_wmt
241 (
242 const unsigned char *src_ptr,
243 int src_pixels_per_line,
244 int xoffset,
245 int yoffset,
246 const unsigned char *dst_ptr,
247 int dst_pixels_per_line,
248 unsigned int *sse
249 )
250 {
251 int xsum;
252 unsigned int xxsum;
253 vp8_filter_block2d_bil4x4_var_mmx(
254 src_ptr, src_pixels_per_line,
255 dst_ptr, dst_pixels_per_line,
256 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
257 &xsum, &xxsum
258 );
259 *sse = xxsum;
260 return (xxsum - ((xsum * xsum) >> 4));
261 }
262
263
vp8_sub_pixel_variance8x8_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)264 unsigned int vp8_sub_pixel_variance8x8_wmt
265 (
266 const unsigned char *src_ptr,
267 int src_pixels_per_line,
268 int xoffset,
269 int yoffset,
270 const unsigned char *dst_ptr,
271 int dst_pixels_per_line,
272 unsigned int *sse
273 )
274 {
275
276 int xsum;
277 unsigned int xxsum;
278 vp8_filter_block2d_bil_var_sse2(
279 src_ptr, src_pixels_per_line,
280 dst_ptr, dst_pixels_per_line, 8,
281 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
282 &xsum, &xxsum
283 );
284
285 *sse = xxsum;
286 return (xxsum - ((xsum * xsum) >> 6));
287 }
288
vp8_sub_pixel_variance16x16_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)289 unsigned int vp8_sub_pixel_variance16x16_wmt
290 (
291 const unsigned char *src_ptr,
292 int src_pixels_per_line,
293 int xoffset,
294 int yoffset,
295 const unsigned char *dst_ptr,
296 int dst_pixels_per_line,
297 unsigned int *sse
298 )
299 {
300 int xsum0, xsum1;
301 unsigned int xxsum0, xxsum1;
302
303
304 // note we could avoid these if statements if the calling function
305 // just called the appropriate functions inside.
306 if (xoffset == 4 && yoffset == 0)
307 {
308 vp8_half_horiz_variance16x_h_sse2(
309 src_ptr, src_pixels_per_line,
310 dst_ptr, dst_pixels_per_line, 16,
311 &xsum0, &xxsum0);
312
313 vp8_half_horiz_variance16x_h_sse2(
314 src_ptr + 8, src_pixels_per_line,
315 dst_ptr + 8, dst_pixels_per_line, 16,
316 &xsum1, &xxsum1);
317 }
318 else if (xoffset == 0 && yoffset == 4)
319 {
320 vp8_half_vert_variance16x_h_sse2(
321 src_ptr, src_pixels_per_line,
322 dst_ptr, dst_pixels_per_line, 16,
323 &xsum0, &xxsum0);
324
325 vp8_half_vert_variance16x_h_sse2(
326 src_ptr + 8, src_pixels_per_line,
327 dst_ptr + 8, dst_pixels_per_line, 16,
328 &xsum1, &xxsum1);
329 }
330 else if (xoffset == 4 && yoffset == 4)
331 {
332 vp8_half_horiz_vert_variance16x_h_sse2(
333 src_ptr, src_pixels_per_line,
334 dst_ptr, dst_pixels_per_line, 16,
335 &xsum0, &xxsum0);
336
337 vp8_half_horiz_vert_variance16x_h_sse2(
338 src_ptr + 8, src_pixels_per_line,
339 dst_ptr + 8, dst_pixels_per_line, 16,
340 &xsum1, &xxsum1);
341 }
342 else
343 {
344 vp8_filter_block2d_bil_var_sse2(
345 src_ptr, src_pixels_per_line,
346 dst_ptr, dst_pixels_per_line, 16,
347 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
348 &xsum0, &xxsum0
349 );
350
351
352 vp8_filter_block2d_bil_var_sse2(
353 src_ptr + 8, src_pixels_per_line,
354 dst_ptr + 8, dst_pixels_per_line, 16,
355 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
356 &xsum1, &xxsum1
357 );
358 }
359
360 xsum0 += xsum1;
361 xxsum0 += xxsum1;
362 *sse = xxsum0;
363 return (xxsum0 - ((xsum0 * xsum0) >> 8));
364 }
365
/*
 * Sub-pixel squared error of a 16x16 block.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() purely for the value it
 * stores through *sse; the variance it returns is discarded and the stored
 * SSE is returned instead.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
379
vp8_sub_pixel_variance16x8_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)380 unsigned int vp8_sub_pixel_variance16x8_wmt
381 (
382 const unsigned char *src_ptr,
383 int src_pixels_per_line,
384 int xoffset,
385 int yoffset,
386 const unsigned char *dst_ptr,
387 int dst_pixels_per_line,
388 unsigned int *sse
389
390 )
391 {
392 int xsum0, xsum1;
393 unsigned int xxsum0, xxsum1;
394
395
396 vp8_filter_block2d_bil_var_sse2(
397 src_ptr, src_pixels_per_line,
398 dst_ptr, dst_pixels_per_line, 8,
399 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
400 &xsum0, &xxsum0
401 );
402
403
404 vp8_filter_block2d_bil_var_sse2(
405 src_ptr + 8, src_pixels_per_line,
406 dst_ptr + 8, dst_pixels_per_line, 8,
407 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
408 &xsum1, &xxsum1
409 );
410
411 xsum0 += xsum1;
412 xxsum0 += xxsum1;
413
414 *sse = xxsum0;
415 return (xxsum0 - ((xsum0 * xsum0) >> 7));
416 }
417
vp8_sub_pixel_variance8x16_wmt(const unsigned char * src_ptr,int src_pixels_per_line,int xoffset,int yoffset,const unsigned char * dst_ptr,int dst_pixels_per_line,unsigned int * sse)418 unsigned int vp8_sub_pixel_variance8x16_wmt
419 (
420 const unsigned char *src_ptr,
421 int src_pixels_per_line,
422 int xoffset,
423 int yoffset,
424 const unsigned char *dst_ptr,
425 int dst_pixels_per_line,
426 unsigned int *sse
427 )
428 {
429 int xsum;
430 unsigned int xxsum;
431 vp8_filter_block2d_bil_var_sse2(
432 src_ptr, src_pixels_per_line,
433 dst_ptr, dst_pixels_per_line, 16,
434 vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
435 &xsum, &xxsum
436 );
437
438 *sse = xxsum;
439 return (xxsum - ((xsum * xsum) >> 7));
440 }
441
/*
 * 16x16 variance assembled from four 8x8 measurements.
 *
 * Two measurements start at the given pointers (column offsets 0 and 8);
 * the other two start half a stride further on (again at column offsets
 * 0 and 8).  Every call passes the full, unmodified strides to the helper.
 * NOTE(review): the half-stride layout presumably serves interlaced/field
 * data -- confirm against the callers.
 *
 * The combined SSE is stored through *sse; the return value is
 * SSE - Sum^2 / 256.
 */
unsigned int vp8_i_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride,
                       ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride,
                       ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *sse = var;

    /*
     * Over 256 8-bit pixels |avg| can reach 255 * 256 = 65280.  Its square
     * exceeds INT_MAX (signed overflow is undefined) but still fits in a
     * 32-bit unsigned int, so the product is formed in unsigned arithmetic.
     */
    return (var - (((unsigned int)avg * avg) >> 8));
}
465
/*
 * 8x16 variance assembled from two 8x8 measurements.
 *
 * The first measurement starts at the given pointers, the second half a
 * stride further on; both calls pass the full strides to the helper.
 * NOTE(review): the half-stride layout presumably serves interlaced/field
 * data -- confirm against the callers.
 *
 * The combined SSE is stored through *sse; the return value is
 * SSE - Sum^2 / 128.
 */
unsigned int vp8_i_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sq_first, sq_second, sq_total;
    int sum_first, sum_second, sum_total;

    const unsigned char *src_second = src_ptr + (source_stride >> 1);
    const unsigned char *ref_second = ref_ptr + (recon_stride >> 1);

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_first, &sum_first);
    vp8_get8x8var_sse2(src_second, source_stride, ref_second, recon_stride,
                       &sq_second, &sum_second);

    sq_total  = sq_first + sq_second;
    sum_total = sum_first + sum_second;

    *sse = sq_total;
    return sq_total - ((sum_total * sum_total) >> 7);
}
485
486
/*
 * Thin wrapper: halves both strides (arithmetic shift right by one) and
 * forwards everything else unchanged to vp8_sub_pixel_variance16x16_wmt(),
 * returning its result.
 */
unsigned int vp8_i_sub_pixel_variance16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    const int half_src_stride = src_pixels_per_line >> 1;
    const int half_dst_stride = dst_pixels_per_line >> 1;

    return vp8_sub_pixel_variance16x16_wmt(src_ptr, half_src_stride,
                                           xoffset, yoffset,
                                           dst_ptr, half_dst_stride,
                                           sse);
}
500
501
/*
 * Thin wrapper: halves both strides (arithmetic shift right by one) and
 * forwards everything else unchanged to vp8_sub_pixel_variance8x16_wmt(),
 * returning its result.
 */
unsigned int vp8_i_sub_pixel_variance8x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    const int half_src_stride = src_pixels_per_line >> 1;
    const int half_dst_stride = dst_pixels_per_line >> 1;

    return vp8_sub_pixel_variance8x16_wmt(src_ptr, half_src_stride,
                                          xoffset, yoffset,
                                          dst_ptr, half_dst_stride,
                                          sse);
}
516
517
/*
 * 16x16 variance using the horizontal half-pixel helper.
 *
 * The helper is run over 16 rows at column offsets 0 and 8 and the partial
 * results are added.  The combined sum of squares is stored through *sse;
 * the return value is that sum minus Sum^2 / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;

    /*
     * Over 256 8-bit pixels |xsum0| can reach 255 * 256 = 65280.  Its square
     * exceeds INT_MAX (signed overflow is undefined) but still fits in a
     * 32-bit unsigned int, so the product is formed in unsigned arithmetic.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
543
544
/*
 * 16x16 variance using the vertical half-pixel helper.
 *
 * The helper is run over 16 rows at column offsets 0 and 8 and the partial
 * results are added.  The combined sum of squares is stored through *sse;
 * the return value is that sum minus Sum^2 / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;

    /*
     * Over 256 8-bit pixels |xsum0| can reach 255 * 256 = 65280.  Its square
     * exceeds INT_MAX (signed overflow is undefined) but still fits in a
     * 32-bit unsigned int, so the product is formed in unsigned arithmetic.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
570
571
/*
 * 16x16 variance using the combined horizontal + vertical half-pixel helper.
 *
 * The helper is run over 16 rows at column offsets 0 and 8 and the partial
 * results are added.  The combined sum of squares is stored through *sse;
 * the return value is that sum minus Sum^2 / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;

    /*
     * Over 256 8-bit pixels |xsum0| can reach 255 * 256 = 65280.  Its square
     * exceeds INT_MAX (signed overflow is undefined) but still fits in a
     * 32-bit unsigned int, so the product is formed in unsigned arithmetic.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
597