1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include <math.h>
13 #include <stdlib.h>
14 #include "vpx_scale/yv12config.h"
15 #include "pragmas.h"
16
17 #define VP8_FILTER_WEIGHT 128
18 #define VP8_FILTER_SHIFT 7
19
20
21
22 /* static constants */
23 __declspec(align(16))
24 const static short Blur[48] =
25 {
26
27 16, 16, 16, 16, 16, 16, 16, 16,
28 16, 16, 16, 16, 16, 16, 16, 16,
29 64, 64, 64, 64, 64, 64, 64, 64,
30 16, 16, 16, 16, 16, 16, 16, 16,
31 16, 16, 16, 16, 16, 16, 16, 16,
32 0, 0, 0, 0, 0, 0, 0, 0,
33
34 };
35 #define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
37
38 #ifndef RELOCATEABLE
39 const static RD;
40 const static R4D2;
41 #endif
42
43
44 /* external references */
45 extern double vp8_gaussian(double sigma, double mu, double x);
46 extern short vp8_rv[];
47 extern int vp8_q2mbl(int x) ;
48
49
50
vp8_post_proc_down_and_across_mmx(unsigned char * src_ptr,unsigned char * dst_ptr,int src_pixels_per_line,int dst_pixels_per_line,int rows,int cols,int flimit)51 void vp8_post_proc_down_and_across_mmx
52 (
53 unsigned char *src_ptr,
54 unsigned char *dst_ptr,
55 int src_pixels_per_line,
56 int dst_pixels_per_line,
57 int rows,
58 int cols,
59 int flimit
60 )
61 {
62 #ifdef RELOCATEABLE
63 RD
64 R4D2
65 #endif
66
67 __asm
68 {
69 push ebx
70 lea ebx, Blur
71 movd mm2, flimit
72 punpcklwd mm2, mm2
73 punpckldq mm2, mm2
74
75 mov esi, src_ptr
76 mov edi, dst_ptr
77
78 mov ecx, DWORD PTR rows
79 mov eax, src_pixels_per_line ;
80 destination pitch?
81 pxor mm0, mm0 ;
82 mm0 = 00000000
83
84 nextrow:
85
86 xor edx, edx ;
87
88 clear out edx for use as loop counter
89 nextcol:
90
91 pxor mm7, mm7 ;
92
93 mm7 = 00000000
94 movq mm6, [ebx + 32 ] ;
95 mm6 = kernel 2 taps
96 movq mm3, [esi] ;
97 mm4 = r0 p0..p7
98 punpcklbw mm3, mm0 ;
99 mm3 = p0..p3
100 movq mm1, mm3 ;
101 mm1 = p0..p3
102 pmullw mm3, mm6 ;
103 mm3 *= kernel 2 modifiers
104
105 movq mm6, [ebx + 48] ;
106 mm6 = kernel 3 taps
107 movq mm5, [esi + eax] ;
108 mm4 = r1 p0..p7
109 punpcklbw mm5, mm0 ;
110 mm5 = r1 p0..p3
111 pmullw mm6, mm5 ;
112 mm6 *= p0..p3 * kernel 3 modifiers
113 paddusw mm3, mm6 ;
114 mm3 += mm6
115
116 ;
117 thresholding
118 movq mm7, mm1 ;
119 mm7 = r0 p0..p3
120 psubusw mm7, mm5 ;
121 mm7 = r0 p0..p3 - r1 p0..p3
122 psubusw mm5, mm1 ;
123 mm5 = r1 p0..p3 - r0 p0..p3
124 paddusw mm7, mm5 ;
125 mm7 = abs(r0 p0..p3 - r1 p0..p3)
126 pcmpgtw mm7, mm2
127
128 movq mm6, [ebx + 64 ] ;
129 mm6 = kernel 4 modifiers
130 movq mm5, [esi + 2*eax] ;
131 mm4 = r2 p0..p7
132 punpcklbw mm5, mm0 ;
133 mm5 = r2 p0..p3
134 pmullw mm6, mm5 ;
135 mm5 *= kernel 4 modifiers
136 paddusw mm3, mm6 ;
137 mm3 += mm5
138
139 ;
140 thresholding
141 movq mm6, mm1 ;
142 mm6 = r0 p0..p3
143 psubusw mm6, mm5 ;
144 mm6 = r0 p0..p3 - r2 p0..p3
145 psubusw mm5, mm1 ;
146 mm5 = r2 p0..p3 - r2 p0..p3
147 paddusw mm6, mm5 ;
148 mm6 = abs(r0 p0..p3 - r2 p0..p3)
149 pcmpgtw mm6, mm2
150 por mm7, mm6 ;
151 accumulate thresholds
152
153
154 neg eax
155 movq mm6, [ebx ] ;
156 kernel 0 taps
157 movq mm5, [esi+2*eax] ;
158 mm4 = r-2 p0..p7
159 punpcklbw mm5, mm0 ;
160 mm5 = r-2 p0..p3
161 pmullw mm6, mm5 ;
162 mm5 *= kernel 0 modifiers
163 paddusw mm3, mm6 ;
164 mm3 += mm5
165
166 ;
167 thresholding
168 movq mm6, mm1 ;
169 mm6 = r0 p0..p3
170 psubusw mm6, mm5 ;
171 mm6 = p0..p3 - r-2 p0..p3
172 psubusw mm5, mm1 ;
173 mm5 = r-2 p0..p3 - p0..p3
174 paddusw mm6, mm5 ;
175 mm6 = abs(r0 p0..p3 - r-2 p0..p3)
176 pcmpgtw mm6, mm2
177 por mm7, mm6 ;
178 accumulate thresholds
179
180 movq mm6, [ebx + 16] ;
181 kernel 1 taps
182 movq mm4, [esi+eax] ;
183 mm4 = r-1 p0..p7
184 punpcklbw mm4, mm0 ;
185 mm4 = r-1 p0..p3
186 pmullw mm6, mm4 ;
187 mm4 *= kernel 1 modifiers.
188 paddusw mm3, mm6 ;
189 mm3 += mm5
190
191 ;
192 thresholding
193 movq mm6, mm1 ;
194 mm6 = r0 p0..p3
195 psubusw mm6, mm4 ;
196 mm6 = p0..p3 - r-2 p0..p3
197 psubusw mm4, mm1 ;
198 mm5 = r-1 p0..p3 - p0..p3
199 paddusw mm6, mm4 ;
200 mm6 = abs(r0 p0..p3 - r-1 p0..p3)
201 pcmpgtw mm6, mm2
202 por mm7, mm6 ;
203 accumulate thresholds
204
205
206 paddusw mm3, rd ;
207 mm3 += round value
208 psraw mm3, VP8_FILTER_SHIFT ;
209 mm3 /= 128
210
211 pand mm1, mm7 ;
212 mm1 select vals > thresh from source
213 pandn mm7, mm3 ;
214 mm7 select vals < thresh from blurred result
215 paddusw mm1, mm7 ;
216 combination
217
218 packuswb mm1, mm0 ;
219 pack to bytes
220
221 movd [edi], mm1 ;
222 neg eax ;
223 pitch is positive
224
225
226 add esi, 4
227 add edi, 4
228 add edx, 4
229
230 cmp edx, cols
231 jl nextcol
232 // done with the all cols, start the across filtering in place
233 sub esi, edx
234 sub edi, edx
235
236
237 push eax
238 xor edx, edx
239 mov eax, [edi-4];
240
241 acrossnextcol:
242 pxor mm7, mm7 ;
243 mm7 = 00000000
244 movq mm6, [ebx + 32 ] ;
245 movq mm4, [edi+edx] ;
246 mm4 = p0..p7
247 movq mm3, mm4 ;
248 mm3 = p0..p7
249 punpcklbw mm3, mm0 ;
250 mm3 = p0..p3
251 movq mm1, mm3 ;
252 mm1 = p0..p3
253 pmullw mm3, mm6 ;
254 mm3 *= kernel 2 modifiers
255
256 movq mm6, [ebx + 48]
257 psrlq mm4, 8 ;
258 mm4 = p1..p7
259 movq mm5, mm4 ;
260 mm5 = p1..p7
261 punpcklbw mm5, mm0 ;
262 mm5 = p1..p4
263 pmullw mm6, mm5 ;
264 mm6 *= p1..p4 * kernel 3 modifiers
265 paddusw mm3, mm6 ;
266 mm3 += mm6
267
268 ;
269 thresholding
270 movq mm7, mm1 ;
271 mm7 = p0..p3
272 psubusw mm7, mm5 ;
273 mm7 = p0..p3 - p1..p4
274 psubusw mm5, mm1 ;
275 mm5 = p1..p4 - p0..p3
276 paddusw mm7, mm5 ;
277 mm7 = abs(p0..p3 - p1..p4)
278 pcmpgtw mm7, mm2
279
280 movq mm6, [ebx + 64 ]
281 psrlq mm4, 8 ;
282 mm4 = p2..p7
283 movq mm5, mm4 ;
284 mm5 = p2..p7
285 punpcklbw mm5, mm0 ;
286 mm5 = p2..p5
287 pmullw mm6, mm5 ;
288 mm5 *= kernel 4 modifiers
289 paddusw mm3, mm6 ;
290 mm3 += mm5
291
292 ;
293 thresholding
294 movq mm6, mm1 ;
295 mm6 = p0..p3
296 psubusw mm6, mm5 ;
297 mm6 = p0..p3 - p1..p4
298 psubusw mm5, mm1 ;
299 mm5 = p1..p4 - p0..p3
300 paddusw mm6, mm5 ;
301 mm6 = abs(p0..p3 - p1..p4)
302 pcmpgtw mm6, mm2
303 por mm7, mm6 ;
304 accumulate thresholds
305
306
307 movq mm6, [ebx ]
308 movq mm4, [edi+edx-2] ;
309 mm4 = p-2..p5
310 movq mm5, mm4 ;
311 mm5 = p-2..p5
312 punpcklbw mm5, mm0 ;
313 mm5 = p-2..p1
314 pmullw mm6, mm5 ;
315 mm5 *= kernel 0 modifiers
316 paddusw mm3, mm6 ;
317 mm3 += mm5
318
319 ;
320 thresholding
321 movq mm6, mm1 ;
322 mm6 = p0..p3
323 psubusw mm6, mm5 ;
324 mm6 = p0..p3 - p1..p4
325 psubusw mm5, mm1 ;
326 mm5 = p1..p4 - p0..p3
327 paddusw mm6, mm5 ;
328 mm6 = abs(p0..p3 - p1..p4)
329 pcmpgtw mm6, mm2
330 por mm7, mm6 ;
331 accumulate thresholds
332
333 movq mm6, [ebx + 16]
334 psrlq mm4, 8 ;
335 mm4 = p-1..p5
336 punpcklbw mm4, mm0 ;
337 mm4 = p-1..p2
338 pmullw mm6, mm4 ;
339 mm4 *= kernel 1 modifiers.
340 paddusw mm3, mm6 ;
341 mm3 += mm5
342
343 ;
344 thresholding
345 movq mm6, mm1 ;
346 mm6 = p0..p3
347 psubusw mm6, mm4 ;
348 mm6 = p0..p3 - p1..p4
349 psubusw mm4, mm1 ;
350 mm5 = p1..p4 - p0..p3
351 paddusw mm6, mm4 ;
352 mm6 = abs(p0..p3 - p1..p4)
353 pcmpgtw mm6, mm2
354 por mm7, mm6 ;
355 accumulate thresholds
356
357 paddusw mm3, rd ;
358 mm3 += round value
359 psraw mm3, VP8_FILTER_SHIFT ;
360 mm3 /= 128
361
362 pand mm1, mm7 ;
363 mm1 select vals > thresh from source
364 pandn mm7, mm3 ;
365 mm7 select vals < thresh from blurred result
366 paddusw mm1, mm7 ;
367 combination
368
369 packuswb mm1, mm0 ;
370 pack to bytes
371 mov DWORD PTR [edi+edx-4], eax ;
372 store previous four bytes
373 movd eax, mm1
374
375 add edx, 4
376 cmp edx, cols
377 jl acrossnextcol;
378
379 mov DWORD PTR [edi+edx-4], eax
380 pop eax
381
382 // done with this rwo
383 add esi, eax ;
384 next line
385 mov eax, dst_pixels_per_line ;
386 destination pitch?
387 add edi, eax ;
388 next destination
389 mov eax, src_pixels_per_line ;
390 destination pitch?
391
392 dec ecx ;
393 decrement count
394 jnz nextrow ;
395 next row
396 pop ebx
397
398 }
399 }
400
401
402
vp8_post_proc_down_and_across_xmm(unsigned char * src_ptr,unsigned char * dst_ptr,int src_pixels_per_line,int dst_pixels_per_line,int rows,int cols,int flimit)403 void vp8_post_proc_down_and_across_xmm
404 (
405 unsigned char *src_ptr,
406 unsigned char *dst_ptr,
407 int src_pixels_per_line,
408 int dst_pixels_per_line,
409 int rows,
410 int cols,
411 int flimit
412 )
413 {
414 #ifdef RELOCATEABLE
415 R4D2
416 #endif
417
418 __asm
419 {
420 movd xmm2, flimit
421 punpcklwd xmm2, xmm2
422 punpckldq xmm2, xmm2
423 punpcklqdq xmm2, xmm2
424
425 mov esi, src_ptr
426 mov edi, dst_ptr
427
428 mov ecx, DWORD PTR rows
429 mov eax, src_pixels_per_line ;
430 destination pitch?
431 pxor xmm0, xmm0 ;
432 mm0 = 00000000
433
434 nextrow:
435
436 xor edx, edx ;
437
438 clear out edx for use as loop counter
439 nextcol:
440 movq xmm3, QWORD PTR [esi] ;
441
442 mm4 = r0 p0..p7
443 punpcklbw xmm3, xmm0 ;
444 mm3 = p0..p3
445 movdqa xmm1, xmm3 ;
446 mm1 = p0..p3
447 psllw xmm3, 2 ;
448
449 movq xmm5, QWORD PTR [esi + eax] ;
450 mm4 = r1 p0..p7
451 punpcklbw xmm5, xmm0 ;
452 mm5 = r1 p0..p3
453 paddusw xmm3, xmm5 ;
454 mm3 += mm6
455
456 ;
457 thresholding
458 movdqa xmm7, xmm1 ;
459 mm7 = r0 p0..p3
460 psubusw xmm7, xmm5 ;
461 mm7 = r0 p0..p3 - r1 p0..p3
462 psubusw xmm5, xmm1 ;
463 mm5 = r1 p0..p3 - r0 p0..p3
464 paddusw xmm7, xmm5 ;
465 mm7 = abs(r0 p0..p3 - r1 p0..p3)
466 pcmpgtw xmm7, xmm2
467
468 movq xmm5, QWORD PTR [esi + 2*eax] ;
469 mm4 = r2 p0..p7
470 punpcklbw xmm5, xmm0 ;
471 mm5 = r2 p0..p3
472 paddusw xmm3, xmm5 ;
473 mm3 += mm5
474
475 ;
476 thresholding
477 movdqa xmm6, xmm1 ;
478 mm6 = r0 p0..p3
479 psubusw xmm6, xmm5 ;
480 mm6 = r0 p0..p3 - r2 p0..p3
481 psubusw xmm5, xmm1 ;
482 mm5 = r2 p0..p3 - r2 p0..p3
483 paddusw xmm6, xmm5 ;
484 mm6 = abs(r0 p0..p3 - r2 p0..p3)
485 pcmpgtw xmm6, xmm2
486 por xmm7, xmm6 ;
487 accumulate thresholds
488
489
490 neg eax
491 movq xmm5, QWORD PTR [esi+2*eax] ;
492 mm4 = r-2 p0..p7
493 punpcklbw xmm5, xmm0 ;
494 mm5 = r-2 p0..p3
495 paddusw xmm3, xmm5 ;
496 mm3 += mm5
497
498 ;
499 thresholding
500 movdqa xmm6, xmm1 ;
501 mm6 = r0 p0..p3
502 psubusw xmm6, xmm5 ;
503 mm6 = p0..p3 - r-2 p0..p3
504 psubusw xmm5, xmm1 ;
505 mm5 = r-2 p0..p3 - p0..p3
506 paddusw xmm6, xmm5 ;
507 mm6 = abs(r0 p0..p3 - r-2 p0..p3)
508 pcmpgtw xmm6, xmm2
509 por xmm7, xmm6 ;
510 accumulate thresholds
511
512 movq xmm4, QWORD PTR [esi+eax] ;
513 mm4 = r-1 p0..p7
514 punpcklbw xmm4, xmm0 ;
515 mm4 = r-1 p0..p3
516 paddusw xmm3, xmm4 ;
517 mm3 += mm5
518
519 ;
520 thresholding
521 movdqa xmm6, xmm1 ;
522 mm6 = r0 p0..p3
523 psubusw xmm6, xmm4 ;
524 mm6 = p0..p3 - r-2 p0..p3
525 psubusw xmm4, xmm1 ;
526 mm5 = r-1 p0..p3 - p0..p3
527 paddusw xmm6, xmm4 ;
528 mm6 = abs(r0 p0..p3 - r-1 p0..p3)
529 pcmpgtw xmm6, xmm2
530 por xmm7, xmm6 ;
531 accumulate thresholds
532
533
534 paddusw xmm3, rd42 ;
535 mm3 += round value
536 psraw xmm3, 3 ;
537 mm3 /= 8
538
539 pand xmm1, xmm7 ;
540 mm1 select vals > thresh from source
541 pandn xmm7, xmm3 ;
542 mm7 select vals < thresh from blurred result
543 paddusw xmm1, xmm7 ;
544 combination
545
546 packuswb xmm1, xmm0 ;
547 pack to bytes
548 movq QWORD PTR [edi], xmm1 ;
549
550 neg eax ;
551 pitch is positive
552 add esi, 8
553 add edi, 8
554
555 add edx, 8
556 cmp edx, cols
557
558 jl nextcol
559
560 // done with the all cols, start the across filtering in place
561 sub esi, edx
562 sub edi, edx
563
564 xor edx, edx
565 movq mm0, QWORD PTR [edi-8];
566
567 acrossnextcol:
568 movq xmm7, QWORD PTR [edi +edx -2]
569 movd xmm4, DWORD PTR [edi +edx +6]
570
571 pslldq xmm4, 8
572 por xmm4, xmm7
573
574 movdqa xmm3, xmm4
575 psrldq xmm3, 2
576 punpcklbw xmm3, xmm0 ;
577 mm3 = p0..p3
578 movdqa xmm1, xmm3 ;
579 mm1 = p0..p3
580 psllw xmm3, 2
581
582
583 movdqa xmm5, xmm4
584 psrldq xmm5, 3
585 punpcklbw xmm5, xmm0 ;
586 mm5 = p1..p4
587 paddusw xmm3, xmm5 ;
588 mm3 += mm6
589
590 ;
591 thresholding
592 movdqa xmm7, xmm1 ;
593 mm7 = p0..p3
594 psubusw xmm7, xmm5 ;
595 mm7 = p0..p3 - p1..p4
596 psubusw xmm5, xmm1 ;
597 mm5 = p1..p4 - p0..p3
598 paddusw xmm7, xmm5 ;
599 mm7 = abs(p0..p3 - p1..p4)
600 pcmpgtw xmm7, xmm2
601
602 movdqa xmm5, xmm4
603 psrldq xmm5, 4
604 punpcklbw xmm5, xmm0 ;
605 mm5 = p2..p5
606 paddusw xmm3, xmm5 ;
607 mm3 += mm5
608
609 ;
610 thresholding
611 movdqa xmm6, xmm1 ;
612 mm6 = p0..p3
613 psubusw xmm6, xmm5 ;
614 mm6 = p0..p3 - p1..p4
615 psubusw xmm5, xmm1 ;
616 mm5 = p1..p4 - p0..p3
617 paddusw xmm6, xmm5 ;
618 mm6 = abs(p0..p3 - p1..p4)
619 pcmpgtw xmm6, xmm2
620 por xmm7, xmm6 ;
621 accumulate thresholds
622
623
624 movdqa xmm5, xmm4 ;
625 mm5 = p-2..p5
626 punpcklbw xmm5, xmm0 ;
627 mm5 = p-2..p1
628 paddusw xmm3, xmm5 ;
629 mm3 += mm5
630
631 ;
632 thresholding
633 movdqa xmm6, xmm1 ;
634 mm6 = p0..p3
635 psubusw xmm6, xmm5 ;
636 mm6 = p0..p3 - p1..p4
637 psubusw xmm5, xmm1 ;
638 mm5 = p1..p4 - p0..p3
639 paddusw xmm6, xmm5 ;
640 mm6 = abs(p0..p3 - p1..p4)
641 pcmpgtw xmm6, xmm2
642 por xmm7, xmm6 ;
643 accumulate thresholds
644
645 psrldq xmm4, 1 ;
646 mm4 = p-1..p5
647 punpcklbw xmm4, xmm0 ;
648 mm4 = p-1..p2
649 paddusw xmm3, xmm4 ;
650 mm3 += mm5
651
652 ;
653 thresholding
654 movdqa xmm6, xmm1 ;
655 mm6 = p0..p3
656 psubusw xmm6, xmm4 ;
657 mm6 = p0..p3 - p1..p4
658 psubusw xmm4, xmm1 ;
659 mm5 = p1..p4 - p0..p3
660 paddusw xmm6, xmm4 ;
661 mm6 = abs(p0..p3 - p1..p4)
662 pcmpgtw xmm6, xmm2
663 por xmm7, xmm6 ;
664 accumulate thresholds
665
666 paddusw xmm3, rd42 ;
667 mm3 += round value
668 psraw xmm3, 3 ;
669 mm3 /= 8
670
671 pand xmm1, xmm7 ;
672 mm1 select vals > thresh from source
673 pandn xmm7, xmm3 ;
674 mm7 select vals < thresh from blurred result
675 paddusw xmm1, xmm7 ;
676 combination
677
678 packuswb xmm1, xmm0 ;
679 pack to bytes
680 movq QWORD PTR [edi+edx-8], mm0 ;
681 store previous four bytes
682 movdq2q mm0, xmm1
683
684 add edx, 8
685 cmp edx, cols
686 jl acrossnextcol;
687
688 // last 8 pixels
689 movq QWORD PTR [edi+edx-8], mm0
690
691 // done with this rwo
692 add esi, eax ;
693 next line
694 mov eax, dst_pixels_per_line ;
695 destination pitch?
696 add edi, eax ;
697 next destination
698 mov eax, src_pixels_per_line ;
699 destination pitch?
700
701 dec ecx ;
702 decrement count
703 jnz nextrow ;
704 next row
705 }
706 }
707
708
vp8_mbpost_proc_down_mmx(unsigned char * dst,int pitch,int rows,int cols,int flimit)709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
710 {
711 int c, i;
712 __declspec(align(16))
713 int flimit2[2];
714 __declspec(align(16))
715 unsigned char d[16][8];
716
717 flimit = vp8_q2mbl(flimit);
718
719 for (i = 0; i < 2; i++)
720 flimit2[i] = flimit;
721
722 rows += 8;
723
724 for (c = 0; c < cols; c += 4)
725 {
726 unsigned char *s = &dst[c];
727
728 __asm
729 {
730 mov esi, s ;
731 pxor mm0, mm0 ;
732
733 mov eax, pitch ;
734 neg eax // eax = -pitch
735
736 lea esi, [esi + eax*8]; // edi = s[-pitch*8]
737 neg eax
738
739
740 pxor mm5, mm5
741 pxor mm6, mm6 ;
742
743 pxor mm7, mm7 ;
744 mov edi, esi
745
746 mov ecx, 15 ;
747
748 loop_initvar:
749 movd mm1, DWORD PTR [edi];
750 punpcklbw mm1, mm0 ;
751
752 paddw mm5, mm1 ;
753 pmullw mm1, mm1 ;
754
755 movq mm2, mm1 ;
756 punpcklwd mm1, mm0 ;
757
758 punpckhwd mm2, mm0 ;
759 paddd mm6, mm1 ;
760
761 paddd mm7, mm2 ;
762 lea edi, [edi+eax] ;
763
764 dec ecx
765 jne loop_initvar
766 //save the var and sum
767 xor edx, edx
768 loop_row:
769 movd mm1, DWORD PTR [esi] // [s-pitch*8]
770 movd mm2, DWORD PTR [edi] // [s+pitch*7]
771
772 punpcklbw mm1, mm0
773 punpcklbw mm2, mm0
774
775 paddw mm5, mm2
776 psubw mm5, mm1
777
778 pmullw mm2, mm2
779 movq mm4, mm2
780
781 punpcklwd mm2, mm0
782 punpckhwd mm4, mm0
783
784 paddd mm6, mm2
785 paddd mm7, mm4
786
787 pmullw mm1, mm1
788 movq mm2, mm1
789
790 punpcklwd mm1, mm0
791 psubd mm6, mm1
792
793 punpckhwd mm2, mm0
794 psubd mm7, mm2
795
796
797 movq mm3, mm6
798 pslld mm3, 4
799
800 psubd mm3, mm6
801 movq mm1, mm5
802
803 movq mm4, mm5
804 pmullw mm1, mm1
805
806 pmulhw mm4, mm4
807 movq mm2, mm1
808
809 punpcklwd mm1, mm4
810 punpckhwd mm2, mm4
811
812 movq mm4, mm7
813 pslld mm4, 4
814
815 psubd mm4, mm7
816
817 psubd mm3, mm1
818 psubd mm4, mm2
819
820 psubd mm3, flimit2
821 psubd mm4, flimit2
822
823 psrad mm3, 31
824 psrad mm4, 31
825
826 packssdw mm3, mm4
827 packsswb mm3, mm0
828
829 movd mm1, DWORD PTR [esi+eax*8]
830
831 movq mm2, mm1
832 punpcklbw mm1, mm0
833
834 paddw mm1, mm5
835 mov ecx, edx
836
837 and ecx, 127
838 movq mm4, vp8_rv[ecx*2]
839
840 paddw mm1, mm4
841 //paddw xmm1, eight8s
842 psraw mm1, 4
843
844 packuswb mm1, mm0
845 pand mm1, mm3
846
847 pandn mm3, mm2
848 por mm1, mm3
849
850 and ecx, 15
851 movd DWORD PTR d[ecx*4], mm1
852
853 mov ecx, edx
854 sub ecx, 8
855
856 and ecx, 15
857 movd mm1, DWORD PTR d[ecx*4]
858
859 movd [esi], mm1
860 lea esi, [esi+eax]
861
862 lea edi, [edi+eax]
863 add edx, 1
864
865 cmp edx, rows
866 jl loop_row
867
868 }
869
870 }
871 }
872
vp8_mbpost_proc_down_xmm(unsigned char * dst,int pitch,int rows,int cols,int flimit)873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
874 {
875 int c, i;
876 __declspec(align(16))
877 int flimit4[4];
878 __declspec(align(16))
879 unsigned char d[16][8];
880
881 flimit = vp8_q2mbl(flimit);
882
883 for (i = 0; i < 4; i++)
884 flimit4[i] = flimit;
885
886 rows += 8;
887
888 for (c = 0; c < cols; c += 8)
889 {
890 unsigned char *s = &dst[c];
891
892 __asm
893 {
894 mov esi, s ;
895 pxor xmm0, xmm0 ;
896
897 mov eax, pitch ;
898 neg eax // eax = -pitch
899
900 lea esi, [esi + eax*8]; // edi = s[-pitch*8]
901 neg eax
902
903
904 pxor xmm5, xmm5
905 pxor xmm6, xmm6 ;
906
907 pxor xmm7, xmm7 ;
908 mov edi, esi
909
910 mov ecx, 15 ;
911
912 loop_initvar:
913 movq xmm1, QWORD PTR [edi];
914 punpcklbw xmm1, xmm0 ;
915
916 paddw xmm5, xmm1 ;
917 pmullw xmm1, xmm1 ;
918
919 movdqa xmm2, xmm1 ;
920 punpcklwd xmm1, xmm0 ;
921
922 punpckhwd xmm2, xmm0 ;
923 paddd xmm6, xmm1 ;
924
925 paddd xmm7, xmm2 ;
926 lea edi, [edi+eax] ;
927
928 dec ecx
929 jne loop_initvar
930 //save the var and sum
931 xor edx, edx
932 loop_row:
933 movq xmm1, QWORD PTR [esi] // [s-pitch*8]
934 movq xmm2, QWORD PTR [edi] // [s+pitch*7]
935
936 punpcklbw xmm1, xmm0
937 punpcklbw xmm2, xmm0
938
939 paddw xmm5, xmm2
940 psubw xmm5, xmm1
941
942 pmullw xmm2, xmm2
943 movdqa xmm4, xmm2
944
945 punpcklwd xmm2, xmm0
946 punpckhwd xmm4, xmm0
947
948 paddd xmm6, xmm2
949 paddd xmm7, xmm4
950
951 pmullw xmm1, xmm1
952 movdqa xmm2, xmm1
953
954 punpcklwd xmm1, xmm0
955 psubd xmm6, xmm1
956
957 punpckhwd xmm2, xmm0
958 psubd xmm7, xmm2
959
960
961 movdqa xmm3, xmm6
962 pslld xmm3, 4
963
964 psubd xmm3, xmm6
965 movdqa xmm1, xmm5
966
967 movdqa xmm4, xmm5
968 pmullw xmm1, xmm1
969
970 pmulhw xmm4, xmm4
971 movdqa xmm2, xmm1
972
973 punpcklwd xmm1, xmm4
974 punpckhwd xmm2, xmm4
975
976 movdqa xmm4, xmm7
977 pslld xmm4, 4
978
979 psubd xmm4, xmm7
980
981 psubd xmm3, xmm1
982 psubd xmm4, xmm2
983
984 psubd xmm3, flimit4
985 psubd xmm4, flimit4
986
987 psrad xmm3, 31
988 psrad xmm4, 31
989
990 packssdw xmm3, xmm4
991 packsswb xmm3, xmm0
992
993 movq xmm1, QWORD PTR [esi+eax*8]
994
995 movq xmm2, xmm1
996 punpcklbw xmm1, xmm0
997
998 paddw xmm1, xmm5
999 mov ecx, edx
1000
1001 and ecx, 127
1002 movdqu xmm4, vp8_rv[ecx*2]
1003
1004 paddw xmm1, xmm4
1005 //paddw xmm1, eight8s
1006 psraw xmm1, 4
1007
1008 packuswb xmm1, xmm0
1009 pand xmm1, xmm3
1010
1011 pandn xmm3, xmm2
1012 por xmm1, xmm3
1013
1014 and ecx, 15
1015 movq QWORD PTR d[ecx*8], xmm1
1016
1017 mov ecx, edx
1018 sub ecx, 8
1019
1020 and ecx, 15
1021 movq mm0, d[ecx*8]
1022
1023 movq [esi], mm0
1024 lea esi, [esi+eax]
1025
1026 lea edi, [edi+eax]
1027 add edx, 1
1028
1029 cmp edx, rows
1030 jl loop_row
1031
1032 }
1033
1034 }
1035 }
1036 #if 0
1037 /****************************************************************************
1038 *
1039 * ROUTINE : plane_add_noise_wmt
1040 *
1041 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1042 * noise to
1043 * unsigned int Width width of plane
1044 * unsigned int Height height of plane
1045 * int Pitch distance between subsequent lines of frame
1046 * int q quantizer used to determine amount of noise
1047 * to add
1048 *
1049 * OUTPUTS : None.
1050 *
1051 * RETURNS : void.
1052 *
1053 * FUNCTION : adds gaussian noise to a plane of pixels
1054 *
1055 * SPECIAL NOTES : None.
1056 *
1057 ****************************************************************************/
1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1059 {
1060 unsigned int i;
1061
1062 __declspec(align(16)) unsigned char blackclamp[16];
1063 __declspec(align(16)) unsigned char whiteclamp[16];
1064 __declspec(align(16)) unsigned char bothclamp[16];
1065 char char_dist[300];
1066 char Rand[2048];
1067 double sigma;
1068 // return;
1069 __asm emms
1070 sigma = a + .5 + .6 * (63 - q) / 63.0;
1071
1072 // set up a lookup table of 256 entries that matches
1073 // a gaussian distribution with sigma determined by q.
1074 //
1075 {
1076 double i;
1077 int next, j;
1078
1079 next = 0;
1080
1081 for (i = -32; i < 32; i++)
1082 {
1083 double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
1084 int a = (int)(g + .5);
1085
1086 if (a)
1087 {
1088 for (j = 0; j < a; j++)
1089 {
1090 char_dist[next+j] = (char) i;
1091 }
1092
1093 next = next + j;
1094 }
1095
1096 }
1097
1098 for (next = next; next < 256; next++)
1099 char_dist[next] = 0;
1100
1101 }
1102
1103 for (i = 0; i < 2048; i++)
1104 {
1105 Rand[i] = char_dist[rand() & 0xff];
1106 }
1107
1108 for (i = 0; i < 16; i++)
1109 {
1110 blackclamp[i] = -char_dist[0];
1111 whiteclamp[i] = -char_dist[0];
1112 bothclamp[i] = -2 * char_dist[0];
1113 }
1114
1115 for (i = 0; i < Height; i++)
1116 {
1117 unsigned char *Pos = Start + i * Pitch;
1118 char *Ref = Rand + (rand() & 0xff);
1119
1120 __asm
1121 {
1122 mov ecx, [Width]
1123 mov esi, Pos
1124 mov edi, Ref
1125 xor eax, eax
1126
1127 nextset:
1128 movdqu xmm1, [esi+eax] // get the source
1129
1130 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
1131 paddusb xmm1, bothclamp
1132 psubusb xmm1, whiteclamp
1133
1134 movdqu xmm2, [edi+eax] // get the noise for this line
1135 paddb xmm1, xmm2 // add it in
1136 movdqu [esi+eax], xmm1 // store the result
1137
1138 add eax, 16 // move to the next line
1139
1140 cmp eax, ecx
1141 jl nextset
1142
1143
1144 }
1145
1146 }
1147 }
1148 #endif
1149 __declspec(align(16))
1150 static const int four8s[4] = { 8, 8, 8, 8};
vp8_mbpost_proc_across_ip_xmm(unsigned char * src,int pitch,int rows,int cols,int flimit)1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
1152 {
1153 int r, i;
1154 __declspec(align(16))
1155 int flimit4[4];
1156 unsigned char *s = src;
1157 int sumsq;
1158 int sum;
1159
1160
1161 flimit = vp8_q2mbl(flimit);
1162 flimit4[0] =
1163 flimit4[1] =
1164 flimit4[2] =
1165 flimit4[3] = flimit;
1166
1167 for (r = 0; r < rows; r++)
1168 {
1169
1170
1171 sumsq = 0;
1172 sum = 0;
1173
1174 for (i = -8; i <= 6; i++)
1175 {
1176 sumsq += s[i] * s[i];
1177 sum += s[i];
1178 }
1179
1180 __asm
1181 {
1182 mov eax, sumsq
1183 movd xmm7, eax
1184
1185 mov eax, sum
1186 movd xmm6, eax
1187
1188 mov esi, s
1189 xor ecx, ecx
1190
1191 mov edx, cols
1192 add edx, 8
1193 pxor mm0, mm0
1194 pxor mm1, mm1
1195
1196 pxor xmm0, xmm0
1197 nextcol4:
1198
1199 movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
1200 movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
1201
1202 punpcklbw xmm1, xmm0 // expanding
1203 punpcklbw xmm2, xmm0 // expanding
1204
1205 punpcklwd xmm1, xmm0 // expanding to dwords
1206 punpcklwd xmm2, xmm0 // expanding to dwords
1207
1208 psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
1209 paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
1210
1211 paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
1212 pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5
1213
1214 paddd xmm6, xmm2
1215 paddd xmm7, xmm1
1216
1217 pshufd xmm6, xmm6, 0 // duplicate the last ones
1218 pshufd xmm7, xmm7, 0 // duplicate the last ones
1219
1220 psrldq xmm1, 4 // 8--7 9--6 10--5 0000
1221 psrldq xmm2, 4 // 8--7 9--6 10--5 0000
1222
1223 pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
1224 pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared
1225
1226 paddd xmm6, xmm4
1227 paddd xmm7, xmm3
1228
1229 pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
1230 pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared
1231
1232 paddd xmm7, xmm3
1233 paddd xmm6, xmm4
1234
1235 pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared
1236 pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared
1237
1238 paddd xmm7, xmm3
1239 paddd xmm6, xmm4
1240
1241 movdqa xmm3, xmm6
1242 pmaddwd xmm3, xmm3
1243
1244 movdqa xmm5, xmm7
1245 pslld xmm5, 4
1246
1247 psubd xmm5, xmm7
1248 psubd xmm5, xmm3
1249
1250 psubd xmm5, flimit4
1251 psrad xmm5, 31
1252
1253 packssdw xmm5, xmm0
1254 packsswb xmm5, xmm0
1255
1256 movd xmm1, DWORD PTR [esi+ecx]
1257 movq xmm2, xmm1
1258
1259 punpcklbw xmm1, xmm0
1260 punpcklwd xmm1, xmm0
1261
1262 paddd xmm1, xmm6
1263 paddd xmm1, four8s
1264
1265 psrad xmm1, 4
1266 packssdw xmm1, xmm0
1267
1268 packuswb xmm1, xmm0
1269 pand xmm1, xmm5
1270
1271 pandn xmm5, xmm2
1272 por xmm5, xmm1
1273
1274 movd [esi+ecx-8], mm0
1275 movq mm0, mm1
1276
1277 movdq2q mm1, xmm5
1278 psrldq xmm7, 12
1279
1280 psrldq xmm6, 12
1281 add ecx, 4
1282
1283 cmp ecx, edx
1284 jl nextcol4
1285
1286 }
1287 s += pitch;
1288 }
1289 }
1290
1291 #if 0
1292
1293 /****************************************************************************
1294 *
1295 * ROUTINE : plane_add_noise_mmx
1296 *
1297 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1298 * noise to
1299 * unsigned int Width width of plane
1300 * unsigned int Height height of plane
1301 * int Pitch distance between subsequent lines of frame
1302 * int q quantizer used to determine amount of noise
1303 * to add
1304 *
1305 * OUTPUTS : None.
1306 *
1307 * RETURNS : void.
1308 *
1309 * FUNCTION : adds gaussian noise to a plane of pixels
1310 *
1311 * SPECIAL NOTES : None.
1312 *
1313 ****************************************************************************/
1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1315 {
1316 unsigned int i;
1317 int Pitch4 = Pitch * 4;
1318 const int noise_amount = 2;
1319 const int noise_adder = 2 * noise_amount + 1;
1320
1321 __declspec(align(16)) unsigned char blackclamp[16];
1322 __declspec(align(16)) unsigned char whiteclamp[16];
1323 __declspec(align(16)) unsigned char bothclamp[16];
1324
1325 char char_dist[300];
1326 char Rand[2048];
1327
1328 double sigma;
1329 __asm emms
1330 sigma = a + .5 + .6 * (63 - q) / 63.0;
1331
1332 // set up a lookup table of 256 entries that matches
1333 // a gaussian distribution with sigma determined by q.
1334 //
1335 {
1336 double i, sum = 0;
1337 int next, j;
1338
1339 next = 0;
1340
1341 for (i = -32; i < 32; i++)
1342 {
1343 int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
1344
1345 if (a)
1346 {
1347 for (j = 0; j < a; j++)
1348 {
1349 char_dist[next+j] = (char) i;
1350 }
1351
1352 next = next + j;
1353 }
1354
1355 }
1356
1357 for (next = next; next < 256; next++)
1358 char_dist[next] = 0;
1359
1360 }
1361
1362 for (i = 0; i < 2048; i++)
1363 {
1364 Rand[i] = char_dist[rand() & 0xff];
1365 }
1366
1367 for (i = 0; i < 16; i++)
1368 {
1369 blackclamp[i] = -char_dist[0];
1370 whiteclamp[i] = -char_dist[0];
1371 bothclamp[i] = -2 * char_dist[0];
1372 }
1373
1374 for (i = 0; i < Height; i++)
1375 {
1376 unsigned char *Pos = Start + i * Pitch;
1377 char *Ref = Rand + (rand() & 0xff);
1378
1379 __asm
1380 {
1381 mov ecx, [Width]
1382 mov esi, Pos
1383 mov edi, Ref
1384 xor eax, eax
1385
1386 nextset:
1387 movq mm1, [esi+eax] // get the source
1388
1389 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
1390 paddusb mm1, bothclamp
1391 psubusb mm1, whiteclamp
1392
1393 movq mm2, [edi+eax] // get the noise for this line
1394 paddb mm1, mm2 // add it in
1395 movq [esi+eax], mm1 // store the result
1396
1397 add eax, 8 // move to the next line
1398
1399 cmp eax, ecx
1400 jl nextset
1401
1402
1403 }
1404
1405 }
1406 }
1407 #else
1408 extern char an[8][64][3072];
1409 extern int cd[8][64];
1410
vp8_plane_add_noise_mmx(unsigned char * Start,unsigned int Width,unsigned int Height,int Pitch,int q,int a)1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1412 {
1413 unsigned int i;
1414 __declspec(align(16)) unsigned char blackclamp[16];
1415 __declspec(align(16)) unsigned char whiteclamp[16];
1416 __declspec(align(16)) unsigned char bothclamp[16];
1417
1418
1419 __asm emms
1420
1421 for (i = 0; i < 16; i++)
1422 {
1423 blackclamp[i] = -cd[a][q];
1424 whiteclamp[i] = -cd[a][q];
1425 bothclamp[i] = -2 * cd[a][q];
1426 }
1427
1428 for (i = 0; i < Height; i++)
1429 {
1430 unsigned char *Pos = Start + i * Pitch;
1431 char *Ref = an[a][q] + (rand() & 0xff);
1432
1433 __asm
1434 {
1435 mov ecx, [Width]
1436 mov esi, Pos
1437 mov edi, Ref
1438 xor eax, eax
1439
1440 nextset:
1441 movq mm1, [esi+eax] // get the source
1442
1443 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
1444 paddusb mm1, bothclamp
1445 psubusb mm1, whiteclamp
1446
1447 movq mm2, [edi+eax] // get the noise for this line
1448 paddb mm1, mm2 // add it in
1449 movq [esi+eax], mm1 // store the result
1450
1451 add eax, 8 // move to the next line
1452
1453 cmp eax, ecx
1454 jl nextset
1455 }
1456 }
1457 }
1458
1459
vp8_plane_add_noise_wmt(unsigned char * Start,unsigned int Width,unsigned int Height,int Pitch,int q,int a)1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1461 {
1462 unsigned int i;
1463
1464 __declspec(align(16)) unsigned char blackclamp[16];
1465 __declspec(align(16)) unsigned char whiteclamp[16];
1466 __declspec(align(16)) unsigned char bothclamp[16];
1467
1468 __asm emms
1469
1470 for (i = 0; i < 16; i++)
1471 {
1472 blackclamp[i] = -cd[a][q];
1473 whiteclamp[i] = -cd[a][q];
1474 bothclamp[i] = -2 * cd[a][q];
1475 }
1476
1477 for (i = 0; i < Height; i++)
1478 {
1479 unsigned char *Pos = Start + i * Pitch;
1480 char *Ref = an[a][q] + (rand() & 0xff);
1481
1482 __asm
1483 {
1484 mov ecx, [Width]
1485 mov esi, Pos
1486 mov edi, Ref
1487 xor eax, eax
1488
1489 nextset:
1490 movdqu xmm1, [esi+eax] // get the source
1491
1492 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
1493 paddusb xmm1, bothclamp
1494 psubusb xmm1, whiteclamp
1495
1496 movdqu xmm2, [edi+eax] // get the noise for this line
1497 paddb xmm1, xmm2 // add it in
1498 movdqu [esi+eax], xmm1 // store the result
1499
1500 add eax, 16 // move to the next line
1501
1502 cmp eax, ecx
1503 jl nextset
1504 }
1505 }
1506 }
1507
1508 #endif
1509