• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 #include <math.h>
13 #include <stdlib.h>
14 #include "vpx_scale/yv12config.h"
15 #include "pragmas.h"
16 
17 #define VP8_FILTER_WEIGHT 128
18 #define VP8_FILTER_SHIFT  7
19 
20 
21 
22 /* static constants */
23 __declspec(align(16))
24 const static short  Blur[48] =
25 {
26 
27     16, 16, 16, 16, 16, 16, 16, 16,
28     16, 16, 16, 16, 16, 16, 16, 16,
29     64, 64, 64, 64, 64, 64, 64, 64,
30     16, 16, 16, 16, 16, 16, 16, 16,
31     16, 16, 16, 16, 16, 16, 16, 16,
32     0,  0,  0,  0,  0,  0,  0,  0,
33 
34 };
35 #define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
37 
38 #ifndef RELOCATEABLE
39 const static RD;
40 const static R4D2;
41 #endif
42 
43 
44 /* external references */
45 extern double vp8_gaussian(double sigma, double mu, double x);
46 extern short vp8_rv[];
47 extern int vp8_q2mbl(int x) ;
48 
49 
50 
vp8_post_proc_down_and_across_mmx(unsigned char * src_ptr,unsigned char * dst_ptr,int src_pixels_per_line,int dst_pixels_per_line,int rows,int cols,int flimit)51 void vp8_post_proc_down_and_across_mmx
52 (
53     unsigned char *src_ptr,
54     unsigned char *dst_ptr,
55     int src_pixels_per_line,
56     int dst_pixels_per_line,
57     int rows,
58     int cols,
59     int flimit
60 )
61 {
62 #ifdef RELOCATEABLE
63     RD
64     R4D2
65 #endif
66 
67     __asm
68     {
69         push        ebx
70         lea         ebx, Blur
71         movd        mm2, flimit
72         punpcklwd   mm2, mm2
73         punpckldq   mm2, mm2
74 
75         mov         esi,        src_ptr
76         mov         edi,        dst_ptr
77 
78         mov         ecx, DWORD PTR rows
79         mov         eax, src_pixels_per_line ;
80         destination pitch?
81         pxor        mm0, mm0              ;
82         mm0 = 00000000
83 
84         nextrow:
85 
86         xor         edx,        edx       ;
87 
88         clear out edx for use as loop counter
89         nextcol:
90 
91         pxor        mm7, mm7              ;
92 
93     mm7 = 00000000
94     movq        mm6, [ebx + 32 ]      ;
95         mm6 = kernel 2 taps
96         movq        mm3, [esi]            ;
97         mm4 = r0 p0..p7
98         punpcklbw   mm3, mm0              ;
99         mm3 = p0..p3
100         movq        mm1, mm3              ;
101         mm1 = p0..p3
102         pmullw      mm3, mm6              ;
103         mm3 *= kernel 2 modifiers
104 
105         movq        mm6, [ebx + 48]       ;
106         mm6 = kernel 3 taps
107         movq        mm5, [esi + eax]      ;
108         mm4 = r1 p0..p7
109         punpcklbw   mm5, mm0              ;
110         mm5 = r1 p0..p3
111         pmullw      mm6, mm5              ;
112         mm6 *= p0..p3 * kernel 3 modifiers
113         paddusw     mm3, mm6              ;
114         mm3 += mm6
115 
116         ;
117         thresholding
118         movq        mm7, mm1              ;
119         mm7 = r0 p0..p3
120         psubusw     mm7, mm5              ;
121         mm7 = r0 p0..p3 - r1 p0..p3
122         psubusw     mm5, mm1              ;
123         mm5 = r1 p0..p3 - r0 p0..p3
124         paddusw     mm7, mm5              ;
125         mm7 = abs(r0 p0..p3 - r1 p0..p3)
126         pcmpgtw     mm7, mm2
127 
128         movq        mm6, [ebx + 64 ]      ;
129         mm6 = kernel 4 modifiers
130         movq        mm5, [esi + 2*eax]    ;
131         mm4 = r2 p0..p7
132         punpcklbw   mm5, mm0              ;
133         mm5 = r2 p0..p3
134         pmullw      mm6, mm5              ;
135         mm5 *= kernel 4 modifiers
136         paddusw     mm3, mm6              ;
137         mm3 += mm5
138 
139         ;
140         thresholding
141         movq        mm6, mm1              ;
142         mm6 = r0 p0..p3
143         psubusw     mm6, mm5              ;
144         mm6 = r0 p0..p3 - r2 p0..p3
145         psubusw     mm5, mm1              ;
146         mm5 = r2 p0..p3 - r2 p0..p3
147         paddusw     mm6, mm5              ;
148         mm6 = abs(r0 p0..p3 - r2 p0..p3)
149         pcmpgtw     mm6, mm2
150         por         mm7, mm6              ;
151         accumulate thresholds
152 
153 
154         neg         eax
155         movq        mm6, [ebx ]           ;
156         kernel 0 taps
157         movq        mm5, [esi+2*eax]      ;
158         mm4 = r-2 p0..p7
159         punpcklbw   mm5, mm0              ;
160         mm5 = r-2 p0..p3
161         pmullw      mm6, mm5              ;
162         mm5 *= kernel 0 modifiers
163         paddusw     mm3, mm6              ;
164         mm3 += mm5
165 
166         ;
167         thresholding
168         movq        mm6, mm1              ;
169         mm6 = r0 p0..p3
170         psubusw     mm6, mm5              ;
171         mm6 = p0..p3 - r-2 p0..p3
172         psubusw     mm5, mm1              ;
173         mm5 = r-2 p0..p3 - p0..p3
174         paddusw     mm6, mm5              ;
175         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
176         pcmpgtw     mm6, mm2
177         por         mm7, mm6              ;
178         accumulate thresholds
179 
180         movq        mm6, [ebx + 16]       ;
181         kernel 1 taps
182         movq        mm4, [esi+eax]        ;
183         mm4 = r-1 p0..p7
184         punpcklbw   mm4, mm0              ;
185         mm4 = r-1 p0..p3
186         pmullw      mm6, mm4              ;
187         mm4 *= kernel 1 modifiers.
188         paddusw     mm3, mm6              ;
189         mm3 += mm5
190 
191         ;
192         thresholding
193         movq        mm6, mm1              ;
194         mm6 = r0 p0..p3
195         psubusw     mm6, mm4              ;
196         mm6 = p0..p3 - r-2 p0..p3
197         psubusw     mm4, mm1              ;
198         mm5 = r-1 p0..p3 - p0..p3
199         paddusw     mm6, mm4              ;
200         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
201         pcmpgtw     mm6, mm2
202         por         mm7, mm6              ;
203         accumulate thresholds
204 
205 
206         paddusw     mm3, rd               ;
207         mm3 += round value
208         psraw       mm3, VP8_FILTER_SHIFT     ;
209         mm3 /= 128
210 
211         pand        mm1, mm7              ;
212         mm1 select vals > thresh from source
213         pandn       mm7, mm3              ;
214         mm7 select vals < thresh from blurred result
215         paddusw     mm1, mm7              ;
216         combination
217 
218         packuswb    mm1, mm0              ;
219         pack to bytes
220 
221         movd        [edi], mm1            ;
222         neg         eax                   ;
223         pitch is positive
224 
225 
226         add         esi, 4
227         add         edi, 4
228         add         edx, 4
229 
230         cmp         edx, cols
231         jl          nextcol
232         // done with the all cols, start the across filtering in place
233         sub         esi, edx
234         sub         edi, edx
235 
236 
237         push        eax
238         xor         edx,    edx
239         mov         eax,    [edi-4];
240 
241         acrossnextcol:
242         pxor        mm7, mm7              ;
243         mm7 = 00000000
244         movq        mm6, [ebx + 32 ]      ;
245         movq        mm4, [edi+edx]        ;
246         mm4 = p0..p7
247         movq        mm3, mm4              ;
248         mm3 = p0..p7
249         punpcklbw   mm3, mm0              ;
250         mm3 = p0..p3
251         movq        mm1, mm3              ;
252         mm1 = p0..p3
253         pmullw      mm3, mm6              ;
254         mm3 *= kernel 2 modifiers
255 
256         movq        mm6, [ebx + 48]
257         psrlq       mm4, 8                ;
258         mm4 = p1..p7
259         movq        mm5, mm4              ;
260         mm5 = p1..p7
261         punpcklbw   mm5, mm0              ;
262         mm5 = p1..p4
263         pmullw      mm6, mm5              ;
264         mm6 *= p1..p4 * kernel 3 modifiers
265         paddusw     mm3, mm6              ;
266         mm3 += mm6
267 
268         ;
269         thresholding
270         movq        mm7, mm1              ;
271         mm7 = p0..p3
272         psubusw     mm7, mm5              ;
273         mm7 = p0..p3 - p1..p4
274         psubusw     mm5, mm1              ;
275         mm5 = p1..p4 - p0..p3
276         paddusw     mm7, mm5              ;
277         mm7 = abs(p0..p3 - p1..p4)
278         pcmpgtw     mm7, mm2
279 
280         movq        mm6, [ebx + 64 ]
281         psrlq       mm4, 8                ;
282         mm4 = p2..p7
283         movq        mm5, mm4              ;
284         mm5 = p2..p7
285         punpcklbw   mm5, mm0              ;
286         mm5 = p2..p5
287         pmullw      mm6, mm5              ;
288         mm5 *= kernel 4 modifiers
289         paddusw     mm3, mm6              ;
290         mm3 += mm5
291 
292         ;
293         thresholding
294         movq        mm6, mm1              ;
295         mm6 = p0..p3
296         psubusw     mm6, mm5              ;
297         mm6 = p0..p3 - p1..p4
298         psubusw     mm5, mm1              ;
299         mm5 = p1..p4 - p0..p3
300         paddusw     mm6, mm5              ;
301         mm6 = abs(p0..p3 - p1..p4)
302         pcmpgtw     mm6, mm2
303         por         mm7, mm6              ;
304         accumulate thresholds
305 
306 
307         movq        mm6, [ebx ]
308         movq        mm4, [edi+edx-2]      ;
309         mm4 = p-2..p5
310         movq        mm5, mm4              ;
311         mm5 = p-2..p5
312         punpcklbw   mm5, mm0              ;
313         mm5 = p-2..p1
314         pmullw      mm6, mm5              ;
315         mm5 *= kernel 0 modifiers
316         paddusw     mm3, mm6              ;
317         mm3 += mm5
318 
319         ;
320         thresholding
321         movq        mm6, mm1              ;
322         mm6 = p0..p3
323         psubusw     mm6, mm5              ;
324         mm6 = p0..p3 - p1..p4
325         psubusw     mm5, mm1              ;
326         mm5 = p1..p4 - p0..p3
327         paddusw     mm6, mm5              ;
328         mm6 = abs(p0..p3 - p1..p4)
329         pcmpgtw     mm6, mm2
330         por         mm7, mm6              ;
331         accumulate thresholds
332 
333         movq        mm6, [ebx + 16]
334         psrlq       mm4, 8                ;
335         mm4 = p-1..p5
336         punpcklbw   mm4, mm0              ;
337         mm4 = p-1..p2
338         pmullw      mm6, mm4              ;
339         mm4 *= kernel 1 modifiers.
340         paddusw     mm3, mm6              ;
341         mm3 += mm5
342 
343         ;
344         thresholding
345         movq        mm6, mm1              ;
346         mm6 = p0..p3
347         psubusw     mm6, mm4              ;
348         mm6 = p0..p3 - p1..p4
349         psubusw     mm4, mm1              ;
350         mm5 = p1..p4 - p0..p3
351         paddusw     mm6, mm4              ;
352         mm6 = abs(p0..p3 - p1..p4)
353         pcmpgtw     mm6, mm2
354         por         mm7, mm6              ;
355         accumulate thresholds
356 
357         paddusw     mm3, rd               ;
358         mm3 += round value
359         psraw       mm3, VP8_FILTER_SHIFT     ;
360         mm3 /= 128
361 
362         pand        mm1, mm7              ;
363         mm1 select vals > thresh from source
364         pandn       mm7, mm3              ;
365         mm7 select vals < thresh from blurred result
366         paddusw     mm1, mm7              ;
367         combination
368 
369         packuswb    mm1, mm0              ;
370         pack to bytes
371         mov         DWORD PTR [edi+edx-4],  eax   ;
372         store previous four bytes
373         movd        eax,    mm1
374 
375         add         edx, 4
376         cmp         edx, cols
377         jl          acrossnextcol;
378 
379         mov         DWORD PTR [edi+edx-4],  eax
380         pop         eax
381 
382         // done with this rwo
383         add         esi, eax               ;
384         next line
385         mov         eax, dst_pixels_per_line ;
386         destination pitch?
387         add         edi, eax               ;
388         next destination
389         mov         eax, src_pixels_per_line ;
390         destination pitch?
391 
392         dec         ecx                   ;
393         decrement count
394         jnz         nextrow               ;
395         next row
396         pop         ebx
397 
398     }
399 }
400 
401 
402 
vp8_post_proc_down_and_across_xmm(unsigned char * src_ptr,unsigned char * dst_ptr,int src_pixels_per_line,int dst_pixels_per_line,int rows,int cols,int flimit)403 void vp8_post_proc_down_and_across_xmm
404 (
405     unsigned char *src_ptr,
406     unsigned char *dst_ptr,
407     int src_pixels_per_line,
408     int dst_pixels_per_line,
409     int rows,
410     int cols,
411     int flimit
412 )
413 {
414 #ifdef RELOCATEABLE
415     R4D2
416 #endif
417 
418     __asm
419     {
420         movd        xmm2,       flimit
421         punpcklwd   xmm2,       xmm2
422         punpckldq   xmm2,       xmm2
423         punpcklqdq  xmm2,       xmm2
424 
425         mov         esi,        src_ptr
426         mov         edi,        dst_ptr
427 
428         mov         ecx,        DWORD PTR rows
429         mov         eax,        src_pixels_per_line ;
430         destination pitch?
431         pxor        xmm0,       xmm0              ;
432         mm0 = 00000000
433 
434         nextrow:
435 
436         xor         edx,        edx       ;
437 
438         clear out edx for use as loop counter
439         nextcol:
440         movq        xmm3,       QWORD PTR [esi]         ;
441 
442         mm4 = r0 p0..p7
443         punpcklbw   xmm3,       xmm0                    ;
444         mm3 = p0..p3
445         movdqa      xmm1,       xmm3                    ;
446         mm1 = p0..p3
447         psllw       xmm3,       2                       ;
448 
449         movq        xmm5,       QWORD PTR [esi + eax]   ;
450         mm4 = r1 p0..p7
451         punpcklbw   xmm5,       xmm0                    ;
452         mm5 = r1 p0..p3
453         paddusw     xmm3,       xmm5                    ;
454         mm3 += mm6
455 
456         ;
457         thresholding
458         movdqa      xmm7,       xmm1                    ;
459         mm7 = r0 p0..p3
460         psubusw     xmm7,       xmm5                    ;
461         mm7 = r0 p0..p3 - r1 p0..p3
462         psubusw     xmm5,       xmm1                    ;
463         mm5 = r1 p0..p3 - r0 p0..p3
464         paddusw     xmm7,       xmm5                    ;
465         mm7 = abs(r0 p0..p3 - r1 p0..p3)
466         pcmpgtw     xmm7,       xmm2
467 
468         movq        xmm5,       QWORD PTR [esi + 2*eax] ;
469         mm4 = r2 p0..p7
470         punpcklbw   xmm5,       xmm0                    ;
471         mm5 = r2 p0..p3
472         paddusw     xmm3,       xmm5                    ;
473         mm3 += mm5
474 
475         ;
476         thresholding
477         movdqa      xmm6,       xmm1                    ;
478         mm6 = r0 p0..p3
479         psubusw     xmm6,       xmm5                    ;
480         mm6 = r0 p0..p3 - r2 p0..p3
481         psubusw     xmm5,       xmm1                    ;
482         mm5 = r2 p0..p3 - r2 p0..p3
483         paddusw     xmm6,       xmm5                    ;
484         mm6 = abs(r0 p0..p3 - r2 p0..p3)
485         pcmpgtw     xmm6,       xmm2
486         por         xmm7,       xmm6                    ;
487         accumulate thresholds
488 
489 
490         neg         eax
491         movq        xmm5,       QWORD PTR [esi+2*eax]   ;
492         mm4 = r-2 p0..p7
493         punpcklbw   xmm5,       xmm0                    ;
494         mm5 = r-2 p0..p3
495         paddusw     xmm3,       xmm5                    ;
496         mm3 += mm5
497 
498         ;
499         thresholding
500         movdqa      xmm6,       xmm1                    ;
501         mm6 = r0 p0..p3
502         psubusw     xmm6,       xmm5                    ;
503         mm6 = p0..p3 - r-2 p0..p3
504         psubusw     xmm5,       xmm1                    ;
505         mm5 = r-2 p0..p3 - p0..p3
506         paddusw     xmm6,       xmm5                    ;
507         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
508         pcmpgtw     xmm6,       xmm2
509         por         xmm7,       xmm6                    ;
510         accumulate thresholds
511 
512         movq        xmm4,       QWORD PTR [esi+eax]     ;
513         mm4 = r-1 p0..p7
514         punpcklbw   xmm4,       xmm0                    ;
515         mm4 = r-1 p0..p3
516         paddusw     xmm3,       xmm4                    ;
517         mm3 += mm5
518 
519         ;
520         thresholding
521         movdqa      xmm6,       xmm1                    ;
522         mm6 = r0 p0..p3
523         psubusw     xmm6,       xmm4                    ;
524         mm6 = p0..p3 - r-2 p0..p3
525         psubusw     xmm4,       xmm1                    ;
526         mm5 = r-1 p0..p3 - p0..p3
527         paddusw     xmm6,       xmm4                    ;
528         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
529         pcmpgtw     xmm6,       xmm2
530         por         xmm7,       xmm6                    ;
531         accumulate thresholds
532 
533 
534         paddusw     xmm3,       rd42                    ;
535         mm3 += round value
536         psraw       xmm3,       3                       ;
537         mm3 /= 8
538 
539         pand        xmm1,       xmm7                    ;
540         mm1 select vals > thresh from source
541         pandn       xmm7,       xmm3                    ;
542         mm7 select vals < thresh from blurred result
543         paddusw     xmm1,       xmm7                    ;
544         combination
545 
546         packuswb    xmm1,       xmm0                    ;
547         pack to bytes
548         movq        QWORD PTR [edi], xmm1             ;
549 
550         neg         eax                   ;
551         pitch is positive
552         add         esi,        8
553         add         edi,        8
554 
555         add         edx,        8
556         cmp         edx,        cols
557 
558         jl          nextcol
559 
560         // done with the all cols, start the across filtering in place
561         sub         esi,        edx
562         sub         edi,        edx
563 
564         xor         edx,        edx
565         movq        mm0,        QWORD PTR [edi-8];
566 
567         acrossnextcol:
568         movq        xmm7,       QWORD PTR [edi +edx -2]
569         movd        xmm4,       DWORD PTR [edi +edx +6]
570 
571         pslldq      xmm4,       8
572         por         xmm4,       xmm7
573 
574         movdqa      xmm3,       xmm4
575         psrldq      xmm3,       2
576         punpcklbw   xmm3,       xmm0              ;
577         mm3 = p0..p3
578         movdqa      xmm1,       xmm3              ;
579         mm1 = p0..p3
580         psllw       xmm3,       2
581 
582 
583         movdqa      xmm5,       xmm4
584         psrldq      xmm5,       3
585         punpcklbw   xmm5,       xmm0              ;
586         mm5 = p1..p4
587         paddusw     xmm3,       xmm5              ;
588         mm3 += mm6
589 
590         ;
591         thresholding
592         movdqa      xmm7,       xmm1              ;
593         mm7 = p0..p3
594         psubusw     xmm7,       xmm5              ;
595         mm7 = p0..p3 - p1..p4
596         psubusw     xmm5,       xmm1              ;
597         mm5 = p1..p4 - p0..p3
598         paddusw     xmm7,       xmm5              ;
599         mm7 = abs(p0..p3 - p1..p4)
600         pcmpgtw     xmm7,       xmm2
601 
602         movdqa      xmm5,       xmm4
603         psrldq      xmm5,       4
604         punpcklbw   xmm5,       xmm0              ;
605         mm5 = p2..p5
606         paddusw     xmm3,       xmm5              ;
607         mm3 += mm5
608 
609         ;
610         thresholding
611         movdqa      xmm6,       xmm1              ;
612         mm6 = p0..p3
613         psubusw     xmm6,       xmm5              ;
614         mm6 = p0..p3 - p1..p4
615         psubusw     xmm5,       xmm1              ;
616         mm5 = p1..p4 - p0..p3
617         paddusw     xmm6,       xmm5              ;
618         mm6 = abs(p0..p3 - p1..p4)
619         pcmpgtw     xmm6,       xmm2
620         por         xmm7,       xmm6              ;
621         accumulate thresholds
622 
623 
624         movdqa      xmm5,       xmm4              ;
625         mm5 = p-2..p5
626         punpcklbw   xmm5,       xmm0              ;
627         mm5 = p-2..p1
628         paddusw     xmm3,       xmm5              ;
629         mm3 += mm5
630 
631         ;
632         thresholding
633         movdqa      xmm6,       xmm1              ;
634         mm6 = p0..p3
635         psubusw     xmm6,       xmm5              ;
636         mm6 = p0..p3 - p1..p4
637         psubusw     xmm5,       xmm1              ;
638         mm5 = p1..p4 - p0..p3
639         paddusw     xmm6,       xmm5              ;
640         mm6 = abs(p0..p3 - p1..p4)
641         pcmpgtw     xmm6,       xmm2
642         por         xmm7,       xmm6              ;
643         accumulate thresholds
644 
645         psrldq      xmm4,       1                   ;
646         mm4 = p-1..p5
647         punpcklbw   xmm4,       xmm0              ;
648         mm4 = p-1..p2
649         paddusw     xmm3,       xmm4              ;
650         mm3 += mm5
651 
652         ;
653         thresholding
654         movdqa      xmm6,       xmm1              ;
655         mm6 = p0..p3
656         psubusw     xmm6,       xmm4              ;
657         mm6 = p0..p3 - p1..p4
658         psubusw     xmm4,       xmm1              ;
659         mm5 = p1..p4 - p0..p3
660         paddusw     xmm6,       xmm4              ;
661         mm6 = abs(p0..p3 - p1..p4)
662         pcmpgtw     xmm6,       xmm2
663         por         xmm7,       xmm6              ;
664         accumulate thresholds
665 
666         paddusw     xmm3,       rd42              ;
667         mm3 += round value
668         psraw       xmm3,       3                 ;
669         mm3 /= 8
670 
671         pand        xmm1,       xmm7              ;
672         mm1 select vals > thresh from source
673         pandn       xmm7,       xmm3              ;
674         mm7 select vals < thresh from blurred result
675         paddusw     xmm1,       xmm7              ;
676         combination
677 
678         packuswb    xmm1,       xmm0              ;
679         pack to bytes
680         movq        QWORD PTR [edi+edx-8],  mm0   ;
681         store previous four bytes
682         movdq2q     mm0,        xmm1
683 
684         add         edx,        8
685         cmp         edx,        cols
686         jl          acrossnextcol;
687 
688         // last 8 pixels
689         movq        QWORD PTR [edi+edx-8],  mm0
690 
691         // done with this rwo
692         add         esi, eax               ;
693         next line
694         mov         eax, dst_pixels_per_line ;
695         destination pitch?
696         add         edi, eax               ;
697         next destination
698         mov         eax, src_pixels_per_line ;
699         destination pitch?
700 
701         dec         ecx                   ;
702         decrement count
703         jnz         nextrow               ;
704         next row
705     }
706 }
707 
708 
vp8_mbpost_proc_down_mmx(unsigned char * dst,int pitch,int rows,int cols,int flimit)709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
710 {
711     int c, i;
712     __declspec(align(16))
713     int flimit2[2];
714     __declspec(align(16))
715     unsigned char d[16][8];
716 
717     flimit = vp8_q2mbl(flimit);
718 
719     for (i = 0; i < 2; i++)
720         flimit2[i] = flimit;
721 
722     rows += 8;
723 
724     for (c = 0; c < cols; c += 4)
725     {
726         unsigned char *s = &dst[c];
727 
728         __asm
729         {
730             mov         esi,        s           ;
731             pxor        mm0,        mm0     ;
732 
733             mov         eax,        pitch       ;
734             neg         eax                                     // eax = -pitch
735 
736             lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
737             neg         eax
738 
739 
740             pxor        mm5,        mm5
741             pxor        mm6,        mm6     ;
742 
743             pxor        mm7,        mm7     ;
744             mov         edi,        esi
745 
746             mov         ecx,        15          ;
747 
748             loop_initvar:
749             movd        mm1,        DWORD PTR [edi];
750             punpcklbw   mm1,        mm0     ;
751 
752             paddw       mm5,        mm1     ;
753             pmullw      mm1,        mm1     ;
754 
755             movq        mm2,        mm1     ;
756             punpcklwd   mm1,        mm0     ;
757 
758             punpckhwd   mm2,        mm0     ;
759             paddd       mm6,        mm1     ;
760 
761             paddd       mm7,        mm2     ;
762             lea         edi,        [edi+eax]   ;
763 
764             dec         ecx
765             jne         loop_initvar
766             //save the var and sum
767             xor         edx,        edx
768             loop_row:
769             movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
770             movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]
771 
772             punpcklbw   mm1,        mm0
773             punpcklbw   mm2,        mm0
774 
775             paddw       mm5,        mm2
776             psubw       mm5,        mm1
777 
778             pmullw      mm2,        mm2
779             movq        mm4,        mm2
780 
781             punpcklwd   mm2,        mm0
782             punpckhwd   mm4,        mm0
783 
784             paddd       mm6,        mm2
785             paddd       mm7,        mm4
786 
787             pmullw      mm1,        mm1
788             movq        mm2,        mm1
789 
790             punpcklwd   mm1,        mm0
791             psubd       mm6,        mm1
792 
793             punpckhwd   mm2,        mm0
794             psubd       mm7,        mm2
795 
796 
797             movq        mm3,        mm6
798             pslld       mm3,        4
799 
800             psubd       mm3,        mm6
801             movq        mm1,        mm5
802 
803             movq        mm4,        mm5
804             pmullw      mm1,        mm1
805 
806             pmulhw      mm4,        mm4
807             movq        mm2,        mm1
808 
809             punpcklwd   mm1,        mm4
810             punpckhwd   mm2,        mm4
811 
812             movq        mm4,        mm7
813             pslld       mm4,        4
814 
815             psubd       mm4,        mm7
816 
817             psubd       mm3,        mm1
818             psubd       mm4,        mm2
819 
820             psubd       mm3,        flimit2
821             psubd       mm4,        flimit2
822 
823             psrad       mm3,        31
824             psrad       mm4,        31
825 
826             packssdw    mm3,        mm4
827             packsswb    mm3,        mm0
828 
829             movd        mm1,        DWORD PTR [esi+eax*8]
830 
831             movq        mm2,        mm1
832             punpcklbw   mm1,        mm0
833 
834             paddw       mm1,        mm5
835             mov         ecx,        edx
836 
837             and         ecx,        127
838             movq        mm4,        vp8_rv[ecx*2]
839 
840             paddw       mm1,        mm4
841             //paddw     xmm1,       eight8s
842             psraw       mm1,        4
843 
844             packuswb    mm1,        mm0
845             pand        mm1,        mm3
846 
847             pandn       mm3,        mm2
848             por         mm1,        mm3
849 
850             and         ecx,        15
851             movd        DWORD PTR  d[ecx*4], mm1
852 
853             mov         ecx,        edx
854             sub         ecx,        8
855 
856             and         ecx,        15
857             movd        mm1,        DWORD PTR d[ecx*4]
858 
859             movd        [esi],      mm1
860             lea         esi,        [esi+eax]
861 
862             lea         edi,        [edi+eax]
863             add         edx,        1
864 
865             cmp         edx,        rows
866             jl          loop_row
867 
868         }
869 
870     }
871 }
872 
vp8_mbpost_proc_down_xmm(unsigned char * dst,int pitch,int rows,int cols,int flimit)873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
874 {
875     int c, i;
876     __declspec(align(16))
877     int flimit4[4];
878     __declspec(align(16))
879     unsigned char d[16][8];
880 
881     flimit = vp8_q2mbl(flimit);
882 
883     for (i = 0; i < 4; i++)
884         flimit4[i] = flimit;
885 
886     rows += 8;
887 
888     for (c = 0; c < cols; c += 8)
889     {
890         unsigned char *s = &dst[c];
891 
892         __asm
893         {
894             mov         esi,        s           ;
895             pxor        xmm0,       xmm0        ;
896 
897             mov         eax,        pitch       ;
898             neg         eax                                     // eax = -pitch
899 
900             lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
901             neg         eax
902 
903 
904             pxor        xmm5,       xmm5
905             pxor        xmm6,       xmm6        ;
906 
907             pxor        xmm7,       xmm7        ;
908             mov         edi,        esi
909 
910             mov         ecx,        15          ;
911 
912             loop_initvar:
913             movq        xmm1,       QWORD PTR [edi];
914             punpcklbw   xmm1,       xmm0        ;
915 
916             paddw       xmm5,       xmm1        ;
917             pmullw      xmm1,       xmm1        ;
918 
919             movdqa      xmm2,       xmm1        ;
920             punpcklwd   xmm1,       xmm0        ;
921 
922             punpckhwd   xmm2,       xmm0        ;
923             paddd       xmm6,       xmm1        ;
924 
925             paddd       xmm7,       xmm2        ;
926             lea         edi,        [edi+eax]   ;
927 
928             dec         ecx
929             jne         loop_initvar
930             //save the var and sum
931             xor         edx,        edx
932             loop_row:
933             movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
934             movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]
935 
936             punpcklbw   xmm1,       xmm0
937             punpcklbw   xmm2,       xmm0
938 
939             paddw       xmm5,       xmm2
940             psubw       xmm5,       xmm1
941 
942             pmullw      xmm2,       xmm2
943             movdqa      xmm4,       xmm2
944 
945             punpcklwd   xmm2,       xmm0
946             punpckhwd   xmm4,       xmm0
947 
948             paddd       xmm6,       xmm2
949             paddd       xmm7,       xmm4
950 
951             pmullw      xmm1,       xmm1
952             movdqa      xmm2,       xmm1
953 
954             punpcklwd   xmm1,       xmm0
955             psubd       xmm6,       xmm1
956 
957             punpckhwd   xmm2,       xmm0
958             psubd       xmm7,       xmm2
959 
960 
961             movdqa      xmm3,       xmm6
962             pslld       xmm3,       4
963 
964             psubd       xmm3,       xmm6
965             movdqa      xmm1,       xmm5
966 
967             movdqa      xmm4,       xmm5
968             pmullw      xmm1,       xmm1
969 
970             pmulhw      xmm4,       xmm4
971             movdqa      xmm2,       xmm1
972 
973             punpcklwd   xmm1,       xmm4
974             punpckhwd   xmm2,       xmm4
975 
976             movdqa      xmm4,       xmm7
977             pslld       xmm4,       4
978 
979             psubd       xmm4,       xmm7
980 
981             psubd       xmm3,       xmm1
982             psubd       xmm4,       xmm2
983 
984             psubd       xmm3,       flimit4
985             psubd       xmm4,       flimit4
986 
987             psrad       xmm3,       31
988             psrad       xmm4,       31
989 
990             packssdw    xmm3,       xmm4
991             packsswb    xmm3,       xmm0
992 
993             movq        xmm1,       QWORD PTR [esi+eax*8]
994 
995             movq        xmm2,       xmm1
996             punpcklbw   xmm1,       xmm0
997 
998             paddw       xmm1,       xmm5
999             mov         ecx,        edx
1000 
1001             and         ecx,        127
1002             movdqu      xmm4,       vp8_rv[ecx*2]
1003 
1004             paddw       xmm1,       xmm4
1005             //paddw     xmm1,       eight8s
1006             psraw       xmm1,       4
1007 
1008             packuswb    xmm1,       xmm0
1009             pand        xmm1,       xmm3
1010 
1011             pandn       xmm3,       xmm2
1012             por         xmm1,       xmm3
1013 
1014             and         ecx,        15
1015             movq        QWORD PTR  d[ecx*8], xmm1
1016 
1017             mov         ecx,        edx
1018             sub         ecx,        8
1019 
1020             and         ecx,        15
1021             movq        mm0,        d[ecx*8]
1022 
1023             movq        [esi],      mm0
1024             lea         esi,        [esi+eax]
1025 
1026             lea         edi,        [edi+eax]
1027             add         edx,        1
1028 
1029             cmp         edx,        rows
1030             jl          loop_row
1031 
1032         }
1033 
1034     }
1035 }
1036 #if 0
1037 /****************************************************************************
1038  *
1039  *  ROUTINE       : plane_add_noise_wmt
1040  *
1041  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1042  *                                  noise to
1043  *                  unsigned int Width    width of plane
1044  *                  unsigned int Height   height of plane
1045  *                  int  Pitch    distance between subsequent lines of frame
1046  *                  int  q        quantizer used to determine amount of noise
1047  *                                  to add
1048  *
1049  *  OUTPUTS       : None.
1050  *
1051  *  RETURNS       : void.
1052  *
1053  *  FUNCTION      : adds gaussian noise to a plane of pixels
1054  *
1055  *  SPECIAL NOTES : None.
1056  *
1057  ****************************************************************************/
1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1059 {
1060     unsigned int i;
1061 
1062     __declspec(align(16)) unsigned char blackclamp[16];
1063     __declspec(align(16)) unsigned char whiteclamp[16];
1064     __declspec(align(16)) unsigned char bothclamp[16];
1065     char char_dist[300];
1066     char Rand[2048];
1067     double sigma;
1068 //    return;
1069     __asm emms
1070     sigma = a + .5 + .6 * (63 - q) / 63.0;
1071 
1072     // set up a lookup table of 256 entries that matches
1073     // a gaussian distribution with sigma determined by q.
1074     //
1075     {
1076         double i;
1077         int next, j;
1078 
1079         next = 0;
1080 
1081         for (i = -32; i < 32; i++)
1082         {
1083             double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
1084             int a = (int)(g + .5);
1085 
1086             if (a)
1087             {
1088                 for (j = 0; j < a; j++)
1089                 {
1090                     char_dist[next+j] = (char) i;
1091                 }
1092 
1093                 next = next + j;
1094             }
1095 
1096         }
1097 
1098         for (next = next; next < 256; next++)
1099             char_dist[next] = 0;
1100 
1101     }
1102 
1103     for (i = 0; i < 2048; i++)
1104     {
1105         Rand[i] = char_dist[rand() & 0xff];
1106     }
1107 
1108     for (i = 0; i < 16; i++)
1109     {
1110         blackclamp[i] = -char_dist[0];
1111         whiteclamp[i] = -char_dist[0];
1112         bothclamp[i] = -2 * char_dist[0];
1113     }
1114 
1115     for (i = 0; i < Height; i++)
1116     {
1117         unsigned char *Pos = Start + i * Pitch;
1118         char  *Ref = Rand + (rand() & 0xff);
1119 
1120         __asm
1121         {
1122             mov ecx, [Width]
1123             mov esi, Pos
1124             mov edi, Ref
1125             xor         eax, eax
1126 
1127             nextset:
1128             movdqu      xmm1, [esi+eax]        // get the source
1129 
1130             psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
1131             paddusb     xmm1, bothclamp
1132             psubusb     xmm1, whiteclamp
1133 
1134             movdqu      xmm2, [edi+eax]        // get the noise for this line
1135             paddb       xmm1, xmm2             // add it in
1136             movdqu      [esi+eax], xmm1        // store the result
1137 
1138             add         eax, 16                // move to the next line
1139 
1140             cmp         eax, ecx
1141             jl          nextset
1142 
1143 
1144         }
1145 
1146     }
1147 }
1148 #endif
1149 __declspec(align(16))
1150 static const int four8s[4] = { 8, 8, 8, 8};
vp8_mbpost_proc_across_ip_xmm(unsigned char * src,int pitch,int rows,int cols,int flimit)1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
1152 {
1153     int r, i;
1154     __declspec(align(16))
1155     int flimit4[4];
1156     unsigned char *s = src;
1157     int sumsq;
1158     int sum;
1159 
1160 
1161     flimit = vp8_q2mbl(flimit);
1162     flimit4[0] =
1163         flimit4[1] =
1164             flimit4[2] =
1165                 flimit4[3] = flimit;
1166 
1167     for (r = 0; r < rows; r++)
1168     {
1169 
1170 
1171         sumsq = 0;
1172         sum = 0;
1173 
1174         for (i = -8; i <= 6; i++)
1175         {
1176             sumsq += s[i] * s[i];
1177             sum   += s[i];
1178         }
1179 
1180         __asm
1181         {
1182             mov         eax,    sumsq
1183             movd        xmm7,   eax
1184 
1185             mov         eax,    sum
1186             movd        xmm6,   eax
1187 
1188             mov         esi,    s
1189             xor         ecx,    ecx
1190 
1191             mov         edx,    cols
1192             add         edx,    8
1193             pxor        mm0,    mm0
1194             pxor        mm1,    mm1
1195 
1196             pxor        xmm0,   xmm0
1197             nextcol4:
1198 
1199             movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5
1200             movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10
1201 
1202             punpcklbw   xmm1,   xmm0                    // expanding
1203             punpcklbw   xmm2,   xmm0                    // expanding
1204 
1205             punpcklwd   xmm1,   xmm0                    // expanding to dwords
1206             punpcklwd   xmm2,   xmm0                    // expanding to dwords
1207 
1208             psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5
1209             paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2
1210 
1211             paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5
1212             pmaddwd     xmm1,   xmm2                    // squared of 7+-8   8+-7   9+-6 10+-5
1213 
1214             paddd       xmm6,   xmm2
1215             paddd       xmm7,   xmm1
1216 
1217             pshufd      xmm6,   xmm6,   0               // duplicate the last ones
1218             pshufd      xmm7,   xmm7,   0               // duplicate the last ones
1219 
1220             psrldq      xmm1,       4                   // 8--7   9--6 10--5  0000
1221             psrldq      xmm2,       4                   // 8--7   9--6 10--5  0000
1222 
1223             pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 squared
1224             pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 squared
1225 
1226             paddd       xmm6,   xmm4
1227             paddd       xmm7,   xmm3
1228 
1229             pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 squared
1230             pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 squared
1231 
1232             paddd       xmm7,   xmm3
1233             paddd       xmm6,   xmm4
1234 
1235             pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   8--7   8--7 squared
1236             pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   8--7   8--7 squared
1237 
1238             paddd       xmm7,   xmm3
1239             paddd       xmm6,   xmm4
1240 
1241             movdqa      xmm3,   xmm6
1242             pmaddwd     xmm3,   xmm3
1243 
1244             movdqa      xmm5,   xmm7
1245             pslld       xmm5,   4
1246 
1247             psubd       xmm5,   xmm7
1248             psubd       xmm5,   xmm3
1249 
1250             psubd       xmm5,   flimit4
1251             psrad       xmm5,   31
1252 
1253             packssdw    xmm5,   xmm0
1254             packsswb    xmm5,   xmm0
1255 
1256             movd        xmm1,   DWORD PTR [esi+ecx]
1257             movq        xmm2,   xmm1
1258 
1259             punpcklbw   xmm1,   xmm0
1260             punpcklwd   xmm1,   xmm0
1261 
1262             paddd       xmm1,   xmm6
1263             paddd       xmm1,   four8s
1264 
1265             psrad       xmm1,   4
1266             packssdw    xmm1,   xmm0
1267 
1268             packuswb    xmm1,   xmm0
1269             pand        xmm1,   xmm5
1270 
1271             pandn       xmm5,   xmm2
1272             por         xmm5,   xmm1
1273 
1274             movd        [esi+ecx-8],  mm0
1275             movq        mm0,    mm1
1276 
1277             movdq2q     mm1,    xmm5
1278             psrldq      xmm7,   12
1279 
1280             psrldq      xmm6,   12
1281             add         ecx,    4
1282 
1283             cmp         ecx,    edx
1284             jl          nextcol4
1285 
1286         }
1287         s += pitch;
1288     }
1289 }
1290 
1291 #if 0
1292 
1293 /****************************************************************************
1294  *
1295  *  ROUTINE       : plane_add_noise_mmx
1296  *
1297  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
1298  *                                  noise to
1299  *                  unsigned int Width    width of plane
1300  *                  unsigned int Height   height of plane
1301  *                  int  Pitch    distance between subsequent lines of frame
1302  *                  int  q        quantizer used to determine amount of noise
1303  *                                  to add
1304  *
1305  *  OUTPUTS       : None.
1306  *
1307  *  RETURNS       : void.
1308  *
1309  *  FUNCTION      : adds gaussian noise to a plane of pixels
1310  *
1311  *  SPECIAL NOTES : None.
1312  *
1313  ****************************************************************************/
1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1315 {
1316     unsigned int i;
1317     int Pitch4 = Pitch * 4;
1318     const int noise_amount = 2;
1319     const int noise_adder = 2 * noise_amount + 1;
1320 
1321     __declspec(align(16)) unsigned char blackclamp[16];
1322     __declspec(align(16)) unsigned char whiteclamp[16];
1323     __declspec(align(16)) unsigned char bothclamp[16];
1324 
1325     char char_dist[300];
1326     char Rand[2048];
1327 
1328     double sigma;
1329     __asm emms
1330     sigma = a + .5 + .6 * (63 - q) / 63.0;
1331 
1332     // set up a lookup table of 256 entries that matches
1333     // a gaussian distribution with sigma determined by q.
1334     //
1335     {
1336         double i, sum = 0;
1337         int next, j;
1338 
1339         next = 0;
1340 
1341         for (i = -32; i < 32; i++)
1342         {
1343             int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
1344 
1345             if (a)
1346             {
1347                 for (j = 0; j < a; j++)
1348                 {
1349                     char_dist[next+j] = (char) i;
1350                 }
1351 
1352                 next = next + j;
1353             }
1354 
1355         }
1356 
1357         for (next = next; next < 256; next++)
1358             char_dist[next] = 0;
1359 
1360     }
1361 
1362     for (i = 0; i < 2048; i++)
1363     {
1364         Rand[i] = char_dist[rand() & 0xff];
1365     }
1366 
1367     for (i = 0; i < 16; i++)
1368     {
1369         blackclamp[i] = -char_dist[0];
1370         whiteclamp[i] = -char_dist[0];
1371         bothclamp[i] = -2 * char_dist[0];
1372     }
1373 
1374     for (i = 0; i < Height; i++)
1375     {
1376         unsigned char *Pos = Start + i * Pitch;
1377         char  *Ref = Rand + (rand() & 0xff);
1378 
1379         __asm
1380         {
1381             mov ecx, [Width]
1382             mov esi, Pos
1383             mov edi, Ref
1384             xor         eax, eax
1385 
1386             nextset:
1387             movq        mm1, [esi+eax]        // get the source
1388 
1389             psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
1390             paddusb     mm1, bothclamp
1391             psubusb     mm1, whiteclamp
1392 
1393             movq        mm2, [edi+eax]        // get the noise for this line
1394             paddb       mm1, mm2             // add it in
1395             movq        [esi+eax], mm1        // store the result
1396 
1397             add         eax, 8                // move to the next line
1398 
1399             cmp         eax, ecx
1400             jl          nextset
1401 
1402 
1403         }
1404 
1405     }
1406 }
1407 #else
1408 extern char an[8][64][3072];
1409 extern int cd[8][64];
1410 
vp8_plane_add_noise_mmx(unsigned char * Start,unsigned int Width,unsigned int Height,int Pitch,int q,int a)1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1412 {
1413     unsigned int i;
1414     __declspec(align(16)) unsigned char blackclamp[16];
1415     __declspec(align(16)) unsigned char whiteclamp[16];
1416     __declspec(align(16)) unsigned char bothclamp[16];
1417 
1418 
1419     __asm emms
1420 
1421     for (i = 0; i < 16; i++)
1422     {
1423         blackclamp[i] = -cd[a][q];
1424         whiteclamp[i] = -cd[a][q];
1425         bothclamp[i] = -2 * cd[a][q];
1426     }
1427 
1428     for (i = 0; i < Height; i++)
1429     {
1430         unsigned char *Pos = Start + i * Pitch;
1431         char  *Ref = an[a][q] + (rand() & 0xff);
1432 
1433         __asm
1434         {
1435             mov ecx, [Width]
1436             mov esi, Pos
1437             mov edi, Ref
1438             xor         eax, eax
1439 
1440             nextset:
1441             movq        mm1, [esi+eax]        // get the source
1442 
1443             psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
1444             paddusb     mm1, bothclamp
1445             psubusb     mm1, whiteclamp
1446 
1447             movq        mm2, [edi+eax]        // get the noise for this line
1448             paddb       mm1, mm2             // add it in
1449             movq        [esi+eax], mm1        // store the result
1450 
1451             add         eax, 8                // move to the next line
1452 
1453             cmp         eax, ecx
1454             jl          nextset
1455         }
1456     }
1457 }
1458 
1459 
vp8_plane_add_noise_wmt(unsigned char * Start,unsigned int Width,unsigned int Height,int Pitch,int q,int a)1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1461 {
1462     unsigned int i;
1463 
1464     __declspec(align(16)) unsigned char blackclamp[16];
1465     __declspec(align(16)) unsigned char whiteclamp[16];
1466     __declspec(align(16)) unsigned char bothclamp[16];
1467 
1468     __asm emms
1469 
1470     for (i = 0; i < 16; i++)
1471     {
1472         blackclamp[i] = -cd[a][q];
1473         whiteclamp[i] = -cd[a][q];
1474         bothclamp[i] = -2 * cd[a][q];
1475     }
1476 
1477     for (i = 0; i < Height; i++)
1478     {
1479         unsigned char *Pos = Start + i * Pitch;
1480         char *Ref = an[a][q] + (rand() & 0xff);
1481 
1482         __asm
1483         {
1484             mov ecx,    [Width]
1485             mov esi,    Pos
1486             mov edi,    Ref
1487             xor         eax, eax
1488 
1489             nextset:
1490             movdqu      xmm1, [esi+eax]        // get the source
1491 
1492             psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
1493             paddusb     xmm1, bothclamp
1494             psubusb     xmm1, whiteclamp
1495 
1496             movdqu      xmm2, [edi+eax]        // get the noise for this line
1497             paddb       xmm1, xmm2             // add it in
1498             movdqu      [esi+eax], xmm1        // store the result
1499 
1500             add         eax, 16                // move to the next line
1501 
1502             cmp         eax, ecx
1503             jl          nextset
1504         }
1505     }
1506 }
1507 
1508 #endif
1509