• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
%include "vpx_ports/x86_abi_support.asm"


; Fixed-point parameters shared by the filter routines in this file.
;   VP8_FILTER_SHIFT   - arithmetic right shift applied after each rounding add.
;   vp8_filter_weight  - 128, i.e. 1 << VP8_FILTER_SHIFT; every filter in the
;                        tables at the end of this file sums to this value.
;   BLOCK_HEIGHT_WIDTH - not referenced by the code visible in this file.
%define VP8_FILTER_SHIFT  7
%define vp8_filter_weight 128
%define BLOCK_HEIGHT_WIDTH 4
18
19
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal six-tap filter pass.  For each of output_height rows it reads
; the source bytes src_ptr[-2 .. 6], computes four filtered pixels from them
; and stores the results as four 16-bit words at output_ptr.
;
;   - src_ptr advances by src_pixels_per_line bytes per row.
;   - output_ptr advances by output_width per row; the value is added to the
;     pointer unscaled, i.e. it is used as a byte stride.
;   - pixel_step is not read by this routine.
;   - vp8_filter points at six taps spaced 16 bytes apart; only the low four
;     words of each tap are loaded.
;
; Register roles inside the loop:
;   rsi = source row            rdi = destination row
;   rcx = rows remaining        rax = destination stride
;   rdx = filter taps           mm0 = zero
;   mm1/mm2/mm6/mm7 = taps 1/2/3/4 (taps 0 and 5 are read from memory)
;   r8  = source stride (64-bit builds only)
;
; The products are accumulated with saturating adds, so the order in which
; the taps are summed is part of the result and must not be changed.
global sym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        movq        mm1,    [rdx + 16]       ; tap 1 (do both the negative taps first)
        movq        mm2,    [rdx + 32]       ; tap 2
        movq        mm6,    [rdx + 48]       ; tap 3
        movq        mm7,    [rdx + 64]       ; tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width, used as destination stride
%if ABI_IS_32BIT=0
        ; The source stride never changes, so load it once instead of on
        ; every row.  r8 is not written anywhere else in this routine.
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
%endif
        pxor        mm0,    mm0              ; mm0 = 00000000

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5 (bytes)
        movq        mm4,    mm3              ; mm4 = p-2..p5
        psrlq       mm3,    8                ; mm3 = p-1..p5
        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 (words)
        pmullw      mm3,    mm1              ; mm3 = p-1..p2 * tap 1

        movq        mm5,    mm4              ; mm5 = p-2..p5
        punpckhbw   mm4,    mm0              ; mm4 = p2..p5 (words)
        pmullw      mm4,    mm7              ; mm4 *= tap 4
        paddsw      mm3,    mm4              ; mm3 += mm4

        movq        mm4,    mm5              ; mm4 = p-2..p5
        psrlq       mm5,    16               ; mm5 = p0..p5
        punpcklbw   mm5,    mm0              ; mm5 = p0..p3 (words)
        pmullw      mm5,    mm2              ; mm5 *= tap 2
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = p-2..p5 (kept for tap 0 below)
        psrlq       mm4,    24               ; mm4 = p1..p5
        punpcklbw   mm4,    mm0              ; mm4 = p1..p4 (words)
        pmullw      mm4,    mm6              ; mm4 *= tap 3
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps, read straight from the filter table
        movd        mm4,    [rsi+3]          ; mm4 = p3..p6 (bytes)
        punpcklbw   mm4,    mm0              ; mm4 = p3..p6 (words)
        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 (words)
        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 >>= VP8_FILTER_SHIFT (arithmetic)
        packuswb    mm3,    mm0              ; clamp each word to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... and widen back to words

        movq        [rdi],  mm3              ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
%else
        add         rsi,    r8               ; next line
%endif
        add         rdi,    rax              ; next destination row

        dec         rcx                      ; decrement count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
114
115
;
; THIS FUNCTION APPEARS TO BE UNUSED
;
;void vp8_filter_block1d_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap filter pass over 16-bit samples.  Each iteration combines
; four words from six consecutive source rows (two rows above the output row
; through three rows below it) and stores four clamped bytes at output_ptr.
;
;   - pixels_per_line is used unscaled as the byte distance between rows.
;   - output_ptr advances by output_width bytes per row.
;   - pixel_step is not read by this routine.
;   - vp8_filter points at six taps spaced 16 bytes apart.
;
; Register roles inside the loop:
;   rsi = row two above the current output row    rdi = destination row
;   rdx = source row stride                       rax = destination stride
;   rcx = rows remaining                          rbx = filter taps
;   mm0 = zero, mm5 = rounding constant, mm1/mm2/mm6/mm7 = taps 1/2/3/4
;
; The accumulation uses saturating adds; keep the summation order as is.
global sym(vp8_filter_block1d_v6_mmx)
sym(vp8_filter_block1d_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Read the rounding constant before rbx is reused below.
        ; NOTE(review): GET_GOT was given rbx, so GLOBAL() presumably
        ; addresses through rbx on PIC builds - confirm in x86_abi_support.asm.
        movq        mm5, [GLOBAL(rd)]
        push        rbx                         ; rbx becomes the tap pointer

        movsxd      rdx, dword ptr arg(2)       ; pixels_per_line
        mov         rsi, arg(0)                 ; src_ptr
        sub         rsi, rdx                    ; back up two rows so that
        sub         rsi, rdx                    ;   [rsi] is row -2
        mov         rdi, arg(1)                 ; output_ptr
        movsxd      rcx, DWORD PTR arg(4)       ; output_height
        movsxd      rax, DWORD PTR arg(5)       ; output_width, used as destination stride

        mov         rbx, arg(6)                 ; vp8_filter
        movq        mm1, [rbx + 16]             ; tap 1
        movq        mm2, [rbx + 32]             ; tap 2
        movq        mm6, [rbx + 48]             ; tap 3
        movq        mm7, [rbx + 64]             ; tap 4
        pxor        mm0, mm0                    ; mm0 = 0


.nextrow_v:
        movq        mm3, [rsi+rdx]              ; row -1
        pmullw      mm3, mm1                    ;   * tap 1

        movq        mm4, [rsi + 4*rdx]          ; row 2
        pmullw      mm4, mm7                    ;   * tap 4
        paddsw      mm3, mm4

        movq        mm4, [rsi + 2*rdx]          ; row 0
        pmullw      mm4, mm2                    ;   * tap 2
        paddsw      mm3, mm4

        movq        mm4, [rsi]                  ; row -2
        pmullw      mm4, [rbx]                  ;   * tap 0
        paddsw      mm3, mm4

        add         rsi, rdx                    ; step one row; also avoids needing 3*stride
        movq        mm4, [rsi + 2*rdx]          ; row 1
        pmullw      mm4, mm6                    ;   * tap 3
        paddsw      mm3, mm4

        movq        mm4, [rsi + 4*rdx]          ; row 3
        pmullw      mm4, [rbx +80]              ;   * tap 5
        paddsw      mm3, mm4

        paddsw      mm3, mm5                    ; add rounding constant
        psraw       mm3, VP8_FILTER_SHIFT       ; >>= VP8_FILTER_SHIFT (arithmetic)
        packuswb    mm3, mm0                    ; clamp each word to 0..255

        movd        [rdi],mm3                   ; store four result bytes

        lea         rdi, [rdi+rax]              ; next destination row

        dec         rcx                         ; rows remaining
        jnz         .nextrow_v

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
205
206
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap filter pass over 16-bit samples, writing to a destination
; with its own pitch.  Each iteration combines four words from six
; consecutive source rows (two rows above the output row through three rows
; below it) and stores four clamped bytes at output_ptr.
;
;   - pixels_per_line is used unscaled as the byte distance between rows.
;   - output_ptr advances by output_pitch bytes per row.
;   - pixel_step and output_width are not read by this routine.
;   - vp8_filter points at six taps spaced 16 bytes apart.
;
; Register roles inside the loop:
;   rsi = row two above the current output row    rdi = destination row
;   rdx = source row stride                       rax = destination pitch
;   rcx = rows remaining                          rbx = filter taps
;   mm0 = zero, mm5 = rounding constant, mm1/mm2/mm6/mm7 = taps 1/2/3/4
;
; The accumulation uses saturating adds; keep the summation order as is.
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Read the rounding constant before rbx is reused below.
        ; NOTE(review): GET_GOT was given rbx, so GLOBAL() presumably
        ; addresses through rbx on PIC builds - confirm in x86_abi_support.asm.
        movq        mm5, [GLOBAL(rd)]
        push        rbx                         ; rbx becomes the tap pointer

        movsxd      rdx, dword ptr arg(3)       ; pixels_per_line
        mov         rsi, arg(0)                 ; src_ptr
        sub         rsi, rdx                    ; back up two rows so that
        sub         rsi, rdx                    ;   [rsi] is row -2
        mov         rdi, arg(1)                 ; output_ptr
        movsxd      rcx, DWORD PTR arg(5)       ; output_height
        movsxd      rax, DWORD PTR arg(2)       ; output_pitch

        mov         rbx, arg(7)                 ; vp8_filter
        movq        mm1, [rbx + 16]             ; tap 1
        movq        mm2, [rbx + 32]             ; tap 2
        movq        mm6, [rbx + 48]             ; tap 3
        movq        mm7, [rbx + 64]             ; tap 4
        pxor        mm0, mm0                    ; mm0 = 0


.nextrow_cv:
        movq        mm3, [rsi+rdx]              ; row -1
        pmullw      mm3, mm1                    ;   * tap 1

        movq        mm4, [rsi + 4*rdx]          ; row 2
        pmullw      mm4, mm7                    ;   * tap 4
        paddsw      mm3, mm4

        movq        mm4, [rsi + 2*rdx]          ; row 0
        pmullw      mm4, mm2                    ;   * tap 2
        paddsw      mm3, mm4

        movq        mm4, [rsi]                  ; row -2
        pmullw      mm4, [rbx]                  ;   * tap 0
        paddsw      mm3, mm4

        add         rsi, rdx                    ; step one row; also avoids needing 3*stride
        movq        mm4, [rsi + 2*rdx]          ; row 1
        pmullw      mm4, mm6                    ;   * tap 3
        paddsw      mm3, mm4

        movq        mm4, [rsi + 4*rdx]          ; row 3
        pmullw      mm4, [rbx +80]              ;   * tap 5
        paddsw      mm3, mm4

        paddsw      mm3, mm5                    ; add rounding constant
        psraw       mm3, VP8_FILTER_SHIFT       ; >>= VP8_FILTER_SHIFT (arithmetic)
        packuswb    mm3, mm0                    ; clamp each word to 0..255

        movd        [rdi],mm3                   ; store four result bytes

        ; Consecutive iterations reload five of the same six source rows;
        ; nothing is carried over in registers, so those reads are repeated.
        add         rdi, rax                    ; next destination row
        dec         rcx                         ; rows remaining
        jnz         .nextrow_cv

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
295
296
;void vp8_bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 8 rows of 8 bytes at dst_ptr.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps of 16 bytes each; the table in this file has eight entries).
; The horizontal pass blends src[x] and src[x+1] with the xoffset taps; the
; vertical pass blends the previous and current horizontally filtered rows
; with the yoffset taps.  Each pass adds the rounding constant rd and shifts
; right by VP8_FILTER_SHIFT.  Nine source rows are read in total, and each
; row read touches src[0..8].
;
; Register roles inside the loop:
;   rsi = source row            rdx = source stride
;   rdi = destination row       rcx = dst_ptr + 8 * dst_pitch (loop end)
;   rax = vertical taps         mm1/mm2 = horizontal taps 0/1
;   mm0 = zero                  mm7 = previous horizontally filtered row (bytes)
;   r8  = dst_pitch (64-bit builds only)
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        shl         rax,        5 ; offset * 32
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; horizontal tap 0

        movq        mm2,        [rax+16]            ; horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0

        shl         rax,        5 ; offset*32
        add         rax,        rcx ; VFilter

        lea         rcx,        [rdi+rdx*8]         ; rcx = end of destination (8 rows)
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch never changes, so load it once instead of on every row.
        ; r8 is not written anywhere else in this routine.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; get the first horizontal line done
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = first filtered row, 8 bytes

        add         rsi,        rdx                 ; next line
.next_row_8x8:
        ; horizontal pass on the current source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; keep this filtered row for the next iteration


        ; current filtered row * vertical tap 1, then combine
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store 8 result bytes

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ;dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached the end of the destination?
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
452
453
;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 4 rows of 8 bytes at dst_ptr.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps of 16 bytes each; the table in this file has eight entries).
; The horizontal pass blends src[x] and src[x+1] with the xoffset taps; the
; vertical pass blends the previous and current horizontally filtered rows
; with the yoffset taps.  Each pass adds the rounding constant rd and shifts
; right by VP8_FILTER_SHIFT.  Five source rows are read in total, and each
; row read touches src[0..8].
;
; Register roles inside the loop:
;   rsi = source row            rdx = source stride
;   rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (loop end)
;   rax = vertical taps         mm1/mm2 = horizontal taps 0/1
;   mm0 = zero                  mm7 = previous horizontally filtered row (bytes)
;   r8  = dst_pitch (64-bit builds only)
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
        shl         rax,        5 ; offset * 32

        mov         rsi,        arg(0) ;src_ptr
        add         rax,        rcx ; HFilter

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; horizontal tap 0

        movq        mm2,        [rax+16]            ; horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = end of destination (4 rows)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch never changes, so load it once instead of on every row.
        ; r8 is not written anywhere else in this routine.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; get the first horizontal line done
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = first filtered row, 8 bytes

        add         rsi,        rdx                 ; next line
.next_row_8x4:
        ; horizontal pass on the current source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; * horizontal tap 1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ;
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; keep this filtered row for the next iteration


        ; current filtered row * vertical tap 1, then combine
        pmullw      mm3,        [rax+16]            ;
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store 8 result bytes

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ;dst_pitch
%endif
        cmp         rdi,        rcx                 ; reached the end of the destination?
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
607
608
;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 4 rows of 4 bytes at dst_ptr.
;
; xoffset and yoffset each select a 32-byte entry of vp8_bilinear_filters_mmx
; (two taps of 16 bytes each; the table in this file has eight entries).
; The horizontal pass blends src[x] and src[x+1] with the xoffset taps; the
; vertical pass blends the previous and current horizontally filtered rows
; with the yoffset taps.  Each pass adds the rounding constant rd and shifts
; right by VP8_FILTER_SHIFT.  Five source rows are read in total, and each
; row read touches src[0..4].
;
; Register roles inside the loop:
;   rsi = source row            rdx = source stride
;   rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (loop end)
;   rax = vertical taps         mm1/mm2 = horizontal taps 0/1
;   mm0 = zero                  mm7 = previous horizontally filtered row (bytes)
;   r8  = dst_pitch (64-bit builds only)
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; horizontal tap 0

        movq        mm2,        [rax+16]            ; horizontal tap 1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; VFilter
        lea         rcx,        [rdi+rdx*4]         ; rcx = end of destination (4 rows)

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

%if ABI_IS_32BIT=0
        ; dst_pitch never changes, so load it once instead of on every row.
        ; r8 is not written anywhere else in this routine.
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif

        ; get the first horizontal line done
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; * horizontal tap 1

        paddw       mm3,        mm5                 ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm0                 ; mm7 = first filtered row, 4 bytes

        add         rsi,        rdx                 ; next line
.next_row_4x4:
        ; horizontal pass on the current source row
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; * horizontal tap 0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; * horizontal tap 1

        paddw       mm3,        mm5                 ;

        ; previous filtered row * vertical tap 0
        movq        mm5,        mm7                 ;
        punpcklbw   mm5,        mm0                 ;

        pmullw      mm5,        [rax]               ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT
        movq        mm7,        mm3                 ;

        packuswb    mm7,        mm0                 ; keep this filtered row for the next iteration

        ; current filtered row * vertical tap 1, then combine
        pmullw      mm3,        [rax+16]            ;
        paddw       mm3,        mm5                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= VP8_FILTER_SHIFT

        packuswb    mm3,        mm0
        movd        [rdi],      mm3                 ; store 4 result bytes

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        dword ptr arg(5) ;dst_pitch
%else
        add         rdi,        r8                  ;dst_pitch
%endif

        cmp         rdi,        rcx                 ; reached the end of the destination?
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
725
726
727
SECTION_RODATA
; Rounding constant added before every shift by VP8_FILTER_SHIFT:
; four 16-bit words of 0x40 (64, half of 128), read with 8-byte loads.
align 16
rd:
    dw 0x40, 0x40, 0x40, 0x40
732
; Emits one six-tap filter.  Each tap is replicated into eight 16-bit words,
; so a tap occupies 16 bytes and a complete filter occupies 96 bytes.
%macro VP8_SIX_TAP_SET 6
    times 8 dw %1
    times 8 dw %2
    times 8 dw %3
    times 8 dw %4
    times 8 dw %5
    times 8 dw %6
%endmacro

; Eight six-tap filters; the taps of every filter sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    ;                tap0  tap1  tap2  tap3  tap4  tap5
    VP8_SIX_TAP_SET     0,    0,  128,    0,    0,    0
    VP8_SIX_TAP_SET     0,   -6,  123,   12,   -1,    0
    VP8_SIX_TAP_SET     2,  -11,  108,   36,   -8,    1
    VP8_SIX_TAP_SET     0,   -9,   93,   50,   -6,    0
    VP8_SIX_TAP_SET     3,  -16,   77,   77,  -16,    3
    VP8_SIX_TAP_SET     0,   -6,   50,   93,   -9,    0
    VP8_SIX_TAP_SET     1,   -8,   36,  108,  -11,    2
    VP8_SIX_TAP_SET     0,   -1,   12,  123,   -6,    0
791
792
; Eight two-tap bilinear filters, 32 bytes each (the predict routines index
; this table with offset * 32).  Entry k holds the weights 128 - 16*k and
; 16*k, each replicated into eight 16-bit words.
align 16
global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
%assign bilinear_step 0
%rep 8
    times 8 dw (128 - 16 * bilinear_step)
    times 8 dw (16 * bilinear_step)
%assign bilinear_step bilinear_step + 1
%endrep
819