;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros (sym, arg, GLOBAL, SHADOW_ARGS_TO_STACK, GET_GOT, ...)
; used by every routine in this file come from this include.
%include "vpx_ports/x86_abi_support.asm"

; Bilinear tap table, defined outside this file.  The bilinear predictors
; index it with a 32-byte stride and read the two taps at +0 and +16.
extern sym(vp8_bilinear_filters_x86_8)


; NOTE(review): BLOCK_HEIGHT_WIDTH and vp8_filter_weight are not referenced
; by the code visible in this file.
%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
; Right shift applied after every filter sum; the rounding constant `rd`
; (0x40) is half of 1 << VP8_FILTER_SHIFT.
%define VP8_FILTER_SHIFT  7


;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter pass, four output samples per row.
;
; For each of output_height rows and for i = 0..3:
;     sum    = src[i-2]*tap0 + src[i-1]*tap1 + src[i]*tap2
;            + src[i+1]*tap3 + src[i+2]*tap4 + src[i+3]*tap5
;     out[i] = clamp_0_255((sum + 64) >> VP8_FILTER_SHIFT)
; Partial sums are accumulated with saturating 16-bit adds (paddsw), and
; each result is stored as a 16-bit word (8 bytes written per row).
;
; vp8_filter : tap k is read from byte offset 16*k; only the first four
;              words of each 16-byte slot are used (movq).
; Row stepping: src_ptr advances by src_pixels_per_line bytes; output_ptr
;              advances by output_width, which is added to the pointer as
;              a byte count.
; pixel_step (arg 3) is not read by this routine.
; Each row reads source bytes src[-2] .. src[6].
;
; Register roles inside the loop:
;     rsi = source row      rdi = destination row    rcx = rows remaining
;     rax = destination step (bytes)                 rdx = filter taps
;     mm0 = 0               mm1/mm2/mm6/mm7 = taps 1/2/3/4
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6) ;vp8_filter

        ; Taps 1..4 are kept in registers.  Taps 1 and 4 (the negative taps
        ; in vp8_six_tap_mmx) are accumulated first in the loop below.
        movq        mm1,    [rdx + 16]       ; mm1 = tap 1
        movq        mm2,    [rdx + 32]       ; mm2 = tap 2
        movq        mm6,    [rdx + 48]       ; mm6 = tap 3
        movq        mm7,    [rdx + 64]       ; mm7 = tap 4

        mov         rdi,    arg(1) ;output_ptr
        mov         rsi,    arg(0) ;src_ptr
        movsxd      rcx,    dword ptr arg(4) ;output_height
        movsxd      rax,    dword ptr arg(5) ;output_width (used as the destination step in bytes)
        pxor        mm0,    mm0              ; mm0 = 0, used to widen bytes to words

.nextrow:
        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
        movq        mm4,    mm3              ; mm4 = p-2..p5
        psrlq       mm3,    8                ; mm3 = p-1..p5
        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 as words
        pmullw      mm3,    mm1              ; mm3 = p-1..p2 * tap 1

        movq        mm5,    mm4              ; mm5 = p-2..p5
        punpckhbw   mm4,    mm0              ; mm4 = p2..p5 as words
        pmullw      mm4,    mm7              ; mm4 *= tap 4
        paddsw      mm3,    mm4              ; mm3 += mm4

        movq        mm4,    mm5              ; mm4 = p-2..p5
        psrlq       mm5,    16               ; mm5 = p0..p5
        punpcklbw   mm5,    mm0              ; mm5 = p0..p3 as words
        pmullw      mm5,    mm2              ; mm5 *= tap 2
        paddsw      mm3,    mm5              ; mm3 += mm5

        movq        mm5,    mm4              ; mm5 = p-2..p5 (kept for tap 0 below)
        psrlq       mm4,    24               ; mm4 = p1..p5
        punpcklbw   mm4,    mm0              ; mm4 = p1..p4 as words
        pmullw      mm4,    mm6              ; mm4 *= tap 3
        paddsw      mm3,    mm4              ; mm3 += mm4

        ; outer taps (0 and 5) are read from memory
        movd        mm4,    [rsi+3]          ; mm4 = p3..p6
        punpcklbw   mm4,    mm0              ; mm4 = p3..p6 as words
        pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
        paddsw      mm3,    mm4              ; mm3 += mm4

        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 as words
        pmullw      mm5,    [rdx]            ; mm5 *= tap 0
        paddsw      mm3,    mm5              ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]     ; mm3 += rounding constant (64)
        psraw       mm3,    VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0              ; pack to bytes to saturate to 0..255 ...
        punpcklbw   mm3,    mm0              ; ... then widen back to words

        movq        [rdi],  mm3              ; store four 16-bit results

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
        add         rdi,    rax;
%else
        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
        add         rdi,    rax;

        add         rsi,    r8               ; next line
%endif

        dec         rcx                      ; decrement count
        jnz         .nextrow                 ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter pass over 16-bit intermediate data, four output
; bytes per row.
;
; For each of output_height rows and for i = 0..3:
;     sum    = row[-2][i]*tap0 + row[-1][i]*tap1 + row[0][i]*tap2
;            + row[1][i]*tap3  + row[2][i]*tap4  + row[3][i]*tap5
;     out[i] = clamp_0_255((sum + 64) >> VP8_FILTER_SHIFT)
; where row[k] is the four words at src_ptr + k * pixels_per_line.
; pixels_per_line is added to the address directly, i.e. it is a byte
; stride.  Partial sums use saturating 16-bit adds (paddsw).
;
; vp8_filter : tap k is read from byte offset 16*k; only the first four
;              words of each 16-byte slot are used.
; Four bytes are stored per row (movd); output_ptr advances by output_pitch.
; pixel_step (arg 4) and output_width (arg 6) are not read by this routine.
;
; Register roles inside the loop:
;     rsi = row -2 of the current output row   rdx = source row stride
;     rdi = destination row                    rax = destination pitch
;     rcx = rows remaining                     rbx = filter taps
;     mm0 = 0    mm5 = rounding constant       mm1/mm2/mm6/mm7 = taps 1/2/3/4
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Load the rounding constant before rbx is reused as the filter
        ; pointer: rbx is the register handed to GET_GOT above, so
        ; GLOBAL() presumably depends on it in PIC builds (confirm against
        ; x86_abi_support.asm).  rbx is saved here and restored before
        ; RESTORE_GOT.
        movq      mm5, [GLOBAL(rd)]
        push        rbx
        mov         rbx, arg(7) ;vp8_filter
        movq      mm1, [rbx + 16]         ; mm1 = tap 1
        movq      mm2, [rbx + 32]         ; mm2 = tap 2
        movq      mm6, [rbx + 48]         ; mm6 = tap 3
        movq      mm7, [rbx + 64]         ; mm7 = tap 4

        movsxd      rdx, dword ptr arg(3) ;pixels_per_line
        mov         rdi, arg(1) ;output_ptr
        mov         rsi, arg(0) ;src_ptr
        sub         rsi, rdx              ; rsi = src_ptr - 2 * stride,
        sub         rsi, rdx              ;   i.e. row -2
        movsxd      rcx, DWORD PTR arg(5) ;output_height
        movsxd      rax, DWORD PTR arg(2) ;output_pitch
        pxor        mm0, mm0              ; mm0 = 0


.nextrow_cv:
        movq        mm3, [rsi+rdx]        ; mm3 = p0..p3 of row -1
        pmullw      mm3, mm1              ; mm3 *= tap 1


        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3 of row 2
        pmullw      mm4, mm7              ; mm4 *= tap 4
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 2*rdx]    ; mm4 = p0..p3 of row 0
        pmullw      mm4, mm2              ; mm4 *= tap 2
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi]            ; mm4 = p0..p3 of row -2
        pmullw      mm4, [rbx]            ; mm4 *= tap 0
        paddsw      mm3, mm4              ; mm3 += mm4


        add         rsi, rdx              ; step one row now: avoids needing 3 * stride
                                          ; and leaves rsi ready for the next output row
        movq        mm4, [rsi + 2*rdx]    ; mm4 = p0..p3 of row 1
        pmullw      mm4, mm6              ; mm4 *= tap 3
        paddsw      mm3, mm4              ; mm3 += mm4

        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3 of row 3
        pmullw      mm4, [rbx +80]        ; mm4 *= tap 5
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, mm5              ; mm3 += rounding constant (64)
        psraw       mm3, VP8_FILTER_SHIFT ; mm3 >>= 7 (arithmetic)
        packuswb    mm3, mm0              ; pack to bytes, saturating to 0..255

        movd        [rdi],mm3             ; store four result bytes
        ; Each iteration re-reads five of the six rows the previous one
        ; read.  The original author notes that this is avoidable but
        ; should be cheap if the block is already in cache.
        lea         rdi,  [rdi+rax] ;
        dec         rcx                   ; decrement count
        jnz         .nextrow_cv           ; next row

        pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 8-row block.
;
; Horizontal pass, for each source row r and x = 0..7:
;     h[r][x] = (src[r][x]*H0 + src[r][x+1]*H1 + 64) >> VP8_FILTER_SHIFT
; Vertical pass, for r = 0..7:
;     dst[r][x] = (h[r][x]*V0 + h[r+1][x]*V1 + 64) >> VP8_FILTER_SHIFT
; with H0/H1 read from vp8_bilinear_filters_x86_8 + xoffset*32 at +0/+16
; and V0/V1 from the same table at yoffset*32.  Results are packed to
; bytes with unsigned saturation.
;
; The horizontal result of the previous row is carried between
; iterations as packed bytes in mm7.  Nine source rows are read (one
; before the loop plus one per output row), nine bytes from each.
;
; Register roles inside the loop:
;     rsi = source row            rdx = src_pixels_per_line
;     rdi = destination row       rcx = dst_ptr + 8 * dst_pitch (end marker)
;     rax = vertical taps         mm0 = 0     mm1/mm2 = H0/H1
;     mm7 = previous row after the horizontal pass (8 bytes)
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        shl         rax,        5 ; offset * 32
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr              ;

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = H0

        movq        mm2,        [rax+16]            ; mm2 = H1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0

        shl         rax,        5 ; offset*32
        add         rax,        rcx ; VFilter

        lea         rcx,        [rdi+rdx*8]          ; end marker: dst_ptr + 8 * dst_pitch
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is reused)



        ; horizontal pass on the first source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= H0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; *= H1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = first row after the horizontal pass

        add         rsi,        rdx                 ; next line
.next_row_8x8:
        ; horizontal pass on the current source row (left unrounded for now)
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= H0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ; *= H1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous row (from mm7) times the first vertical tap
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ; *= V0
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; keep this row for the next iteration


        pmullw      mm3,        [rax+16]            ; current row *= V1
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store eight result bytes

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
        add         rsi,        rdx                 ; next line
        add         rdi,        r8                  ;dst_pitch
%endif
        cmp         rdi,        rcx                 ; stop once rdi reaches the end marker
        jne         .next_row_8x8

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 4-row block.
;
; Horizontal pass, for each source row r and x = 0..7:
;     h[r][x] = (src[r][x]*H0 + src[r][x+1]*H1 + 64) >> VP8_FILTER_SHIFT
; Vertical pass, for r = 0..3:
;     dst[r][x] = (h[r][x]*V0 + h[r+1][x]*V1 + 64) >> VP8_FILTER_SHIFT
; with H0/H1 read from vp8_bilinear_filters_x86_8 + xoffset*32 at +0/+16
; and V0/V1 from the same table at yoffset*32.  Results are packed to
; bytes with unsigned saturation.
;
; The horizontal result of the previous row is carried between
; iterations as packed bytes in mm7.  Five source rows are read (one
; before the loop plus one per output row), nine bytes from each.
;
; Register roles inside the loop:
;     rsi = source row            rdx = src_pixels_per_line
;     rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (end marker)
;     rax = vertical taps         mm0 = 0     mm1/mm2 = H0/H1
;     mm7 = previous row after the horizontal pass (8 bytes)
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5 ; offset * 32

        mov         rsi,        arg(0) ;src_ptr              ;
        add         rax,        rcx ; HFilter

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = H0

        movq        mm2,        [rax+16]            ; mm2 = H1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; VFilter
        lea         rcx,        [rdi+rdx*4]          ; end marker: dst_ptr + 4 * dst_pitch

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is reused)

        ; horizontal pass on the first source row
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= H0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,        mm2                 ; *= H1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; mm7 = first row after the horizontal pass

        add         rsi,        rdx                 ; next line
.next_row_8x4:
        ; horizontal pass on the current source row (left unrounded for now)
        movq        mm3,        [rsi]               ; mm3 = src[0..7]
        movq        mm4,        mm3                 ; make a copy of current line

        punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,        mm1                 ; *= H0
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ; *= H1
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;

        ; previous row (from mm7) times the first vertical tap
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [rax]               ; *= V0
        pmullw      mm6,        [rax]               ;

        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ; keep this row for the next iteration


        pmullw      mm3,        [rax+16]            ; current row *= V1
        pmullw      mm4,        [rax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,        [GLOBAL(rd)]        ;
        psraw       mm4,        VP8_FILTER_SHIFT    ;

        packuswb    mm3,        mm4

        movq        [rdi],      mm3                 ; store eight result bytes

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif
        cmp         rdi,        rcx                 ; stop once rdi reaches the end marker
        jne         .next_row_8x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a 4-wide, 4-row block.
;
; Horizontal pass, for each source row r and x = 0..3:
;     h[r][x] = (src[r][x]*H0 + src[r][x+1]*H1 + 64) >> VP8_FILTER_SHIFT
; Vertical pass, for r = 0..3:
;     dst[r][x] = (h[r][x]*V0 + h[r+1][x]*V1 + 64) >> VP8_FILTER_SHIFT
; with H0/H1 read from vp8_bilinear_filters_x86_8 + xoffset*32 at +0/+16
; and V0/V1 from the same table at yoffset*32.  Results are packed to
; bytes with unsigned saturation.
;
; The horizontal result of the previous row is carried between
; iterations as packed bytes in the low half of mm7.  Five source rows
; are read (one before the loop plus one per output row), five bytes
; from each (two overlapping 4-byte loads).
;
; Register roles inside the loop:
;     rsi = source row            rdx = src_pixels_per_line
;     rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (end marker)
;     rax = vertical taps         mm0 = 0     mm1/mm2 = H0/H1
;     mm7 = previous row after the horizontal pass (low 4 bytes)
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,        dword ptr arg(2) ;xoffset
        mov         rdi,        arg(4) ;dst_ptr           ;

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; HFilter
        mov         rsi,        arg(0) ;src_ptr              ;

        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
        movq        mm1,        [rax]               ; mm1 = H0

        movq        mm2,        [rax+16]            ; mm2 = H1
        movsxd      rax,        dword ptr arg(3) ;yoffset

        pxor        mm0,        mm0                 ; mm0 = 0
        shl         rax,        5 ; offset * 32

        add         rax,        rcx ; VFilter
        lea         rcx,        [rdi+rdx*4]          ; end marker: dst_ptr + 4 * dst_pitch

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is reused)

        ; horizontal pass on the first source row
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; *= H0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; *= H1

        paddw       mm3,        mm5                 ;
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm0                 ; mm7 = first row after the horizontal pass

        add         rsi,        rdx                 ; next line
.next_row_4x4:
        ; horizontal pass on the current source row (left unrounded for now)
        movd        mm3,        [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,        mm0                 ; widen to words

        pmullw      mm3,        mm1                 ; *= H0
        movd        mm5,        [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,        mm0                 ; widen to words
        pmullw      mm5,        mm2                 ; *= H1

        paddw       mm3,        mm5                 ;

        ; previous row (from mm7) times the first vertical tap
        movq        mm5,        mm7                 ;
        punpcklbw   mm5,        mm0                 ;

        pmullw      mm5,        [rax]               ; *= V0
        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7
        movq        mm7,        mm3                 ;

        packuswb    mm7,        mm0                 ; keep this row for the next iteration

        pmullw      mm3,        [rax+16]            ; current row *= V1
        paddw       mm3,        mm5                 ;


        paddw       mm3,        [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,        VP8_FILTER_SHIFT    ; mm3 >>= 7

        packuswb    mm3,        mm0
        movd        [rdi],      mm3                 ; store four result bytes

%if ABI_IS_32BIT
        add         rsi,        rdx                 ; next line
        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
%else
        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
        add         rsi,        rdx                 ; next line
        add         rdi,        r8
%endif

        cmp         rdi,        rcx                 ; stop once rdi reaches the end marker
        jne         .next_row_4x4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant added before every `>> VP8_FILTER_SHIFT` in this file:
; four words of 0x40 (64), i.e. half of 1 << 7.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters x 6 taps.  Each tap is replicated into
; eight words (16 bytes), so tap k of a filter sits at byte offset 16*k and
; one filter occupies 96 bytes.  The taps of every filter sum to 128
; (1 << VP8_FILTER_SHIFT).
; NOTE(review): the filter index presumably corresponds to the sub-pixel
; offset; confirm against the callers.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    ; filter 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0

