;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used by the routines in this file (sym, arg, GLOBAL,
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and their counterparts).
%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128           ; == 1 << VP8_FILTER_SHIFT
%define VP8_FILTER_SHIFT  7             ; the six-tap routines shift their sums right by 7

SECTION .text

;/************************************************************************************
; Notes: vp8_filter_block1d8_h6_sse2 applies a 6-tap filter horizontally to 8 pixels
; per row and stores each result as a 16-bit word whose value is clamped to 0..255.
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero; it does not have to be even.
;
; Memory contract, as implied by the instructions used:
;   - each row reads src_ptr[-2 .. 13] (only src_ptr[-2 .. 10] affect the result);
;   - each row is written with movdqa, so output_ptr and output_width (added to the
;     destination pointer as a stride in BYTES) must keep it 16-byte aligned;
;   - vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
;     through SSE2 memory operands, so it must be 16-byte aligned;
;   - pixel_step is not read.
; The rounding term comes from the constant `rd`, defined outside this routine.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width (destination stride, bytes)
%endif
        pxor        xmm0,       xmm0                        ; zero, used for byte<->word unpack/pack

.filter_block1d8_h6_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes x[-2] .. x[5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes x[6] .. x[13]

        prefetcht2  [rsi+rax-2]                             ; hint for the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = bytes x[-2] .. x[13]

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; words x[-2] .. x[5]
        psrldq      xmm4,       1                           ; bytes now start at x[-1]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1: x[-2] * h[-2]
        punpcklbw   xmm4,       xmm0                        ; words x[-1] .. x[6]

        psrldq      xmm5,       2                           ; bytes now start at x[0]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2: x[-1] * h[-1]


        punpcklbw   xmm5,       xmm0                        ; words x[0] .. x[7]
        psrldq      xmm6,       3                           ; bytes now start at x[1]

        pmullw      xmm5,       [rdx+32]                    ; tap 3: x[ 0] * h[ 0]

        punpcklbw   xmm6,       xmm0                        ; words x[1] .. x[8]
        psrldq      xmm7,       4                           ; bytes now start at x[2]

        pmullw      xmm6,       [rdx+48]                    ; tap 4: x[ 1] * h[ 1]

        punpcklbw   xmm7,       xmm0                        ; words x[2] .. x[9]
        psrldq      xmm1,       5                           ; bytes now start at x[3]


        pmullw      xmm7,       [rdx+64]                    ; tap 5: x[ 2] * h[ 2]

        punpcklbw   xmm1,       xmm0                        ; words x[3] .. x[10]
        pmullw      xmm1,       [rdx+80]                    ; tap 6: x[ 3] * h[ 3]


        ; Saturating accumulation. paddsw can clip intermediate sums, so the
        ; order of these additions is part of the result and must be kept.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; clamp to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store
        lea         rsi,        [rsi + rax]                 ; next source row

%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif
        dec         rcx

        jnz         .filter_block1d8_h6_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_h6_sse2 applies a 6-tap filter horizontally to 16 pixels
; per row, as two independent 8-pixel halves, and stores each result as a 16-bit word
; whose value is clamped to 0..255 (32 bytes per row).
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero; it does not have to be even.
;
; Memory contract, as implied by the instructions used:
;   - each row reads src_ptr[-2 .. 18] and nothing beyond it;
;   - rows are written with movdqa at [dst] and [dst+16], so output_ptr and
;     output_width (added to the destination pointer as a stride in BYTES) must keep
;     the destination 16-byte aligned;
;   - vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
;     through SSE2 memory operands, so it must be 16-byte aligned;
;   - pixel_step is not read.
; The rounding term comes from the constant `rd`, defined outside this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(6) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(1) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(5) ;output_width (destination stride, bytes)
%endif

        pxor        xmm0,       xmm0                        ; zero, used for byte<->word unpack/pack

.filter_block1d16_h6_sse2_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes x[-2] .. x[5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes x[6] .. x[13]

        ; Load from 11 to avoid reading out of bounds.
        movq        xmm2,       MMWORD PTR [rsi +11]        ; bytes x[11] .. x[18]
        ; The lower bits are not cleared before 'or'ing with xmm1,
        ; but that is OK because the values in the overlapping positions
        ; are already equal to the ones in xmm1.
        pslldq      xmm2,       5

        por         xmm2,       xmm1                        ; xmm2 = bytes x[6] .. x[18]
        prefetcht2  [rsi+rax-2]                             ; hint for the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = bytes x[-2] .. x[13]

        ; ---- pixels 0..7 ----
        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; words x[-2] .. x[5]
        psrldq      xmm4,       1                           ; bytes now start at x[-1]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1: x[-2] * h[-2]
        punpcklbw   xmm4,       xmm0                        ; words x[-1] .. x[6]

        psrldq      xmm5,       2                           ; bytes now start at x[0]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2: x[-1] * h[-1]


        punpcklbw   xmm5,       xmm0                        ; words x[0] .. x[7]
        psrldq      xmm6,       3                           ; bytes now start at x[1]

        pmullw      xmm5,       [rdx+32]                    ; tap 3: x[ 0] * h[ 0]

        punpcklbw   xmm6,       xmm0                        ; words x[1] .. x[8]
        psrldq      xmm7,       4                           ; bytes now start at x[2]

        pmullw      xmm6,       [rdx+48]                    ; tap 4: x[ 1] * h[ 1]

        punpcklbw   xmm7,       xmm0                        ; words x[2] .. x[9]
        psrldq      xmm1,       5                           ; bytes now start at x[3]


        pmullw      xmm7,       [rdx+64]                    ; tap 5: x[ 2] * h[ 2]

        punpcklbw   xmm1,       xmm0                        ; words x[3] .. x[10]
        pmullw      xmm1,       [rdx+80]                    ; tap 6: x[ 3] * h[ 3]

        ; Saturating accumulation. paddsw can clip intermediate sums, so the
        ; order of these additions is part of the result and must be kept.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; clamp to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi],         xmm4         ; outputs 0..7

        ; ---- pixels 8..15 (same sequence, fed from xmm2) ----
        movdqa      xmm3,       xmm2
        movdqa      xmm4,       xmm2

        movdqa      xmm5,       xmm2
        movdqa      xmm6,       xmm2

        movdqa      xmm7,       xmm2

        punpcklbw   xmm3,       xmm0                        ; words x[6] .. x[13]
        psrldq      xmm4,       1                           ; bytes now start at x[7]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1
        punpcklbw   xmm4,       xmm0                        ; words x[7] .. x[14]

        psrldq      xmm5,       2                           ; bytes now start at x[8]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2


        punpcklbw   xmm5,       xmm0                        ; words x[8] .. x[15]
        psrldq      xmm6,       3                           ; bytes now start at x[9]

        pmullw      xmm5,       [rdx+32]                    ; tap 3

        punpcklbw   xmm6,       xmm0                        ; words x[9] .. x[16]
        psrldq      xmm7,       4                           ; bytes now start at x[10]

        pmullw      xmm6,       [rdx+48]                    ; tap 4

        punpcklbw   xmm7,       xmm0                        ; words x[10] .. x[17]
        psrldq      xmm2,       5                           ; bytes now start at x[11]

        pmullw      xmm7,       [rdx+64]                    ; tap 5

        punpcklbw   xmm2,       xmm0                        ; words x[11] .. x[18]
        pmullw      xmm2,       [rdx+80]                    ; tap 6


        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm2
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; clamp to 0..255
        punpcklbw   xmm4,       xmm0                        ; widen back to 8 words

        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; outputs 8..15

        lea         rsi,        [rsi + rax]                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(5) ;[output_width]
%else
        add         rdi,        r8
%endif

        dec         rcx
        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d8_v6_sse2 applies a 6-tap filter vertically to rows of
; eight 16-bit samples and writes 8 clamped bytes per output row.
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero.
;
; Memory contract, as implied by the instructions used:
;   - pixels_per_line is applied to src_ptr as a stride in BYTES; for each output row
;     the six source rows at offsets -2 .. +3 strides are read, 16 bytes each, with
;     movdqa, so src_ptr and pixels_per_line must keep every row 16-byte aligned;
;   - vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
;     through SSE2 memory operands, so it must be 16-byte aligned;
;   - the destination is written with an 8-byte movq and advanced by dst_pitch bytes;
;   - pixel_step and output_width are not read.
; The rounding term comes from the constant `rd`, defined outside this routine.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(7) ;vp8_filter
        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line (source stride, bytes)

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        sub         rsi,        rdx                         ; back up two rows so that
        sub         rsi,        rdx                         ; rsi points at the tap-1 row

        movsxd      rcx,        DWORD PTR arg(5) ;[output_height] (row counter)
        pxor        xmm0,       xmm0                        ; zero, used by packuswb

        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding term, kept in xmm7
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(2) ; dst_pitch
%endif

.vp8_filter_block1d8_v6_sse2_loop:
        movdqa      xmm1,       XMMWORD PTR [rsi]           ; row 1
        pmullw      xmm1,       [rax]                       ; * tap 1

        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row 2
        pmullw      xmm2,       [rax + 16]                  ; * tap 2

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row 3
        pmullw      xmm3,       [rax + 32]                  ; * tap 3

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row 5
        pmullw      xmm5,       [rax + 64]                  ; * tap 5

        add         rsi,        rdx                         ; advance one row; rows 4 and 6
                                                            ; are now at *2 and *4
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row 4

        pmullw      xmm4,       [rax + 48]                  ; * tap 4
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row 6

        pmullw      xmm6,       [rax + 80]                  ; * tap 6

        ; Saturating accumulation. paddsw can clip intermediate sums, so the
        ; order of these additions is part of the result and must be kept.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3

        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4

        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7                        ; rounding term

        psraw       xmm2,       7                           ; >> 7 (VP8_FILTER_SHIFT)
        packuswb    xmm2,       xmm0              ; pack and saturate

        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
        add         rdi,        r8
%endif
        dec         rcx         ; decrement count
        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: vp8_filter_block1d16_v6_sse2 applies a 6-tap filter vertically to rows of
; sixteen 16-bit samples and writes 16 clamped bytes per output row.
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero.
;
; Memory contract, as implied by the instructions used:
;   - pixels_per_line is applied to src_ptr as a stride in BYTES; for each output row
;     the six source rows at offsets -2 .. +3 strides are read, 32 bytes each, with
;     movdqa, so src_ptr and pixels_per_line must keep every row 16-byte aligned;
;   - vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
;     through SSE2 memory operands, so it must be 16-byte aligned;
;   - the destination is written with movdqa and advanced by dst_pitch bytes, so
;     output_ptr and dst_pitch must keep it 16-byte aligned;
;   - pixel_step and output_width are not read.
; The rounding term comes from the constant `rd`, defined outside this routine.
; All eight xmm registers carry data inside the loop, so `rd` is reloaded into xmm7
; on every iteration.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,        arg(7) ;vp8_filter
        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line (source stride, bytes)

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        sub         rsi,        rdx                         ; back up two rows so that
        sub         rsi,        rdx                         ; rsi points at the tap-1 row

        movsxd      rcx,        DWORD PTR arg(5) ;[output_height] (row counter)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(2) ; dst_pitch
%endif

.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; xmm1 accumulates samples 0..7 and xmm2 accumulates samples 8..15.
        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
        pmullw      xmm1,       [rax + 16]
        pmullw      xmm2,       [rax + 16]

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm3,       [rax + 64]
        pmullw      xmm4,       [rax + 64]

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm5,       [rax + 32]
        pmullw      xmm6,       [rax + 32]

        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
        pmullw      xmm7,       [rax]
        pmullw      xmm0,       [rax]

        paddsw      xmm1,       xmm3
        paddsw      xmm2,       xmm4
        paddsw      xmm1,       xmm5
        paddsw      xmm2,       xmm6
        paddsw      xmm1,       xmm7
        paddsw      xmm2,       xmm0

        add         rsi,        rdx                         ; advance one row; lines 4 and 6
                                                            ; are now at *2 and *4

        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
        pmullw      xmm3,       [rax + 48]
        pmullw      xmm4,       [rax + 48]

        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
        pmullw      xmm5,       [rax + 80]
        pmullw      xmm6,       [rax + 80]

        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding term
        ; No zero register is needed here: packuswb below packs xmm1 with xmm2,
        ; and xmm0 is reloaded with line-1 data at the top of the next iteration.

        paddsw      xmm1,       xmm3
        paddsw      xmm2,       xmm4
        paddsw      xmm1,       xmm5
        paddsw      xmm2,       xmm6

        paddsw      xmm1,       xmm7
        paddsw      xmm2,       xmm7

        psraw       xmm1,       7                           ; >> 7 (VP8_FILTER_SHIFT)
        psraw       xmm2,       7

        packuswb    xmm1,       xmm2              ; pack and saturate
        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
        add         rdi,        r8
%endif
        dec         rcx         ; decrement count
        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Applies the 6-tap filter horizontally to 8 pixels per row and writes the 8 clamped
; bytes straight to the destination (movq, so no destination alignment is required).
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero.
; Each row reads src_ptr[-2 .. 13] (only src_ptr[-2 .. 10] affect the result).
; vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
; through SSE2 memory operands, so it must be 16-byte aligned.
; The rounding term comes from the constant `rd`, defined outside this routine.
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(5) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(2) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif
        pxor        xmm0,       xmm0                        ; zero, used for byte<->word unpack/pack

.filter_block1d8_h6_only_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes x[-2] .. x[5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes x[6] .. x[13]

        prefetcht2  [rsi+rax-2]                             ; hint for the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = bytes x[-2] .. x[13]

        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; words x[-2] .. x[5]
        psrldq      xmm4,       1                           ; bytes now start at x[-1]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1: x[-2] * h[-2]
        punpcklbw   xmm4,       xmm0                        ; words x[-1] .. x[6]

        psrldq      xmm5,       2                           ; bytes now start at x[0]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2: x[-1] * h[-1]


        punpcklbw   xmm5,       xmm0                        ; words x[0] .. x[7]
        psrldq      xmm6,       3                           ; bytes now start at x[1]

        pmullw      xmm5,       [rdx+32]                    ; tap 3: x[ 0] * h[ 0]

        punpcklbw   xmm6,       xmm0                        ; words x[1] .. x[8]
        psrldq      xmm7,       4                           ; bytes now start at x[2]

        pmullw      xmm6,       [rdx+48]                    ; tap 4: x[ 1] * h[ 1]

        punpcklbw   xmm7,       xmm0                        ; words x[2] .. x[9]
        psrldq      xmm1,       5                           ; bytes now start at x[3]


        pmullw      xmm7,       [rdx+64]                    ; tap 5: x[ 2] * h[ 2]

        punpcklbw   xmm1,       xmm0                        ; words x[3] .. x[10]
        pmullw      xmm1,       [rdx+80]                    ; tap 6: x[ 3] * h[ 3]


        ; Saturating accumulation. paddsw can clip intermediate sums, so the
        ; order of these additions is part of the result and must be kept.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; clamp to 0..255, 8 bytes

        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
        lea         rsi,        [rsi + rax]                 ; next source row

%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
        add         rdi,        r8
%endif
        dec         rcx

        jnz         .filter_block1d8_h6_only_rowloop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
;
; Applies the 6-tap filter horizontally to 16 pixels per row, as two independent
; 8-pixel halves, and writes the 16 clamped bytes straight to the destination with two
; movq stores (no destination alignment is required).
; ONE row is produced per loop iteration. The loop is a dec/jnz countdown, so
; output_height must be non-zero.
; Each row reads src_ptr[-2 .. 18] and nothing beyond it.
; vp8_filter is read as six 16-byte rows of 16-bit multipliers, one row per tap,
; through SSE2 memory operands, so it must be 16-byte aligned.
; The rounding term comes from the constant `rd`, defined outside this routine.
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,        arg(5) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr

        mov         rdi,        arg(2) ;output_ptr

        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif

        pxor        xmm0,       xmm0                        ; zero, used for byte<->word unpack/pack

.filter_block1d16_h6_only_sse2_rowloop:
        movq        xmm3,       MMWORD PTR [rsi - 2]        ; bytes x[-2] .. x[5]
        movq        xmm1,       MMWORD PTR [rsi + 6]        ; bytes x[6] .. x[13]

        ; Load from 11 rather than 14: the last byte the filter consumes is x[18]
        ; (tap 6 of output pixel 15), so an 8-byte load at +14 would read three
        ; bytes (x[19] .. x[21]) that are never used and may lie out of bounds.
        movq        xmm2,       MMWORD PTR [rsi +11]        ; bytes x[11] .. x[18]
        ; The lower bits are not cleared before 'or'ing with xmm1,
        ; but that is OK because the values in the overlapping positions
        ; (x[11] .. x[13]) are already equal to the ones in xmm1.
        pslldq      xmm2,       5

        por         xmm2,       xmm1                        ; xmm2 = bytes x[6] .. x[18]
        prefetcht2  [rsi+rax-2]                             ; hint for the next source row

        pslldq      xmm1,       8
        por         xmm1,       xmm3                        ; xmm1 = bytes x[-2] .. x[13]

        ; ---- pixels 0..7 ----
        movdqa      xmm4,       xmm1
        movdqa      xmm5,       xmm1

        movdqa      xmm6,       xmm1
        movdqa      xmm7,       xmm1

        punpcklbw   xmm3,       xmm0                        ; words x[-2] .. x[5]
        psrldq      xmm4,       1                           ; bytes now start at x[-1]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1: x[-2] * h[-2]
        punpcklbw   xmm4,       xmm0                        ; words x[-1] .. x[6]

        psrldq      xmm5,       2                           ; bytes now start at x[0]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2: x[-1] * h[-1]

        punpcklbw   xmm5,       xmm0                        ; words x[0] .. x[7]
        psrldq      xmm6,       3                           ; bytes now start at x[1]

        pmullw      xmm5,       [rdx+32]                    ; tap 3: x[ 0] * h[ 0]

        punpcklbw   xmm6,       xmm0                        ; words x[1] .. x[8]
        psrldq      xmm7,       4                           ; bytes now start at x[2]

        pmullw      xmm6,       [rdx+48]                    ; tap 4: x[ 1] * h[ 1]

        punpcklbw   xmm7,       xmm0                        ; words x[2] .. x[9]
        psrldq      xmm1,       5                           ; bytes now start at x[3]

        pmullw      xmm7,       [rdx+64]                    ; tap 5: x[ 2] * h[ 2]

        punpcklbw   xmm1,       xmm0                        ; words x[3] .. x[10]
        pmullw      xmm1,       [rdx+80]                    ; tap 6: x[ 3] * h[ 3]

        ; Saturating accumulation. paddsw can clip intermediate sums, so the
        ; order of these additions is part of the result and must be kept.
        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm1
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; lower 8 bytes

        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination

        ; ---- pixels 8..15 (same sequence, fed from xmm2) ----
        movdqa      xmm3,       xmm2
        movdqa      xmm4,       xmm2

        movdqa      xmm5,       xmm2
        movdqa      xmm6,       xmm2

        movdqa      xmm7,       xmm2

        punpcklbw   xmm3,       xmm0                        ; words x[6] .. x[13]
        psrldq      xmm4,       1                           ; bytes now start at x[7]

        pmullw      xmm3,       XMMWORD PTR [rdx]           ; tap 1
        punpcklbw   xmm4,       xmm0                        ; words x[7] .. x[14]

        psrldq      xmm5,       2                           ; bytes now start at x[8]
        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; tap 2

        punpcklbw   xmm5,       xmm0                        ; words x[8] .. x[15]
        psrldq      xmm6,       3                           ; bytes now start at x[9]

        pmullw      xmm5,       [rdx+32]                    ; tap 3

        punpcklbw   xmm6,       xmm0                        ; words x[9] .. x[16]
        psrldq      xmm7,       4                           ; bytes now start at x[10]

        pmullw      xmm6,       [rdx+48]                    ; tap 4

        punpcklbw   xmm7,       xmm0                        ; words x[10] .. x[17]
        psrldq      xmm2,       5                           ; bytes now start at x[11]

        pmullw      xmm7,       [rdx+64]                    ; tap 5

        punpcklbw   xmm2,       xmm0                        ; words x[11] .. x[18]
        pmullw      xmm2,       [rdx+80]                    ; tap 6

        paddsw      xmm4,       xmm7
        paddsw      xmm4,       xmm5

        paddsw      xmm4,       xmm3
        paddsw      xmm4,       xmm6

        paddsw      xmm4,       xmm2
        paddsw      xmm4,       [GLOBAL(rd)]                ; rounding term

        psraw       xmm4,       7                           ; >> 7 (VP8_FILTER_SHIFT)

        packuswb    xmm4,       xmm0                        ; higher 8 bytes

        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination

        lea         rsi,        [rsi + rax]                 ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
%else
        add         rdi,        r8
%endif

        dec         rcx
        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int output_height,
;    const short    *vp8_filter
;)
; Vertical-only 6-tap filter over a column 8 pixels wide; this is the
; second pass run on its own when xoffset==0.
;
; Every output row is produced from six consecutive source rows, the first
; of which is the row src_ptr currently points at. vp8_filter is read as
; six 16-byte vectors of 16-bit coefficients, one vector per tap.
;
; Register roles inside the loop:
;   rsi  = current source row          rdi = current destination row
;   rdx  = source stride               rcx = rows still to produce
;   rax  = filter taps                 r8  = destination pitch (64-bit only)
;   xmm0 = zero (for byte->word unpack and the final pack)
;   xmm7 = rounding bias loaded from rd
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        pxor        xmm0,       xmm0                        ; zero vector
        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding bias

        mov         rax,        arg(5) ;vp8_filter
        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(2) ;output_ptr

        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
        movsxd      rcx,        dword ptr arg(4) ;output_height
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif

.v6_only_8_row:
        ; Rows 0, 1, 2 and 4 are reachable with scale factors 1/2/4 from
        ; the current row pointer.
        movq        xmm1,       MMWORD PTR [rsi]            ; row 0
        movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1
        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2
        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4

        ; Stepping down one row makes rows 3 and 5 addressable, and doubles
        ; as the per-iteration source advance.
        add         rsi,        rdx
        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3
        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5

        ; Widen all six rows from bytes to words.
        punpcklbw   xmm1,       xmm0
        punpcklbw   xmm2,       xmm0
        punpcklbw   xmm3,       xmm0
        punpcklbw   xmm4,       xmm0
        punpcklbw   xmm5,       xmm0
        punpcklbw   xmm6,       xmm0

        ; Multiply each row by its tap.
        pmullw      xmm1,       [rax]                       ; tap 1
        pmullw      xmm2,       [rax + 16]                  ; tap 2
        pmullw      xmm3,       [rax + 32]                  ; tap 3
        pmullw      xmm4,       [rax + 48]                  ; tap 4
        pmullw      xmm5,       [rax + 64]                  ; tap 5
        pmullw      xmm6,       [rax + 80]                  ; tap 6

        ; Saturating adds are not associative, so the summation order
        ; below is significant and must not be rearranged.
        paddsw      xmm2,       xmm5
        paddsw      xmm2,       xmm3
        paddsw      xmm2,       xmm1
        paddsw      xmm2,       xmm4
        paddsw      xmm2,       xmm6
        paddsw      xmm2,       xmm7                        ; + rounding bias

        psraw       xmm2,       VP8_FILTER_SHIFT
        packuswb    xmm2,       xmm0                        ; clamp to 0..255

        movq        QWORD PTR [rdi], xmm2                   ; write 8 output pixels

%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(3) ;dst_pitch
%else
        add         rdi,        r8
%endif
        dec         rcx
        jnz         .v6_only_8_row                          ; more rows to do

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
902
903
;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens 16 source bytes per row to 16 zero-extended words; no filtering
; is applied.
;
; For each of output_height rows, 16 bytes are read from the source row and
; 32 bytes are written to the destination row. The destination pointer is
; then advanced by output_width (added to the pointer as a byte count) and
; the source pointer by src_pixels_per_line.
; The stores use movdqa, so each destination row must be 16-byte aligned.
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        pxor        xmm0,       xmm0                        ; zero, used as the high byte of each word

        mov         rdi,        arg(1) ;output_ptr
        mov         rsi,        arg(0) ;src_ptr

        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line (source stride)
        movsxd      rcx,        dword ptr arg(3) ;output_height (row counter)
%if ABI_IS_32BIT=0
        movsxd      r8,         dword ptr arg(4) ;output_width (destination advance)
%endif

.unpack16_next_row:
        movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7
        movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15

        punpcklbw   xmm1,       xmm0                        ; bytes 0..7  -> 8 words
        punpcklbw   xmm3,       xmm0                        ; bytes 8..15 -> 8 words

        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
        movdqa      XMMWORD Ptr [rdi],         xmm1

        add         rsi,        rax                         ; next source row
%if ABI_IS_32BIT
        add         rdi,        DWORD Ptr arg(4) ;[output_width]
%else
        add         rdi,        r8
%endif
        dec         rcx
        jnz         .unpack16_next_row                      ; more rows to do

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
959
960
;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 two-tap (bilinear) prediction.
;
; xoffset / yoffset index vp8_bilinear_filters_x86_8; each table entry is
; 32 bytes (two 16-byte vectors of 16-bit taps, hence the "shl 5").
; Three code paths are selected from the offsets:
;   xoffset == 0                -> .b16x16_sp_only  (vertical pass only)
;   xoffset != 0, yoffset == 0  -> .b16x16_fp_only  (horizontal pass only)
;   both non-zero               -> fall-through two-pass loop
; Every path writes 16 rows: the loops stop when rdi reaches
; dst_ptr + 16 * dst_pitch (held in rcx).
;
; Memory access notes:
;   * the horizontal pass reads 17 bytes per source row ([rsi] and [rsi+1]);
;   * the vertical pass reads 17 source rows;
;   * destination rows are stored with movdqa and therefore must be
;     16-byte aligned.
extern sym(vp8_bilinear_filters_x86_8)
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        movsxd      rax,        dword ptr arg(2) ;xoffset

        test        rax,        rax    ;skip first_pass filter if xoffset=0
        jz          .b16x16_sp_only

        shl         rax,        5      ;32 bytes per filter table entry
        add         rax,        rcx    ;HFilter

        mov         rdi,        arg(4) ;dst_ptr
        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        movdqa      xmm1,       [rax]               ; horizontal tap applied to pixel x
        movdqa      xmm2,       [rax+16]            ; horizontal tap applied to pixel x+1

        movsxd      rax,        dword ptr arg(3) ;yoffset

        test        rax,        rax    ;skip second_pass filter if yoffset=0
        jz          .b16x16_fp_only

        shl         rax,        5      ;32 bytes per filter table entry
        add         rax,        rcx    ;VFilter

        ; rcx = dst_ptr + 16 * dst_pitch: loop termination address
        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

        pxor        xmm0,       xmm0

%if ABI_IS_32BIT=0
        ; rdx now holds the source stride, so keep dst_pitch in r8
        movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif
        ; get the first horizontal line done
        movdqu      xmm3,       [rsi]               ; source bytes 0..15
        movdqa      xmm4,       xmm3                 ; make a copy of current line

        punpcklbw   xmm3,       xmm0                 ; bytes 0..7  as words
        punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm1

        movdqu      xmm5,       [rsi+1]              ; source bytes 1..16
        movdqa      xmm6,       xmm5

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm5,       xmm2
        pmullw      xmm6,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        ; xmm7 carries the previous row's horizontal result, packed to bytes
        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm4

        add         rsi,        rdx                 ; next line
.next_row:
        ; horizontal pass on the current source row
        movdqu      xmm3,       [rsi]               ; source bytes 0..15
        movdqa      xmm4,       xmm3                 ; make a copy of current line

        punpcklbw   xmm3,       xmm0                 ; bytes 0..7  as words
        punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm1

        movdqu      xmm5,       [rsi+1]              ; source bytes 1..16
        movdqa      xmm6,       xmm5

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm5,       xmm2
        pmullw      xmm6,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        ; previous row (xmm7) widened and weighted by the first vertical tap
        movdqa      xmm5,       xmm7
        movdqa      xmm6,       xmm7

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm5,       [rax]
        pmullw      xmm6,       [rax]

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        ; remember this row's horizontal result for the next iteration
        movdqa      xmm7,       xmm3
        packuswb    xmm7,       xmm4

        ; current row weighted by the second vertical tap
        pmullw      xmm3,       [rax+16]
        pmullw      xmm4,       [rax+16]

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
        add         rdi,        DWORD PTR arg(5) ;dst_pitch
%else
        add         rdi,        r8
%endif

        cmp         rdi,        rcx
        jne         .next_row

        jmp         .done

.b16x16_sp_only:
        ; vertical pass only; rcx still holds the filter table base here
        movsxd      rax,        dword ptr arg(3) ;yoffset
        shl         rax,        5      ;32 bytes per filter table entry
        add         rax,        rcx    ;VFilter

        mov         rdi,        arg(4) ;dst_ptr
        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        movdqa      xmm1,       [rax]               ; vertical tap applied to the previous row
        movdqa      xmm2,       [rax+16]            ; vertical tap applied to the current row

        ; rcx = dst_ptr + 16 * dst_pitch: loop termination address
        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

        pxor        xmm0,       xmm0

        ; prime xmm7 with the first source row (raw bytes)
        movdqu      xmm7,       [rsi]               ; source bytes 0..15

        add         rsi,        rax                 ; next line
.next_row_spo:
        movdqu      xmm3,       [rsi]               ; source bytes 0..15 of the current row

        movdqa      xmm5,       xmm7
        movdqa      xmm6,       xmm7

        movdqa      xmm4,       xmm3                 ; make a copy of current line
        movdqa      xmm7,       xmm3                 ; becomes "previous row" next iteration

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0
        punpcklbw   xmm3,       xmm0                 ; bytes 0..7  as words
        punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words

        pmullw      xmm5,       xmm1
        pmullw      xmm6,       xmm1
        pmullw      xmm3,       xmm2
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rax                 ; next line
        add         rdi,        rdx                 ;dst_pitch
        cmp         rdi,        rcx
        jne         .next_row_spo

        jmp         .done

.b16x16_fp_only:
        ; horizontal pass only; rsi, rdi, rdx (dst_pitch), xmm1 and xmm2
        ; were already set up before the yoffset test
        lea         rcx,        [rdi+rdx*8]
        lea         rcx,        [rcx+rdx*8]
        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
        pxor        xmm0,       xmm0

.next_row_fpo:
        movdqu      xmm3,       [rsi]               ; source bytes 0..15
        movdqa      xmm4,       xmm3                 ; make a copy of current line

        punpcklbw   xmm3,       xmm0                 ; bytes 0..7  as words
        punpckhbw   xmm4,       xmm0                 ; bytes 8..15 as words

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm1

        movdqu      xmm5,       [rsi+1]              ; source bytes 1..16
        movdqa      xmm6,       xmm5

        punpcklbw   xmm5,       xmm0
        punpckhbw   xmm6,       xmm0

        pmullw      xmm5,       xmm2
        pmullw      xmm6,       xmm2

        paddw       xmm3,       xmm5
        paddw       xmm4,       xmm6

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        packuswb    xmm3,       xmm4
        movdqa      [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        rax                 ; next line
        add         rdi,        rdx                 ; dst_pitch
        cmp         rdi,        rcx
        jne         .next_row_fpo

.done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1228
1229
;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 two-tap (bilinear) prediction, horizontal then vertical.
;
; Nine source rows (16 bytes each, of which bytes 0..8 are used) are first
; copied into a 16-byte-aligned 144-byte scratch buffer on the stack, then
; filtered from there with aligned loads.
;
; xoffset / yoffset index vp8_bilinear_filters_x86_8; each table entry is
; 32 bytes (two 16-byte vectors of 16-bit taps, hence the "shl 5").
;
; Register roles in the filter loop:
;   rsi  = current row inside the scratch buffer
;   rdi  = current destination row      rdx = dst_pitch
;   rcx  = dst_ptr + 8 * dst_pitch (loop termination address)
;   xmm1/xmm2 = horizontal taps         xmm5/xmm6 = vertical taps
;   xmm7 = previous row's horizontal result (kept as 16-bit words)
;   xmm0 = zero
;
; The scratch buffer is walked with rsi rather than rsp, so rsp stays fixed
; for the whole body and the epilog releases the buffer explicitly. Restoring
; the stack therefore does not depend on how many rows the loop executed.
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; NOTE(review): ALIGN_STACK comes from the included ABI header; the
    ; "pop rsp" in the epilog relies on it leaving the pre-alignment rsp on
    ; top of the stack - confirm against the macro definition.
    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 9 rows * 16 bytes

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        mov         rsi,        arg(0) ;src_ptr
        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
        movdqu      xmm0,       [rsi]
        lea         rax,        [rdx + rdx*2]    ; rax = 3 * source stride
        movdqu      xmm1,       [rsi+rdx]
        movdqu      xmm2,       [rsi+rdx*2]
        add         rsi,        rax
        movdqu      xmm3,       [rsi]
        movdqu      xmm4,       [rsi+rdx]
        movdqu      xmm5,       [rsi+rdx*2]
        add         rsi,        rax
        movdqu      xmm6,       [rsi]
        movdqu      xmm7,       [rsi+rdx]

        movdqa      XMMWORD PTR [rsp],            xmm0

        movdqu      xmm0,       [rsi+rdx*2]      ; ninth row; xmm0 was freed by the store above

        ; the source has been fully read: rsi now walks the scratch buffer
        mov         rsi,        rsp

        movdqa      XMMWORD PTR [rsp+16],         xmm1
        movdqa      XMMWORD PTR [rsp+32],         xmm2
        movdqa      XMMWORD PTR [rsp+48],         xmm3
        movdqa      XMMWORD PTR [rsp+64],         xmm4
        movdqa      XMMWORD PTR [rsp+80],         xmm5
        movdqa      XMMWORD PTR [rsp+96],         xmm6
        movdqa      XMMWORD PTR [rsp+112],        xmm7
        movdqa      XMMWORD PTR [rsp+128],        xmm0

        movsxd      rax,        dword ptr arg(2) ;xoffset
        shl         rax,        5      ;32 bytes per filter table entry
        add         rax,        rcx    ;HFilter

        mov         rdi,        arg(4) ;dst_ptr
        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

        movdqa      xmm1,       [rax]
        movdqa      xmm2,       [rax+16]

        movsxd      rax,        dword ptr arg(3) ;yoffset
        shl         rax,        5      ;32 bytes per filter table entry
        add         rax,        rcx    ;VFilter

        lea         rcx,        [rdi+rdx*8]      ; loop termination address

        movdqa      xmm5,       [rax]
        movdqa      xmm6,       [rax+16]

        pxor        xmm0,       xmm0

        ; get the first horizontal line done
        movdqa      xmm3,       XMMWORD PTR [rsi]
        movdqa      xmm4,       xmm3                 ; make a copy of current line
        psrldq      xmm4,       1                    ; shift so byte 1 lines up with byte 0

        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        movdqa      xmm7,       xmm3
        add         rsi,        16                 ; next buffered line
.next_row8x8:
        movdqa      xmm3,       XMMWORD PTR [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
        movdqa      xmm4,       xmm3                 ; make a copy of current line
        psrldq      xmm4,       1

        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4
        pmullw      xmm7,       xmm5                 ; previous row * first vertical tap

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        movdqa      xmm4,       xmm3                 ; keep horizontal result for the next row

        pmullw      xmm3,       xmm6                 ; current row * second vertical tap
        paddw       xmm3,       xmm7

        movdqa      xmm7,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

        packuswb    xmm3,       xmm0
        movq        [rdi],      xmm3                 ; store the results in the destination

        add         rsi,        16                 ; next buffered line
        add         rdi,        rdx

        cmp         rdi,        rcx
        jne         .next_row8x8

    add         rsp, 144                         ; release the scratch buffer
    pop rsp                                      ; undo ALIGN_STACK
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1372
1373
SECTION_RODATA
align 16
; Rounding bias added to every filter sum before the arithmetic shift right
; by VP8_FILTER_SHIFT: half of VP8_FILTER_WEIGHT (0x40), replicated across
; all eight 16-bit lanes.
rd:
    times 8 dw (VP8_FILTER_WEIGHT >> 1)
1378