• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; Filter constants. VP8_FILTER_WEIGHT (128) equals 1 << VP8_FILTER_SHIFT (7).
; The routines visible in this part of the file use the literal 7 in their psraw
; instructions instead of VP8_FILTER_SHIFT, and none of them reference
; BLOCK_HEIGHT_WIDTH.
14%define BLOCK_HEIGHT_WIDTH 4
15%define VP8_FILTER_WEIGHT 128
16%define VP8_FILTER_SHIFT  7
17
18SECTION .text
19
20;/************************************************************************************
21; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
22; input pixel array has output_height rows. The row loop below handles one row per
23; iteration and does not itself rely on output_height being even. Each iteration
24; computes 8 output pixels with 128-bit operations on eight 16-bit lanes.
25;*************************************************************************************/
26;void vp8_filter_block1d8_h6_sse2
27;(
28;    unsigned char  *src_ptr,
29;    unsigned short *output_ptr,
30;    unsigned int    src_pixels_per_line,
31;    unsigned int    pixel_step,
32;    unsigned int    output_height,
33;    unsigned int    output_width,
34;    short           *vp8_filter
35;)
; Register roles inside the row loop:
;   rsi  = current source row           rdi = current output row
;   rdx  = filter table (six 16-byte vectors at offsets 0,16,...,80; presumably
;          tap k replicated across all 8 words -- confirm against the caller)
;   rax  = source stride in bytes       rcx = rows remaining
;   r8   = output_width, added to rdi as a byte offset (64-bit builds only;
;          32-bit builds re-read arg(5) every row)
;   xmm0 = zero, used to widen bytes to words and as the upper half for packuswb
; Each output row is 8 words written with movdqa, so rdi must stay 16-byte aligned;
; the pmullw memory operands likewise need a 16-byte aligned filter table.
; pixel_step (arg 3) is never read. output_height must be non-zero because the
; counter is decremented before it is tested.
36global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
37sym(vp8_filter_block1d8_h6_sse2):
38    push        rbp
39    mov         rbp, rsp
40    SHADOW_ARGS_TO_STACK 7
41    SAVE_XMM 7
42    GET_GOT     rbx
43    push        rsi
44    push        rdi
45    ; end prolog
46
47        mov         rdx,        arg(6) ;vp8_filter
48        mov         rsi,        arg(0) ;src_ptr
49
50        mov         rdi,        arg(1) ;output_ptr
51
52        movsxd      rcx,        dword ptr arg(4) ;output_height
53        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
54%if ABI_IS_32BIT=0
55        movsxd      r8,         dword ptr arg(5) ;output_width
56%endif
57        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
58
59.filter_block1d8_h6_rowloop:
        ; Load source pixels -2..5 and 6..13. Only pixels -2..10 feed the taps
        ; below; pixels 11..13 are read but never contribute to the output.
60        movq        xmm3,       MMWORD PTR [rsi - 2]
61        movq        xmm1,       MMWORD PTR [rsi + 6]
62
        ; Prefetch the start of the next source row.
63        prefetcht2  [rsi+rax-2]
64
        ; xmm1 = pixels -2..13 in bytes 0..15.
65        pslldq      xmm1,       8
66        por         xmm1,       xmm3
67
68        movdqa      xmm4,       xmm1
69        movdqa      xmm5,       xmm1
70
71        movdqa      xmm6,       xmm1
72        movdqa      xmm7,       xmm1
73
        ; For each tap: shift the 16-pixel window right by the tap offset, widen the
        ; low 8 bytes to words, and multiply by that tap's coefficient vector.
74        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
75        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
76
77        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
78        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
79
80        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
81        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
82
83
84        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
85        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
86
87        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
88
89        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
90        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
91
92        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
93
94        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
95        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
96
97
98        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
99
100        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
101        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
102
103
        ; Accumulate the six products with signed saturation. Saturating adds are
        ; not associative, so the order (taps 2,5,3,1,4,6) is part of the result.
104        paddsw      xmm4,       xmm7
105        paddsw      xmm4,       xmm5
106
107        paddsw      xmm4,       xmm3
108        paddsw      xmm4,       xmm6
109
110        paddsw      xmm4,       xmm1
        ; rd is defined outside this view; presumably the per-lane rounding term.
111        paddsw      xmm4,       [GLOBAL(rd)]
112
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
113        psraw       xmm4,       7
114
        ; Clamp to 0..255 via unsigned byte saturation, then widen back to words
        ; because this pass writes a 16-bit intermediate buffer.
115        packuswb    xmm4,       xmm0
116        punpcklbw   xmm4,       xmm0
117
118        movdqa      XMMWORD Ptr [rdi],         xmm4
119        lea         rsi,        [rsi + rax]
120
121%if ABI_IS_32BIT
122        add         rdi,        DWORD Ptr arg(5) ;[output_width]
123%else
124        add         rdi,        r8
125%endif
126        dec         rcx
127
128        jnz         .filter_block1d8_h6_rowloop                ; next row
129
130    ; begin epilog
131    pop rdi
132    pop rsi
133    RESTORE_GOT
134    RESTORE_XMM
135    UNSHADOW_ARGS
136    pop         rbp
137    ret
138
139
140;void vp8_filter_block1d16_h6_sse2
141;(
142;    unsigned char  *src_ptr,
143;    unsigned short *output_ptr,
144;    unsigned int    src_pixels_per_line,
145;    unsigned int    pixel_step,
146;    unsigned int    output_height,
147;    unsigned int    output_width,
148;    short           *vp8_filter
149;)
150;/************************************************************************************
151; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
152; input pixel array has output_height rows. The row loop below handles one row per
153; iteration and does not itself rely on output_height being even. Each iteration
154; computes 16 output pixels as two 8-pixel halves of eight 16-bit lanes each.
155;*************************************************************************************/
; Register roles inside the row loop:
;   rsi  = current source row           rdi = current output row
;   rdx  = filter table (six 16-byte vectors at offsets 0,16,...,80; presumably
;          tap k replicated across all 8 words -- confirm against the caller)
;   rax  = source stride in bytes       rcx = rows remaining
;   r8   = output_width, added to rdi as a byte offset (64-bit builds only;
;          32-bit builds re-read arg(5) every row)
;   xmm0 = zero, xmm1 = source pixels -2..13, xmm2 = source pixels 6..18
; Each row stores two 16-byte vectors with movdqa ([rdi] and [rdi+16]), so rdi must
; stay 16-byte aligned. pixel_step (arg 3) is never read. output_height must be
; non-zero because the counter is decremented before it is tested.
156global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
157sym(vp8_filter_block1d16_h6_sse2):
158    push        rbp
159    mov         rbp, rsp
160    SHADOW_ARGS_TO_STACK 7
161    SAVE_XMM 7
162    GET_GOT     rbx
163    push        rsi
164    push        rdi
165    ; end prolog
166
167        mov         rdx,        arg(6) ;vp8_filter
168        mov         rsi,        arg(0) ;src_ptr
169
170        mov         rdi,        arg(1) ;output_ptr
171
172        movsxd      rcx,        dword ptr arg(4) ;output_height
173        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
174%if ABI_IS_32BIT=0
175        movsxd      r8,         dword ptr arg(5) ;output_width
176%endif
177
178        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
179
180.filter_block1d16_h6_sse2_rowloop:
        ; Load source pixels -2..5 and 6..13.
181        movq        xmm3,       MMWORD PTR [rsi - 2]
182        movq        xmm1,       MMWORD PTR [rsi + 6]
183
184        ; Load from 11 to avoid reading out of bounds.
        ; This load covers pixels 11..18; pixel 18 is the last one tap 6 needs.
185        movq        xmm2,       MMWORD PTR [rsi +11]
186        ; The lower bits are not cleared before 'or'ing with xmm1,
187        ; but that is OK because the values in the overlapping positions
188        ; are already equal to the ones in xmm1.
189        pslldq      xmm2,       5
190
        ; xmm2 = pixels 6..18 in bytes 0..12 (bytes 13..15 are zero and unused).
191        por         xmm2,       xmm1
        ; Prefetch the start of the next source row.
192        prefetcht2  [rsi+rax-2]
193
        ; xmm1 = pixels -2..13 in bytes 0..15.
194        pslldq      xmm1,       8
195        por         xmm1,       xmm3
196
197        movdqa      xmm4,       xmm1
198        movdqa      xmm5,       xmm1
199
200        movdqa      xmm6,       xmm1
201        movdqa      xmm7,       xmm1
202
        ; First half: output pixels 0..7.
203        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
204        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
205
206        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
207        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
208
209        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
210        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
211
212
213        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
214        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
215
216        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
217
218        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
219        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
220
221        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
222
223        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
224        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
225
226
227        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
228
229        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
230        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
231
        ; Accumulate with signed saturation. Saturating adds are not associative,
        ; so the order (taps 2,5,3,1,4,6) is part of the result.
232        paddsw      xmm4,       xmm7
233        paddsw      xmm4,       xmm5
234
235        paddsw      xmm4,       xmm3
236        paddsw      xmm4,       xmm6
237
238        paddsw      xmm4,       xmm1
        ; rd is defined outside this view; presumably the per-lane rounding term.
239        paddsw      xmm4,       [GLOBAL(rd)]
240
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
241        psraw       xmm4,       7
242
        ; Clamp to 0..255, then widen back to words for the 16-bit output buffer.
243        packuswb    xmm4,       xmm0
244        punpcklbw   xmm4,       xmm0
245
246        movdqa      XMMWORD Ptr [rdi],         xmm4
247
        ; Second half: output pixels 8..15, computed from xmm2 (pixels 6..18).
        ; The lane diagrams below were written for the first half; here every
        ; pixel index they show is relative to pixel 8 (so "-2" means pixel 6),
        ; and the top three bytes of the window are zero rather than pixel data.
248        movdqa      xmm3,       xmm2
249        movdqa      xmm4,       xmm2
250
251        movdqa      xmm5,       xmm2
252        movdqa      xmm6,       xmm2
253
254        movdqa      xmm7,       xmm2
255
256        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
257        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
258
259        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
260        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
261
262        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
263        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
264
265
266        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
267        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
268
269        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
270
271        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
272        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
273
274        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
275
276        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
277        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
278
279        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
280
281        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
282        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
283
284
        ; Same accumulation order as the first half.
285        paddsw      xmm4,       xmm7
286        paddsw      xmm4,       xmm5
287
288        paddsw      xmm4,       xmm3
289        paddsw      xmm4,       xmm6
290
291        paddsw      xmm4,       xmm2
292        paddsw      xmm4,       [GLOBAL(rd)]
293
294        psraw       xmm4,       7
295
296        packuswb    xmm4,       xmm0
297        punpcklbw   xmm4,       xmm0
298
299        movdqa      XMMWORD Ptr [rdi+16],      xmm4
300
301        lea         rsi,        [rsi + rax]
302%if ABI_IS_32BIT
303        add         rdi,        DWORD Ptr arg(5) ;[output_width]
304%else
305        add         rdi,        r8
306%endif
307
308        dec         rcx
309        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
310
311    ; begin epilog
312    pop rdi
313    pop rsi
314    RESTORE_GOT
315    RESTORE_XMM
316    UNSHADOW_ARGS
317    pop         rbp
318    ret
319
320
321;void vp8_filter_block1d8_v6_sse2
322;(
323;    short *src_ptr,
324;    unsigned char *output_ptr,
325;    int dst_pitch,
326;    unsigned int pixels_per_line,
327;    unsigned int pixel_step,
328;    unsigned int output_height,
329;    unsigned int output_width,
330;    short * vp8_filter
331;)
332;/************************************************************************************
333; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
334; input pixel array has output_height rows.
335;*************************************************************************************/
; Register roles inside the row loop:
;   rsi  = top row of the current 6-row window; it starts 2 * pixels_per_line
;          bytes before src_ptr
;   rdx  = pixels_per_line, used as the BYTE distance between source rows
;   rdi  = current output row           rax = filter table (six 16-byte vectors)
;   rcx  = rows remaining               r8  = dst_pitch (64-bit builds only)
;   xmm0 = zero (upper half for packuswb), xmm7 = rounding vector loaded once
; Source rows are read with movdqa, so every row address must be 16-byte
; aligned. pixel_step (arg 4) and output_width (arg 6) are never read.
; output_height must be non-zero because the counter is decremented before it
; is tested.
336global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
337sym(vp8_filter_block1d8_v6_sse2):
338    push        rbp
339    mov         rbp, rsp
340    SHADOW_ARGS_TO_STACK 8
341    SAVE_XMM 7
342    GET_GOT     rbx
343    push        rsi
344    push        rdi
345    ; end prolog
346
347        mov         rax,        arg(7) ;vp8_filter
348        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
349
350        mov         rdi,        arg(1) ;output_ptr
351        mov         rsi,        arg(0) ;src_ptr
352
        ; Step back two row strides so the window starts two rows above src_ptr.
353        sub         rsi,        rdx
354        sub         rsi,        rdx
355
356        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
357        pxor        xmm0,       xmm0                        ; clear xmm0
358
        ; rd is defined outside this view; presumably the per-lane rounding term.
359        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
360%if ABI_IS_32BIT=0
361        movsxd      r8,         dword ptr arg(2) ; dst_pitch
362%endif
363
364.vp8_filter_block1d8_v6_sse2_loop:
        ; Window rows 0,1,2,4 are addressed from rsi directly (scales 0,1,2,4).
365        movdqa      xmm1,       XMMWORD PTR [rsi]
366        pmullw      xmm1,       [rax]
367
368        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]
369        pmullw      xmm2,       [rax + 16]
370
371        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]
372        pmullw      xmm3,       [rax + 32]
373
374        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]
375        pmullw      xmm5,       [rax + 64]
376
        ; Advancing rsi by one row makes window rows 3 and 5 reachable with the
        ; *2 and *4 scales, and also serves as the per-iteration row advance.
377        add         rsi,        rdx
378        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]
379
380        pmullw      xmm4,       [rax + 48]
381        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]
382
383        pmullw      xmm6,       [rax + 80]
384
        ; Accumulate with signed saturation in the order of window rows
        ; 1,4,2,0,3,5, then the rounding vector. Saturating adds are not
        ; associative, so this order is part of the result.
385        paddsw      xmm2,       xmm5
386        paddsw      xmm2,       xmm3
387
388        paddsw      xmm2,       xmm1
389        paddsw      xmm2,       xmm4
390
391        paddsw      xmm2,       xmm6
392        paddsw      xmm2,       xmm7
393
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
394        psraw       xmm2,       7
395        packuswb    xmm2,       xmm0              ; pack and saturate
396
397        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
398%if ABI_IS_32BIT
399        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
400%else
401        add         rdi,        r8
402%endif
403        dec         rcx         ; decrement count
404        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
405
406    ; begin epilog
407    pop rdi
408    pop rsi
409    RESTORE_GOT
410    RESTORE_XMM
411    UNSHADOW_ARGS
412    pop         rbp
413    ret
414
415
416;void vp8_filter_block1d16_v6_sse2
417;(
418;    unsigned short *src_ptr,
419;    unsigned char *output_ptr,
420;    int dst_pitch,
421;    unsigned int pixels_per_line,
422;    unsigned int pixel_step,
423;    unsigned int output_height,
424;    unsigned int output_width,
425;    const short    *vp8_filter
426;)
427;/************************************************************************************
428; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
429; input pixel array has output_height rows.
430;*************************************************************************************/
; Register roles inside the row loop:
;   rsi  = top row of the current 6-row window; it starts 2 * pixels_per_line
;          bytes before src_ptr
;   rdx  = pixels_per_line, used as the BYTE distance between source rows
;   rdi  = current output row           rax = filter table (six 16-byte vectors)
;   rcx  = rows remaining               r8  = dst_pitch (64-bit builds only)
;   xmm1 / xmm2 = accumulators for output pixels 0..7 / 8..15
;   xmm0, xmm3..xmm7 = scratch; all are reloaded every iteration
; Source rows are read and the output row is written with movdqa, so both the
; source row addresses and rdi must be 16-byte aligned. pixel_step (arg 4) and
; output_width (arg 6) are never read. output_height must be non-zero because
; the counter is decremented before it is tested.
;
; The loop used to clear xmm0 after loading the rounding vector. Nothing read
; xmm0 after that point (packuswb takes xmm2 as its second operand) and xmm0 is
; reloaded with source data at the top of the next iteration, so the clear was
; dead work in the hot loop and has been removed.
431global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
432sym(vp8_filter_block1d16_v6_sse2):
433    push        rbp
434    mov         rbp, rsp
435    SHADOW_ARGS_TO_STACK 8
436    SAVE_XMM 7
437    GET_GOT     rbx
438    push        rsi
439    push        rdi
440    ; end prolog
441
442        mov         rax,        arg(7) ;vp8_filter
443        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
444
445        mov         rdi,        arg(1) ;output_ptr
446        mov         rsi,        arg(0) ;src_ptr
447
        ; Step back two row strides so the window starts two rows above src_ptr.
448        sub         rsi,        rdx
449        sub         rsi,        rdx
450
451        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
452%if ABI_IS_32BIT=0
453        movsxd      r8,         dword ptr arg(2) ; dst_pitch
454%endif
455
456.vp8_filter_block1d16_v6_sse2_loop:
457; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; Saturating adds are not associative, so this order is part of the result.
458        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
459        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
460        pmullw      xmm1,       [rax + 16]
461        pmullw      xmm2,       [rax + 16]
462
463        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
464        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
465        pmullw      xmm3,       [rax + 64]
466        pmullw      xmm4,       [rax + 64]
467
468        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
469        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
470        pmullw      xmm5,       [rax + 32]
471        pmullw      xmm6,       [rax + 32]
472
473        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
474        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
475        pmullw      xmm7,       [rax]
476        pmullw      xmm0,       [rax]
477
478        paddsw      xmm1,       xmm3
479        paddsw      xmm2,       xmm4
480        paddsw      xmm1,       xmm5
481        paddsw      xmm2,       xmm6
482        paddsw      xmm1,       xmm7
483        paddsw      xmm2,       xmm0
484
        ; Advancing rsi by one row makes lines 4 and 6 reachable with the *2 and
        ; *4 scales, and also serves as the per-iteration row advance.
485        add         rsi,        rdx
486
487        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
488        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
489        pmullw      xmm3,       [rax + 48]
490        pmullw      xmm4,       [rax + 48]
491
492        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
493        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
494        pmullw      xmm5,       [rax + 80]
495        pmullw      xmm6,       [rax + 80]
496
        ; rd is defined outside this view; presumably the per-lane rounding term.
        ; It is reloaded every row because xmm7 doubles as a data register above.
497        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]
499
500        paddsw      xmm1,       xmm3
501        paddsw      xmm2,       xmm4
502        paddsw      xmm1,       xmm5
503        paddsw      xmm2,       xmm6
504
505        paddsw      xmm1,       xmm7
506        paddsw      xmm2,       xmm7
507
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
508        psraw       xmm1,       7
509        psraw       xmm2,       7
510
511        packuswb    xmm1,       xmm2              ; pack and saturate
512        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
513%if ABI_IS_32BIT
514        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
515%else
516        add         rdi,        r8
517%endif
518        dec         rcx         ; decrement count
519        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
520
521    ; begin epilog
522    pop rdi
523    pop rsi
524    RESTORE_GOT
525    RESTORE_XMM
526    UNSHADOW_ARGS
527    pop         rbp
528    ret
529
530
531;void vp8_filter_block1d8_h6_only_sse2
532;(
533;    unsigned char  *src_ptr,
534;    unsigned int    src_pixels_per_line,
535;    unsigned char  *output_ptr,
536;    int dst_pitch,
537;    unsigned int    output_height,
538;    const short    *vp8_filter
539;)
540; First-pass filter only when yoffset==0
; Horizontal 6-tap filter that writes 8 clamped BYTES per row straight to the
; destination (no 16-bit intermediate buffer).
; Register roles inside the row loop:
;   rsi  = current source row           rdi = current output row
;   rdx  = filter table (six 16-byte vectors at offsets 0,16,...,80; presumably
;          tap k replicated across all 8 words -- confirm against the caller)
;   rax  = source stride in bytes       rcx = rows remaining
;   r8   = dst_pitch (64-bit builds only; 32-bit builds re-read arg(3) every row)
;   xmm0 = zero, used to widen bytes to words and as the upper half for packuswb
; output_height must be non-zero because the counter is decremented before it
; is tested.
541global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
542sym(vp8_filter_block1d8_h6_only_sse2):
543    push        rbp
544    mov         rbp, rsp
545    SHADOW_ARGS_TO_STACK 6
546    SAVE_XMM 7
547    GET_GOT     rbx
548    push        rsi
549    push        rdi
550    ; end prolog
551
552        mov         rdx,        arg(5) ;vp8_filter
553        mov         rsi,        arg(0) ;src_ptr
554
555        mov         rdi,        arg(2) ;output_ptr
556
557        movsxd      rcx,        dword ptr arg(4) ;output_height
558        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
559%if ABI_IS_32BIT=0
560        movsxd      r8,         dword ptr arg(3) ;dst_pitch
561%endif
562        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
563
564.filter_block1d8_h6_only_rowloop:
        ; Load source pixels -2..5 and 6..13. Only pixels -2..10 feed the taps
        ; below; pixels 11..13 are read but never contribute to the output.
565        movq        xmm3,       MMWORD PTR [rsi - 2]
566        movq        xmm1,       MMWORD PTR [rsi + 6]
567
        ; Prefetch the start of the next source row.
568        prefetcht2  [rsi+rax-2]
569
        ; xmm1 = pixels -2..13 in bytes 0..15.
570        pslldq      xmm1,       8
571        por         xmm1,       xmm3
572
573        movdqa      xmm4,       xmm1
574        movdqa      xmm5,       xmm1
575
576        movdqa      xmm6,       xmm1
577        movdqa      xmm7,       xmm1
578
        ; For each tap: shift the 16-pixel window right by the tap offset, widen the
        ; low 8 bytes to words, and multiply by that tap's coefficient vector.
579        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
580        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
581
582        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
583        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
584
585        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
586        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
587
588
589        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
590        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
591
592        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
593
594        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
595        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
596
597        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
598
599        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
600        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
601
602
603        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
604
605        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
606        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
607
608
        ; Accumulate the six products with signed saturation. Saturating adds are
        ; not associative, so the order (taps 2,5,3,1,4,6) is part of the result.
609        paddsw      xmm4,       xmm7
610        paddsw      xmm4,       xmm5
611
612        paddsw      xmm4,       xmm3
613        paddsw      xmm4,       xmm6
614
615        paddsw      xmm4,       xmm1
        ; rd is defined outside this view; presumably the per-lane rounding term.
616        paddsw      xmm4,       [GLOBAL(rd)]
617
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
618        psraw       xmm4,       7
619
        ; Clamp to 0..255 via unsigned byte saturation; the low 8 bytes are the row.
620        packuswb    xmm4,       xmm0
621
622        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
623        lea         rsi,        [rsi + rax]
624
625%if ABI_IS_32BIT
626        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
627%else
628        add         rdi,        r8
629%endif
630        dec         rcx
631
632        jnz         .filter_block1d8_h6_only_rowloop               ; next row
633
634    ; begin epilog
635    pop rdi
636    pop rsi
637    RESTORE_GOT
638    RESTORE_XMM
639    UNSHADOW_ARGS
640    pop         rbp
641    ret
642
643
644;void vp8_filter_block1d16_h6_only_sse2
645;(
646;    unsigned char  *src_ptr,
647;    unsigned int    src_pixels_per_line,
648;    unsigned char  *output_ptr,
649;    int dst_pitch,
650;    unsigned int    output_height,
651;    const short    *vp8_filter
652;)
653; First-pass filter only when yoffset==0
; Horizontal 6-tap filter that writes 16 clamped BYTES per row straight to the
; destination, computed as two 8-pixel halves.
; Register roles inside the row loop:
;   rsi  = current source row           rdi = current output row
;   rdx  = filter table (six 16-byte vectors at offsets 0,16,...,80; presumably
;          tap k replicated across all 8 words -- confirm against the caller)
;   rax  = source stride in bytes       rcx = rows remaining
;   r8   = dst_pitch (64-bit builds only; 32-bit builds re-read arg(3) every row)
;   xmm0 = zero, xmm1 = source pixels -2..13, xmm2 = source pixels 6..18
; output_height must be non-zero because the counter is decremented before it
; is tested.
;
; Source footprint per row is pixels -2..18. The third load used to start at
; pixel 14 and therefore touched pixels 19..21, which no tap uses; it now starts
; at pixel 11, matching vp8_filter_block1d16_h6_sse2, so nothing past pixel 18
; is read. The filtered output is unchanged.
654global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
655sym(vp8_filter_block1d16_h6_only_sse2):
656    push        rbp
657    mov         rbp, rsp
658    SHADOW_ARGS_TO_STACK 6
659    SAVE_XMM 7
660    GET_GOT     rbx
661    push        rsi
662    push        rdi
663    ; end prolog
664
665        mov         rdx,        arg(5) ;vp8_filter
666        mov         rsi,        arg(0) ;src_ptr
667
668        mov         rdi,        arg(2) ;output_ptr
669
670        movsxd      rcx,        dword ptr arg(4) ;output_height
671        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
672%if ABI_IS_32BIT=0
673        movsxd      r8,         dword ptr arg(3) ;dst_pitch
674%endif
675
676        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
677
678.filter_block1d16_h6_only_sse2_rowloop:
        ; Load source pixels -2..5 and 6..13.
679        movq        xmm3,       MMWORD PTR [rsi - 2]
680        movq        xmm1,       MMWORD PTR [rsi + 6]
681
        ; Load pixels 11..18 (pixel 18 is the last one tap 6 needs) and move them
        ; to bytes 5..12. Bytes 5..7 overlap xmm1's pixels 11..13 with identical
        ; values, so OR-ing without masking is safe.
682        movq        xmm2,       MMWORD PTR [rsi +11]
683        pslldq      xmm2,       5
684
        ; xmm2 = pixels 6..18 in bytes 0..12 (bytes 13..15 are zero and unused).
685        por         xmm2,       xmm1
        ; Prefetch the start of the next source row.
686        prefetcht2  [rsi+rax-2]
687
        ; xmm1 = pixels -2..13 in bytes 0..15.
688        pslldq      xmm1,       8
689        por         xmm1,       xmm3
690
691        movdqa      xmm4,       xmm1
692        movdqa      xmm5,       xmm1
693
694        movdqa      xmm6,       xmm1
695        movdqa      xmm7,       xmm1
696
        ; First half: output pixels 0..7.
697        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
698        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
699
700        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
701        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
702
703        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
704        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
705
706        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
707        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
708
709        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
710
711        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
712        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
713
714        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
715
716        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
717        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
718
719        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
720
721        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
722        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
723
        ; Accumulate with signed saturation. Saturating adds are not associative,
        ; so the order (taps 2,5,3,1,4,6) is part of the result.
724        paddsw      xmm4,       xmm7
725        paddsw      xmm4,       xmm5
726
727        paddsw      xmm4,       xmm3
728        paddsw      xmm4,       xmm6
729
730        paddsw      xmm4,       xmm1
        ; rd is defined outside this view; presumably the per-lane rounding term.
731        paddsw      xmm4,       [GLOBAL(rd)]
732
        ; Shift by 7, the same value as VP8_FILTER_SHIFT.
733        psraw       xmm4,       7
734
735        packuswb    xmm4,       xmm0                        ; lower 8 bytes
736
737        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
738
        ; Second half: output pixels 8..15, computed from xmm2 (pixels 6..18).
        ; The lane diagrams below were written for the first half; here every
        ; pixel index they show is relative to pixel 8 (so "-2" means pixel 6),
        ; and the top three bytes of the window are zero rather than pixel data.
739        movdqa      xmm3,       xmm2
740        movdqa      xmm4,       xmm2
741
742        movdqa      xmm5,       xmm2
743        movdqa      xmm6,       xmm2
744
745        movdqa      xmm7,       xmm2
746
747        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
748        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
749
750        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
751        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
752
753        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
754        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
755
756        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
757        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
758
759        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
760
761        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
762        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
763
764        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
765
766        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
        ; After this shift the low 8 bytes are pixels 11..18 (bytes 5..12 of xmm2).
767        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
768
769        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
770
771        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
772        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
773
        ; Same accumulation order as the first half.
774        paddsw      xmm4,       xmm7
775        paddsw      xmm4,       xmm5
776
777        paddsw      xmm4,       xmm3
778        paddsw      xmm4,       xmm6
779
780        paddsw      xmm4,       xmm2
781        paddsw      xmm4,       [GLOBAL(rd)]
782
783        psraw       xmm4,       7
784
785        packuswb    xmm4,       xmm0                        ; higher 8 bytes
786
787        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
788
789        lea         rsi,        [rsi + rax]
790%if ABI_IS_32BIT
791        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
792%else
793        add         rdi,        r8
794%endif
795
796        dec         rcx
797        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
798
799    ; begin epilog
800    pop rdi
801    pop rsi
802    RESTORE_GOT
803    RESTORE_XMM
804    UNSHADOW_ARGS
805    pop         rbp
806    ret
807
808
809;void vp8_filter_block1d8_v6_only_sse2
810;(
811;    unsigned char *src_ptr,
812;    unsigned int    src_pixels_per_line,
813;    unsigned char *output_ptr,
814;    int dst_ptich,
815;    unsigned int output_height,
816;    const short    *vp8_filter
817;)
818; Second-pass filter only when xoffset==0
;
; Vertical 6-tap filter, 8 pixels wide, one output row per loop iteration.
; For every output row the loop reads 8 bytes from each of six consecutive
; source rows (current row pointer + k * src_pixels_per_line, k = 0..5),
; widens them to 16-bit words, multiplies row k by the 8 words stored at
; vp8_filter + 16 * k, sums the six products with signed saturating adds,
; adds the rounding constant rd, shifts right arithmetically by 7, packs to
; bytes with unsigned saturation and stores 8 bytes at output_ptr.  The source
; pointer then moves down one row and output_ptr advances by dst_ptich bytes.
;
; Register use inside the loop:
;   rsi  = current source row       rdx  = src_pixels_per_line (sign-extended)
;   rdi  = current output row       r8   = dst_ptich (64-bit builds only)
;   rax  = vp8_filter               rcx  = output rows still to produce
;   xmm0 = zero (for byte->word)    xmm7 = rounding constant rd
;
; The loop is bottom-tested (dec/jnz), so output_height == 0 is not guarded.
; NOTE(review): pmullw with a memory operand needs a 16-byte aligned address,
; so vp8_filter is presumably a 16-byte aligned table of 6 x 8 words - confirm
; against the caller.
819global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
820sym(vp8_filter_block1d8_v6_only_sse2):
821    push        rbp
822    mov         rbp, rsp
    ; ABI helper macros (presumably from vpx_ports/x86_abi_support.asm); each
    ; one is undone by its counterpart in the epilog below.
823    SHADOW_ARGS_TO_STACK 6
824    SAVE_XMM 7
825    GET_GOT     rbx
826    push        rsi
827    push        rdi
828    ; end prolog
829
830        mov         rsi,        arg(0) ;src_ptr
831        mov         rdi,        arg(2) ;output_ptr
832
833        movsxd      rcx,        dword ptr arg(4) ;output_height
834        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
835
836        mov         rax,        arg(5) ;vp8_filter
837
838        pxor        xmm0,       xmm0                        ; clear xmm0
839
840        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding constant, loaded once outside the loop
841%if ABI_IS_32BIT=0
842        movsxd      r8,         dword ptr arg(3) ; dst_ptich
843%endif
844
845.vp8_filter_block1d8_v6_only_sse2_loop:
846        movq        xmm1,       MMWORD PTR [rsi]                ; row 0 (8 bytes)
847        movq        xmm2,       MMWORD PTR [rsi + rdx]          ; row 1
848        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]      ; row 2
849        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]      ; row 4
850        add         rsi,        rdx                             ; step down one row; this is also the per-iteration source advance
851        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]      ; row 3 (relative to rsi at loop entry)
852        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]      ; row 5 (relative to rsi at loop entry)
853
854        punpcklbw   xmm1,       xmm0                ; zero-extend bytes to words
855        pmullw      xmm1,       [rax]               ; row 0 * coefficients at filter + 0
856
857        punpcklbw   xmm2,       xmm0
858        pmullw      xmm2,       [rax + 16]          ; row 1 * coefficients at filter + 16
859
860        punpcklbw   xmm3,       xmm0
861        pmullw      xmm3,       [rax + 32]          ; row 2 * coefficients at filter + 32
862
863        punpcklbw   xmm5,       xmm0
864        pmullw      xmm5,       [rax + 64]          ; row 4 * coefficients at filter + 64
865
866        punpcklbw   xmm4,       xmm0
867        pmullw      xmm4,       [rax + 48]          ; row 3 * coefficients at filter + 48
868
869        punpcklbw   xmm6,       xmm0
870        pmullw      xmm6,       [rax + 80]          ; row 5 * coefficients at filter + 80
871
        ; Accumulate all six products into xmm2 with signed saturating adds.
872        paddsw      xmm2,       xmm5
873        paddsw      xmm2,       xmm3
874
875        paddsw      xmm2,       xmm1
876        paddsw      xmm2,       xmm4
877
878        paddsw      xmm2,       xmm6
879        paddsw      xmm2,       xmm7              ; add rounding constant before the shift
880
881        psraw       xmm2,       7                 ; arithmetic shift right by 7
882        packuswb    xmm2,       xmm0              ; pack words to bytes with unsigned saturation
883
884        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
885%if ABI_IS_32BIT
886        add         rdi,        DWORD PTR arg(3) ;[dst_ptich]
887%else
888        add         rdi,        r8
889%endif
890        dec         rcx         ; decrement count
891        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
892
893    ; begin epilog
894    pop rdi
895    pop rsi
896    RESTORE_GOT
897    RESTORE_XMM
898    UNSHADOW_ARGS
899    pop         rbp
900    ret
901
902
903;void vp8_unpack_block1d16_h6_sse2
904;(
905;    unsigned char  *src_ptr,
906;    unsigned short *output_ptr,
907;    unsigned int    src_pixels_per_line,
908;    unsigned int    output_height,
909;    unsigned int    output_width
910;)
;
; Widens a 16-pixel-wide block from bytes to 16-bit words, one row per loop
; iteration.  No filter coefficients are applied here: each iteration loads
; 16 source bytes, zero-extends them and writes 32 bytes (16 words) to
; output_ptr.  After each row the source pointer advances by
; src_pixels_per_line bytes and the output pointer by output_width bytes
; (output_width is added to the byte address as-is).
;
; Register use inside the loop:
;   rsi  = current source row       rax = src_pixels_per_line (sign-extended)
;   rdi  = current output row       r8  = output_width (64-bit builds only)
;   rcx  = rows still to produce    xmm0 = zero (for byte->word)
;
; The stores use movdqa, so output_ptr and output_width must keep rdi 16-byte
; aligned.  The loop is bottom-tested (dec/jnz), so output_height == 0 is not
; guarded.
911global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
912sym(vp8_unpack_block1d16_h6_sse2):
913    push        rbp
914    mov         rbp, rsp
    ; ABI helper macros (presumably from vpx_ports/x86_abi_support.asm); each
    ; one is undone by its counterpart in the epilog below.
915    SHADOW_ARGS_TO_STACK 5
916    GET_GOT     rbx
917    push        rsi
918    push        rdi
919    ; end prolog
920
921        mov         rsi,        arg(0) ;src_ptr
922        mov         rdi,        arg(1) ;output_ptr
923
924        movsxd      rcx,        dword ptr arg(3) ;output_height
925        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
926
927        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
928%if ABI_IS_32BIT=0
929        movsxd      r8,         dword ptr arg(4) ;output_width            ; destination step in bytes per row
930%endif
931
932.unpack_block1d16_h6_sse2_rowloop:
933        movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7 of this row
934        movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15 of this row
935
936        punpcklbw   xmm3,       xmm0                        ; bytes 8..15 zero-extended to eight 16-bit words
937        punpcklbw   xmm1,       xmm0                        ; bytes 0..7 zero-extended to eight 16-bit words
938
939        movdqa      XMMWORD Ptr [rdi],         xmm1         ; words 0..7  (aligned store)
940        movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; words 8..15 (aligned store)
941
942        lea         rsi,        [rsi + rax]                 ; next source row
943%if ABI_IS_32BIT
944        add         rdi,        DWORD Ptr arg(4) ;[output_width]
945%else
946        add         rdi,        r8
947%endif
948        dec         rcx
949        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
950
951    ; begin epilog
952    pop rdi
953    pop rsi
954    RESTORE_GOT
955    UNSHADOW_ARGS
956    pop         rbp
957    ret
958
959
960SECTION_RODATA
; rd: rounding constant for the filter routines in this file.  0x40 (64) is
; half of 1 << 7; the filter loops add it to the tap sum right before
; "psraw ..., 7".  The value is replicated in all eight 16-bit lanes and the
; label is 16-byte aligned so it can be used directly as an XMM memory operand.
961align 16
962rd:
963    times 8 dw 0x40
964