• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; Note: the tap3 and tap4 products have to be added after all the other taps
; to avoid overflow.
16
;-----------------------------------------------------------------------
; GET_FILTERS_4
;
; Reads the eight 16-bit taps k0..k7 from the filter pointer in arg(5)
; and spills them to the stack in the paired layout APPLY_FILTER_4
; multiplies against: each slot carries one tap in its low four words
; and another tap in its high four words.
;
;   k0k1 = k0 | k1      k2k3 = k2 | k3
;   k5k4 = k5 | k4      k6k7 = k6 | k7
;   krd  = 64 in every word (rounding term for the later >> 7)
;   zero = all zero bits
;
; The filter is fetched with movdqa, so the pointer must be 16-byte
; aligned. The caller must have %defined k0k1, k2k3, k5k4, k6k7, krd and
; zero as 16-byte-aligned memory operands.
;
; Clobbers: rdx, rcx, xmm0-xmm7 (on exit xmm6 = krd, xmm7 = 0).
;-----------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ; rdx -> filter taps
    movdqa      xmm7, [rdx]                 ; xmm7 = k0..k7

    ; Replicate k0..k3 over the low four words of xmm0..xmm3.
    pshuflw     xmm3, xmm7, 11111111b       ; k3
    pshuflw     xmm2, xmm7, 10101010b       ; k2
    pshuflw     xmm1, xmm7, 01010101b       ; k1
    pshuflw     xmm0, xmm7, 0b              ; k0

    ; Shift k4..k7 down into the low quadword and replicate them too.
    psrldq      xmm7, 8
    pshuflw     xmm6, xmm7, 10101010b       ; k6
    pshuflw     xmm5, xmm7, 01010101b       ; k5
    pshuflw     xmm4, xmm7, 0b              ; k4
    pshuflw     xmm7, xmm7, 11111111b       ; k7 (done last: consumes its source)

    ; Join two taps per register and store each pair.
    punpcklqdq  xmm0, xmm1                  ; low = k0, high = k1
    movdqa      k0k1, xmm0
    punpcklqdq  xmm2, xmm3                  ; low = k2, high = k3
    movdqa      k2k3, xmm2
    punpcklqdq  xmm5, xmm4                  ; low = k5, high = k4
    movdqa      k5k4, xmm5
    punpcklqdq  xmm6, xmm7                  ; low = k6, high = k7
    movdqa      k6k7, xmm6

    ; All-zero constant used to widen bytes to words.
    pxor        xmm7, xmm7
    movdqa      zero, xmm7

    ; Rounding constant: 0x0040 replicated into all eight words.
    mov         rcx, 0x0400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6
%endm
49
;-----------------------------------------------------------------------
; APPLY_FILTER_4 avg
;
; Filters one row of four pixels and stores it at [rdi].
;
;   %1  non-zero: average the result with the four bytes already at
;       [rdi] (pavgb) before storing; zero: plain store.
;
; In:   the low dword of xmmN holds the four source bytes that tap N
;       applies to (N = 0..7); k0k1/k2k3/k5k4/k6k7/krd/zero as prepared
;       by GET_FILTERS_4.
; Out:  four bytes written at [rdi].
;
; Two taps share a register (low four words / high four words), so four
; pmullw cover all eight taps. The partial sums use saturating adds and
; are accumulated in a fixed order that adds tap3 and tap4 last; do not
; reorder the paddsw sequence.
;
; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    ; taps 0/1: pair the rows, widen to words, multiply
    punpckldq   xmm0, xmm1
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0k1

    ; taps 6/7
    punpckldq   xmm6, xmm7
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7

    ; taps 2/3
    punpckldq   xmm2, xmm3
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3

    ; taps 5/4
    punpckldq   xmm5, xmm4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4

    ; Accumulate into the low four words of xmm0 (order is significant).
    paddsw      xmm0, xmm6                  ; low: t0 + t6, high: t1 + t7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ; bring the high half down
    paddsw      xmm0, xmm1                  ; t0 + t6 + t1 + t7
    paddsw      xmm0, xmm2                  ; + t2
    psrldq      xmm2, 8                     ; xmm2 low = t3
    paddsw      xmm0, xmm5                  ; + t5
    psrldq      xmm5, 8                     ; xmm5 low = t4
    paddsw      xmm0, xmm2                  ; + t3
    paddsw      xmm0, xmm5                  ; + t4

    ; Round, scale down by 128 and clamp to unsigned bytes.
    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

%if %1
    movd        xmm1, [rdi]                 ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded average with them
%endif
    movd        [rdi], xmm0
%endm
87
;-----------------------------------------------------------------------
; GET_FILTERS
;
; Set-up shared by the 8- and 16-pixel-wide filters:
;   * rsi = src_ptr    (arg 0)
;   * rdi = output_ptr (arg 2)
;   * k0..k7: each stack slot receives one tap from the filter in
;     arg(5), replicated into all eight words
;   * krd  = 64 in every word (rounding term for the later >> 7)
;   * zero = all zero bits
;
; The filter is fetched with movdqa, so the pointer must be 16-byte
; aligned. The caller must have %defined k0..k7, krd and zero as
; 16-byte-aligned memory operands.
;
; Clobbers: rdx, rcx, rsi, rdi, xmm0-xmm7 (on exit xmm6 = krd, xmm7 = 0).
;-----------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ; rsi -> source
    mov         rdi, arg(2)                 ; rdi -> destination
    mov         rdx, arg(5)                 ; rdx -> filter taps

    movdqa      xmm7, [rdx]                 ; xmm7 = k0..k7

    ; k0..k3 sit in the low quadword: replicate within it, then double
    ; the low words up to fill the register.
    pshuflw     xmm0, xmm7, 0b
    punpcklwd   xmm0, xmm0
    movdqa      k0,   xmm0

    pshuflw     xmm1, xmm7, 01010101b
    punpcklwd   xmm1, xmm1
    movdqa      k1,   xmm1

    pshuflw     xmm2, xmm7, 10101010b
    punpcklwd   xmm2, xmm2
    movdqa      k2,   xmm2

    pshuflw     xmm3, xmm7, 11111111b
    punpcklwd   xmm3, xmm3
    movdqa      k3,   xmm3

    ; k4..k7 sit in the high quadword: same idea on the high words.
    pshufhw     xmm4, xmm7, 0b
    punpckhwd   xmm4, xmm4
    movdqa      k4,   xmm4

    pshufhw     xmm5, xmm7, 01010101b
    punpckhwd   xmm5, xmm5
    movdqa      k5,   xmm5

    pshufhw     xmm6, xmm7, 10101010b
    punpckhwd   xmm6, xmm6
    movdqa      k6,   xmm6

    pshufhw     xmm7, xmm7, 11111111b       ; k7 last: consumes its source
    punpckhwd   xmm7, xmm7
    movdqa      k7,   xmm7

    ; All-zero constant used to widen bytes to words.
    pxor        xmm7, xmm7
    movdqa      zero, xmm7

    ; Rounding constant: 0x0040 replicated into all eight words.
    mov         rcx, 0x0400040
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6
%endm
129
;-----------------------------------------------------------------------
; LOAD_VERT_8 offset
;
; Fetches eight bytes from each of eight consecutive source rows into
; xmm0..xmm7 (row N lands in xmmN), reading at column offset %1.
;
; In:   rsi = address of row 0
;       rax = row pitch in bytes
;       rdx = 3 * rax (callers in this file set it with lea)
; Out:  xmm0..xmm7 low quadwords = rows 0..7
;       rsi advanced by one row (rsi += rax)
;
; Rows 0, 1 and 6 are addressed from the incoming rsi; the remaining
; rows are addressed after the advance, which is what lets rows 5 and 7
; be reached with a single scaled-index operand.
;-----------------------------------------------------------------------
%macro LOAD_VERT_8 1
    ; relative to the incoming row pointer
    movq        xmm6, [rsi + rdx * 2 + %1]  ; row 6 = rsi + 6 * pitch
    movq        xmm1, [rsi + rax + %1]      ; row 1
    movq        xmm0, [rsi + %1]            ; row 0

    lea         rsi,  [rsi + rax]           ; step down one row

    ; relative to the advanced row pointer (original row + 1)
    movq        xmm2, [rsi + rax + %1]      ; row 2 = 1 + 1
    movq        xmm3, [rsi + rax * 2 + %1]  ; row 3 = 1 + 2
    movq        xmm4, [rsi + rdx + %1]      ; row 4 = 1 + 3
    movq        xmm5, [rsi + rax * 4 + %1]  ; row 5 = 1 + 4
    movq        xmm7, [rsi + rdx * 2 + %1]  ; row 7 = 1 + 6
%endm
141
;-----------------------------------------------------------------------
; APPLY_FILTER_8 avg, offset
;
; Filters eight pixels and stores them at [rdi + offset].
;
;   %1  non-zero: average the result with the eight bytes already at the
;       destination (pavgb) before storing; zero: plain store.
;   %2  byte offset added to rdi for the destination access.
;
; In:   the low quadword of xmmN holds the eight source bytes that tap N
;       applies to (N = 0..7); k0..k7, krd and zero as prepared by
;       GET_FILTERS.
; Out:  eight bytes written at [rdi + %2].
;
; The products are combined with saturating adds in a fixed order that
; adds tap3 and tap4 last; do not reorder the paddsw sequence.
;
; Clobbers: xmm0-xmm7.
;-----------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    ; Widen each input to words and scale it by its tap.
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0

    punpcklbw   xmm1, zero
    pmullw      xmm1, k1

    punpcklbw   xmm6, zero
    pmullw      xmm6, k6

    punpcklbw   xmm7, zero
    pmullw      xmm7, k7

    punpcklbw   xmm2, zero
    pmullw      xmm2, k2

    punpcklbw   xmm5, zero
    pmullw      xmm5, k5

    punpcklbw   xmm3, zero
    pmullw      xmm3, k3

    punpcklbw   xmm4, zero
    pmullw      xmm4, k4

    ; Accumulate into xmm0 (order is significant).
    paddsw      xmm0, xmm1                  ; t0 + t1
    paddsw      xmm0, xmm6                  ; + t6
    paddsw      xmm0, xmm7                  ; + t7
    paddsw      xmm0, xmm2                  ; + t2
    paddsw      xmm0, xmm5                  ; + t5
    paddsw      xmm0, xmm3                  ; + t3
    paddsw      xmm0, xmm4                  ; + t4

    ; Round, scale down by 128 and clamp to unsigned bytes.
    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi + %2]            ; current destination pixels
    pavgb       xmm0, xmm1                  ; rounded average with them
%endif
    movq        [rdi + %2], xmm0
%endm
178
179SECTION .text
180
;-----------------------------------------------------------------------
; void vpx_filter_block1d4_v8_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short *filter
; )
;
; Vertical 8-tap filter, four pixels wide. Each output row is computed
; from the 4-byte groups at src_ptr + n * src_pitch for n = 0..7 and
; written to the destination row; both pointers then move down one row.
;
; Loop registers:
;   rsi = source row          rdi = destination row
;   rax = src_pitch           rdx = 3 * src_pitch
;   rbx = out_pitch           rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: four tap pairs, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rdi, arg(2)                 ; rdi -> destination
    mov         rsi, arg(0)                 ; rsi -> source

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * src_pitch

.next_row:
    ; rows 0, 1 and 6 relative to the current row pointer
    movd        xmm6, [rsi + rdx * 2]       ; row 6
    movd        xmm1, [rsi + rax]           ; row 1
    movd        xmm0, [rsi]                 ; row 0
    lea         rsi,  [rsi + rax]           ; step down one source row
    ; remaining rows relative to the advanced pointer
    movd        xmm2, [rsi + rax]           ; row 2
    movd        xmm3, [rsi + rax * 2]       ; row 3
    movd        xmm4, [rsi + rdx]           ; row 4
    movd        xmm5, [rsi + rax * 4]       ; row 5
    movd        xmm7, [rsi + rdx * 2]       ; row 7

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]            ; next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
247
;-----------------------------------------------------------------------
; void vpx_filter_block1d8_v8_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short *filter
; )
;
; Vertical 8-tap filter, eight pixels wide. Each output row is computed
; from the 8-byte groups of eight consecutive source rows and written to
; the destination row; both pointers then move down one row.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = src_pitch           rdx = 3 * src_pitch
;   rbx = out_pitch           rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * src_pitch

.next_row:
    LOAD_VERT_8 0                           ; also advances rsi by one row
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]            ; next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
306
;-----------------------------------------------------------------------
; void vpx_filter_block1d16_v8_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pitch,
;     unsigned char *output_ptr,
;     unsigned int   out_pitch,
;     unsigned int   output_height,
;     short *filter
; )
;
; Vertical 8-tap filter, sixteen pixels wide. Every output row is built
; as two 8-pixel halves (column offsets 0 and 8), each from eight
; consecutive source rows.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = src_pitch           rdx = 3 * src_pitch
;   rbx = out_pitch           rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = out_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * src_pitch

.next_row:
    ; left half: columns 0..7
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ; take back the row step made by LOAD_VERT_8

    ; right half: columns 8..15 (this LOAD_VERT_8 keeps its row step)
    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8
    add         rdi, rbx                    ; next destination row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
369
;-----------------------------------------------------------------------
; vpx_filter_block1d4_v8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Vertical 8-tap filter, four pixels wide, averaging variant: each
; filtered row is combined with the four bytes already present in the
; destination row (pavgb) before being stored.
;
; Loop registers:
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = 3 * source pitch
;   rbx = destination pitch   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: four tap pairs, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rdi, arg(2)                 ; rdi -> destination
    mov         rsi, arg(0)                 ; rsi -> source

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * source pitch

.next_row:
    ; rows 0, 1 and 6 relative to the current row pointer
    movd        xmm6, [rsi + rdx * 2]       ; row 6
    movd        xmm1, [rsi + rax]           ; row 1
    movd        xmm0, [rsi]                 ; row 0
    lea         rsi,  [rsi + rax]           ; step down one source row
    ; remaining rows relative to the advanced pointer
    movd        xmm2, [rsi + rax]           ; row 2
    movd        xmm3, [rsi + rax * 2]       ; row 3
    movd        xmm4, [rsi + rdx]           ; row 4
    movd        xmm5, [rsi + rax * 4]       ; row 5
    movd        xmm7, [rsi + rdx * 2]       ; row 7

    APPLY_FILTER_4 1                        ; 1 = average with destination

    lea         rdi, [rdi + rbx]            ; next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
427
;-----------------------------------------------------------------------
; vpx_filter_block1d8_v8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Vertical 8-tap filter, eight pixels wide, averaging variant: each
; filtered row is combined with the eight bytes already present in the
; destination row (pavgb) before being stored.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = 3 * source pitch
;   rbx = destination pitch   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * source pitch

.next_row:
    LOAD_VERT_8 0                           ; also advances rsi by one row
    APPLY_FILTER_8 1, 0                     ; 1 = average with destination

    lea         rdi, [rdi + rbx]            ; next destination row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
476
;-----------------------------------------------------------------------
; vpx_filter_block1d16_v8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Vertical 8-tap filter, sixteen pixels wide, averaging variant. Every
; output row is built as two 8-pixel halves (column offsets 0 and 8);
; each half is combined with the bytes already present in the
; destination (pavgb) before being stored.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = 3 * source pitch
;   rbx = destination pitch   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rbx, DWORD PTR arg(3)       ; rbx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch
    lea         rdx, [rax + rax * 2]        ; rdx = 3 * source pitch

.next_row:
    ; left half: columns 0..7
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ; take back the row step made by LOAD_VERT_8

    ; right half: columns 8..15 (this LOAD_VERT_8 keeps its row step)
    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8
    add         rdi, rbx                    ; next destination row

    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
529
;-----------------------------------------------------------------------
; void vpx_filter_block1d4_h8_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned char  *output_ptr,
;     unsigned int    output_pitch,
;     unsigned int    output_height,
;     short *filter
; )
;
; Horizontal 8-tap filter, four pixels wide. For every row one unaligned
; 16-byte load is taken at src_ptr - 3; tap N is applied to that data
; shifted right by N bytes, so output pixel x uses source bytes
; x - 3 .. x + 4. Four result bytes are written per row.
;
; Loop registers:
;   rsi = source row          rdi = destination row
;   rax = src_pixels_per_line rdx = output_pitch
;   rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: four tap pairs, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rdi, arg(2)                 ; rdi -> destination
    mov         rsi, arg(0)                 ; rsi -> source

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pixels_per_line

.next_row:
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
603
;-----------------------------------------------------------------------
; void vpx_filter_block1d8_h8_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned char  *output_ptr,
;     unsigned int    output_pitch,
;     unsigned int    output_height,
;     short *filter
; )
;
; Horizontal 8-tap filter, eight pixels wide. For every row one
; unaligned 16-byte load is taken at src_ptr - 3; tap N is applied to
; that data shifted right by N bytes, so output pixel x uses source
; bytes x - 3 .. x + 4. Eight result bytes are written per row.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = src_pixels_per_line rdx = output_pitch
;   rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pixels_per_line

.next_row:
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
678
;-----------------------------------------------------------------------
; void vpx_filter_block1d16_h8_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned char  *output_ptr,
;     unsigned int    output_pitch,
;     unsigned int    output_height,
;     short *filter
; )
;
; Horizontal 8-tap filter, sixteen pixels wide. Every row is produced as
; two 8-pixel halves: the first from an unaligned 16-byte load at
; src_ptr - 3 (stored at offset 0), the second from a load at
; src_ptr + 5 (stored at offset 8). Within a half, tap N is applied to
; the loaded data shifted right by N bytes.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = src_pixels_per_line rdx = output_pitch
;   rcx = rows still to produce
;
; output_height must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = output_height
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = output_pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = src_pixels_per_line

.next_row:
    ; ---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 0

    ; ---- right half: output pixels 8..15 ----
    movdqu      xmm0, [rsi + 5]             ; same window, 8 bytes further on

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 0, 8

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
773
;-----------------------------------------------------------------------
; vpx_filter_block1d4_h8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Horizontal 8-tap filter, four pixels wide, averaging variant. For
; every row one unaligned 16-byte load is taken at source - 3; tap N is
; applied to that data shifted right by N bytes. The four filtered bytes
; are combined with the bytes already present in the destination
; (pavgb) before being stored.
;
; Loop registers:
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = destination pitch
;   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: four tap pairs, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rdi, arg(2)                 ; rdi -> destination
    mov         rsi, arg(0)                 ; rsi -> source

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch

.next_row:
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_4 1                        ; 1 = average with destination

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 6                 ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
838
;-----------------------------------------------------------------------
; vpx_filter_block1d8_h8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Horizontal 8-tap filter, eight pixels wide, averaging variant. For
; every row one unaligned 16-byte load is taken at source - 3; tap N is
; applied to that data shifted right by N bytes. The eight filtered
; bytes are combined with the bytes already present in the destination
; (pavgb) before being stored.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = destination pitch
;   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch

.next_row:
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0                     ; 1 = average with destination

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
904
;-----------------------------------------------------------------------
; vpx_filter_block1d16_h8_avg_sse2
;
; Arguments as read by the code:
;   arg(0) source pointer          arg(1) source pitch (32-bit)
;   arg(2) destination pointer     arg(3) destination pitch (32-bit)
;   arg(4) row count (32-bit)      arg(5) pointer to the eight taps
;
; Horizontal 8-tap filter, sixteen pixels wide, averaging variant. Every
; row is produced as two 8-pixel halves: the first from an unaligned
; 16-byte load at source - 3 (stored at offset 0), the second from a
; load at source + 5 (stored at offset 8). Each half is combined with
; the bytes already present in the destination (pavgb) before being
; stored.
;
; Loop registers (rsi/rdi are loaded by GET_FILTERS):
;   rsi = source row          rdi = destination row
;   rax = source pitch        rdx = destination pitch
;   rcx = rows still to produce
;
; The row count must be non-zero: the counter is only tested after a row
; has been produced.
;-----------------------------------------------------------------------
global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ; Aligned scratch area: eight taps, rounding constant, zero.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0 [rsp + 16 * 0]
    %define k1 [rsp + 16 * 1]
    %define k2 [rsp + 16 * 2]
    %define k3 [rsp + 16 * 3]
    %define k4 [rsp + 16 * 4]
    %define k5 [rsp + 16 * 5]
    %define k6 [rsp + 16 * 6]
    %define k7 [rsp + 16 * 7]
    %define krd [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rcx, DWORD PTR arg(4)       ; rcx = row count
    movsxd      rdx, DWORD PTR arg(3)       ; rdx = destination pitch
    movsxd      rax, DWORD PTR arg(1)       ; rax = source pitch

.next_row:
    ; ---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]             ; xmm0 = window for tap 0

    ; xmmN = the same window moved N bytes towards the low end
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 0                     ; 1 = average with destination

    ; ---- right half: output pixels 8..15 ----
    movdqu      xmm0, [rsi + 5]             ; same window, 8 bytes further on

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
    movdqa      xmm2, xmm0
    psrldq      xmm2, 2
    movdqa      xmm3, xmm0
    psrldq      xmm3, 3
    movdqa      xmm4, xmm0
    psrldq      xmm4, 4
    movdqa      xmm5, xmm0
    psrldq      xmm5, 5
    movdqa      xmm6, xmm0
    psrldq      xmm6, 6
    movdqa      xmm7, xmm0
    psrldq      xmm7, 7

    APPLY_FILTER_8 1, 8

    lea         rdi, [rdi + rdx]            ; next destination row
    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx
    jnz         .next_row

    add         rsp, 16 * 10                ; drop the scratch area
    pop         rsp                         ; NOTE(review): presumably reloads the rsp saved by ALIGN_STACK - confirm in the include

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
990