;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  sse2inc.asm
;*
;*  Abstract
;*      Macros and constants
;*
;*  History
;*      8/5/2009 Created
;*
;*
;*************************************************************************/
;***********************************************************************
; Options, for DEBUG
;***********************************************************************

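; MOVDQ is the 128-bit move used throughout this file: movdqa (aligned) with
; the default "%if 1" below, movdqu (unaligned) otherwise, e.g. when debugging
; suspected alignment problems. WELSEMMS expands to emms (clears the MMX/x87
; state after MMX code) unless the second "%if" is switched off, which defines
; it away.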
%if 1
    %define MOVDQ movdqa
%else
    %define MOVDQ movdqu
%endif

%if 1
    %define WELSEMMS emms
%else
    %define WELSEMMS
%endif


;***********************************************************************
; Macros
;***********************************************************************

%ifdef WIN64 ; Windows x64 ;************************************

DEFAULT REL

BITS 64

%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define arg5 [rsp + push_num*8 + 40]
%define arg6 [rsp + push_num*8 + 48]
%define arg7 [rsp + push_num*8 + 56]
%define arg8 [rsp + push_num*8 + 64]
%define arg9 [rsp + push_num*8 + 72]
%define arg10 [rsp + push_num*8 + 80]
%define arg11 [rsp + push_num*8 + 88]
%define arg12 [rsp + push_num*8 + 96]

%define arg1d ecx
%define arg2d edx
%define arg3d r8d
%define arg4d r9d
%define arg5d arg5
%define arg6d arg6
%define arg7d arg7
%define arg8d arg8
%define arg9d arg9
%define arg10d arg10
%define arg11d arg11
%define arg12d arg12

%define r0 rcx
%define r1 rdx
%define r2 r8
%define r3 r9
%define r4 rax
%define r5 r10
%define r6 r11
%define r7 rsp

%define r0d ecx
%define r1d edx
%define r2d r8d
%define r3d r9d
%define r4d eax
%define r5d r10d
%define r6d r11d

%define r0w  cx
%define r1w  dx
%define r2w  r8w
%define r3w  r9w
%define r4w  ax
%define r6w  r11w

%define r0b  cl
%define r1b  dl
%define r2b  r8l
%define r3b  r9l

%define  PUSHRFLAGS     pushfq
%define  POPRFLAGS      popfq
%define  retrq          rax
%define  retrd          eax

%elifdef UNIX64 ; Unix x64 ;************************************

DEFAULT REL

BITS 64

%ifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define arg5 r8
%define arg6 r9
%define arg7 [rsp + push_num*8 + 8]
%define arg8 [rsp + push_num*8 + 16]
%define arg9 [rsp + push_num*8 + 24]
%define arg10 [rsp + push_num*8 + 32]
%define arg11 [rsp + push_num*8 + 40]
%define arg12 [rsp + push_num*8 + 48]

%define arg1d edi
%define arg2d esi
%define arg3d edx
%define arg4d ecx
%define arg5d r8d
%define arg6d r9d
%define arg7d arg7
%define arg8d arg8
%define arg9d arg9
%define arg10d arg10
%define arg11d arg11
%define arg12d arg12

%define r0 rdi
%define r1 rsi
%define r2 rdx
%define r3 rcx
%define r4 r8
%define r5 r9
%define r6 r10
%define r7 rsp

%define r0d edi
%define r1d esi
%define r2d edx
%define r3d ecx
%define r4d r8d
%define r5d r9d
%define r6d r10d

%define r0w  di
%define r1w  si
%define r2w  dx
%define r3w  cx
%define r4w  r8w
%define r6w  r10w

%define r0b  dil
%define r1b  sil
%define r2b  dl
%define r3b  cl

%define  PUSHRFLAGS     pushfq
%define  POPRFLAGS      popfq
%define  retrq          rax
%define  retrd          eax

%elifdef X86_32 ; X86_32 ;************************************

BITS 32

%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits ; Mark the stack as non-executable
%endif

%define arg1 [esp + push_num*4 + 4]
%define arg2 [esp + push_num*4 + 8]
%define arg3 [esp + push_num*4 + 12]
%define arg4 [esp + push_num*4 + 16]
%define arg5 [esp + push_num*4 + 20]
%define arg6 [esp + push_num*4 + 24]
%define arg7 [esp + push_num*4 + 28]
%define arg8 [esp + push_num*4 + 32]
%define arg9 [esp + push_num*4 + 36]
%define arg10 [esp + push_num*4 + 40]
%define arg11 [esp + push_num*4 + 44]
%define arg12 [esp + push_num*4 + 48]

%define arg1d arg1
%define arg2d arg2
%define arg3d arg3
%define arg4d arg4
%define arg5d arg5
%define arg6d arg6
%define arg7d arg7
%define arg8d arg8
%define arg9d arg9
%define arg10d arg10
%define arg11d arg11
%define arg12d arg12

%define r0 eax
%define r1 ecx
%define r2 edx
%define r3 ebx
%define r4 esi
%define r5 edi
%define r6 ebp
%define r7 esp

%define r0d eax
%define r1d ecx
%define r2d edx
%define r3d ebx
%define r4d esi
%define r5d edi
%define r6d ebp

%define r0w ax
%define r1w cx
%define r2w dx
%define r3w bx
%define r4w si
%define r6w bp

%define r0b al
%define r1b cl
%define r2b dl
%define r3b bl

%define  PUSHRFLAGS     pushfd
%define  POPRFLAGS      popfd
%define  retrq          eax      ; 32-bit mode does not support 64-bit registers
%define  retrd          eax

%endif

%macro LOAD_PARA 2
    mov %1, %2
%endmacro

%macro LOAD_1_PARA 0
    %ifdef X86_32
        mov r0, [esp + push_num*4 + 4]
    %endif
%endmacro

%macro LOAD_2_PARA 0
    %ifdef X86_32
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
    %endif
%endmacro

%macro LOAD_3_PARA 0
    %ifdef X86_32
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
        mov r2, [esp + push_num*4 + 12]
    %endif
%endmacro

%macro LOAD_4_PARA 0
    %ifdef X86_32
        push r3
        %assign  push_num push_num+1
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
        mov r2, [esp + push_num*4 + 12]
        mov r3, [esp + push_num*4 + 16]
    %endif
%endmacro

%macro LOAD_5_PARA 0
    %ifdef X86_32
        push r3
        push r4
        %assign  push_num push_num+2
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
        mov r2, [esp + push_num*4 + 12]
        mov r3, [esp + push_num*4 + 16]
        mov r4, [esp + push_num*4 + 20]
    %elifdef WIN64
        mov r4, [rsp + push_num*8 + 40]
    %endif
%endmacro

%macro LOAD_6_PARA 0
    %ifdef X86_32
        push r3
        push r4
        push r5
        %assign  push_num push_num+3
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
        mov r2, [esp + push_num*4 + 12]
        mov r3, [esp + push_num*4 + 16]
        mov r4, [esp + push_num*4 + 20]
        mov r5, [esp + push_num*4 + 24]
    %elifdef WIN64
        mov r4, [rsp + push_num*8 + 40]
        mov r5, [rsp + push_num*8 + 48]
    %endif
%endmacro

%macro LOAD_7_PARA 0
    %ifdef X86_32
        push r3
        push r4
        push r5
        push r6
        %assign  push_num push_num+4
        mov r0, [esp + push_num*4 + 4]
        mov r1, [esp + push_num*4 + 8]
        mov r2, [esp + push_num*4 + 12]
        mov r3, [esp + push_num*4 + 16]
        mov r4, [esp + push_num*4 + 20]
        mov r5, [esp + push_num*4 + 24]
        mov r6, [esp + push_num*4 + 28]
    %elifdef WIN64
        mov r4, [rsp + push_num*8 + 40]
        mov r5, [rsp + push_num*8 + 48]
        mov r6, [rsp + push_num*8 + 56]
    %elifdef UNIX64
        mov r6, [rsp + push_num*8 + 8]
    %endif
%endmacro



%macro LOAD_4_PARA_POP 0
    %ifdef X86_32
        pop r3
    %endif
%endmacro

%macro LOAD_5_PARA_POP 0
    %ifdef X86_32
        pop r4
        pop r3
    %endif
%endmacro

%macro LOAD_6_PARA_POP 0
    %ifdef X86_32
        pop r5
        pop r4
        pop r3
    %endif
%endmacro

%macro LOAD_7_PARA_POP 0
    %ifdef X86_32
        pop r6
        pop r5
        pop r4
        pop r3
    %endif
%endmacro
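
; Usage note (a sketch of the intended convention, not enforced here): the
; enclosing function should set "%assign push_num 0" at its entry, since the
; argN and LOAD_n_PARA stack offsets above are computed from push_num; on
; X86_32, every LOAD_n_PARA (n >= 4) pushes callee-saved registers, so it must
; be paired with the matching LOAD_n_PARA_POP before ret.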

%macro PUSH_XMM 1
    %ifdef WIN64
        %assign xmm_num_regs %1
        %if xmm_num_regs > 6
            %ifdef push_num
                %assign push_num push_num+2*(%1-6)
            %endif
            sub rsp, 16*(%1 - 6)
            movdqu [rsp], xmm6
        %endif
        %if xmm_num_regs > 7
            movdqu [rsp+16], xmm7
        %endif
        %if xmm_num_regs > 8
            movdqu [rsp+32], xmm8
        %endif
        %if xmm_num_regs > 9
            movdqu [rsp+48], xmm9
        %endif
        %if xmm_num_regs > 10
            movdqu [rsp+64], xmm10
        %endif
        %if xmm_num_regs > 11
            movdqu [rsp+80], xmm11
        %endif
        %if xmm_num_regs > 12
            movdqu [rsp+96], xmm12
        %endif
        %if xmm_num_regs > 13
            movdqu [rsp+112], xmm13
        %endif
        %if xmm_num_regs > 14
            movdqu [rsp+128], xmm14
        %endif
        %if xmm_num_regs > 15
            movdqu [rsp+144], xmm15
        %endif
    %endif
%endmacro

%macro POP_XMM 0
    %ifdef WIN64
        %if xmm_num_regs > 15
            movdqu xmm15, [rsp+144]
        %endif
        %if xmm_num_regs > 14
            movdqu xmm14, [rsp+128]
        %endif
        %if xmm_num_regs > 13
            movdqu xmm13, [rsp+112]
        %endif
        %if xmm_num_regs > 12
            movdqu xmm12, [rsp+96]
        %endif
        %if xmm_num_regs > 11
            movdqu xmm11, [rsp+80]
        %endif
        %if xmm_num_regs > 10
            movdqu xmm10, [rsp+64]
        %endif
        %if xmm_num_regs > 9
            movdqu xmm9, [rsp+48]
        %endif
        %if xmm_num_regs > 8
            movdqu xmm8, [rsp+32]
        %endif
        %if xmm_num_regs > 7
            movdqu xmm7, [rsp+16]
        %endif
        %if xmm_num_regs > 6
            movdqu xmm6, [rsp]
            add rsp, 16*(xmm_num_regs - 6)
        %endif
    %endif
%endmacro
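
; Illustrative only: in a hypothetical WIN64 function that clobbers xmm6/xmm7
; (callee-saved in the Windows x64 ABI), PUSH_XMM n spills xmm6..xmm(n-1) and
; must be matched by POP_XMM on every exit path.
%if 0 ; example, never assembled
    PUSH_XMM 8              ; save xmm6 and xmm7 (no-op outside WIN64)
    ; ... code that clobbers xmm0..xmm7 ...
    POP_XMM                 ; restore xmm6 and xmm7 and release the stack space
%endif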

%macro SIGN_EXTENSION 2
    %ifndef X86_32
        movsxd %1, %2
    %endif
%endmacro

%macro SIGN_EXTENSIONW 2
    %ifndef X86_32
        movsx %1, %2
    %endif
%endmacro

%macro ZERO_EXTENSION 1
    %ifndef X86_32
        mov dword %1, %1
    %endif
%endmacro
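
; Illustrative only: on the 64-bit targets a 32-bit int argument arrives with
; undefined upper bits in its 64-bit register, so it is widened explicitly;
; on X86_32 these macros expand to nothing. The register choices below are just
; an example.
%if 0 ; example, never assembled
    SIGN_EXTENSION r2, r2d  ; treat arg3 as a signed 32-bit int (e.g. a stride)
    ZERO_EXTENSION r3d      ; keep only the low 32 bits of arg4
%endif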

%macro WELS_EXTERN 1
    ALIGN 16, nop
    %ifdef PREFIX
        %ifdef WELS_PRIVATE_EXTERN
            global _%1: WELS_PRIVATE_EXTERN
        %else
            global _%1
        %endif
        %define %1 _%1
    %else
        %ifdef WELS_PRIVATE_EXTERN
            global %1: WELS_PRIVATE_EXTERN
        %else
            global %1
        %endif
    %endif
    %1:
%endmacro
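
; Putting the pieces together: a minimal sketch of how the macros in this file
; are combined. The function name, arguments and 16-byte alignment assumption
; are hypothetical and only for illustration.
%if 0 ; example, never assembled
WELS_EXTERN ExampleAddTwoRows_sse2  ; (int16_t* pDst, int16_t* pSrc, int32_t iStride)
    %assign push_num 0              ; no GPRs pushed yet; keeps the argN offsets valid
    LOAD_3_PARA                     ; X86_32: fetch the three args into r0..r2; no-op on x64
    SIGN_EXTENSION r2, r2d          ; widen the 32-bit stride on the 64-bit targets
    MOVDQ xmm0, [r0]                ; MOVDQ is movdqa, so pointers are assumed 16-byte aligned
    MOVDQ xmm1, [r1]
    paddw xmm0, xmm1
    MOVDQ [r0], xmm0
    MOVDQ xmm2, [r0 + r2]           ; second row, one stride below
    MOVDQ xmm3, [r1 + r2]
    paddw xmm2, xmm3
    MOVDQ [r0 + r2], xmm2
    ret                             ; LOAD_3_PARA pushed nothing, so no _POP is needed
%endif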
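
; %1 = abs(%1) for each 16-bit word, computed as max(%1, 0 - %1);
; %2 is used as scratch and is clobbered.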
%macro WELS_AbsW 2
    pxor        %2, %2
    psubw       %2, %1
    pmaxsw      %1, %2
%endmacro

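; %1 is the punpck size suffix (bw/wd/dq); on exit %2 holds the low-half
; interleave of the original %2 with %3, and %4 (scratch) the high-half
; interleave.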
%macro MMX_XSwap  4
    movq        %4, %2
    punpckh%1   %4, %3
    punpckl%1   %2, %3
%endmacro

; pOut mm1, mm4, mm5, mm3
%macro MMX_Trans4x4W 5
    MMX_XSwap wd, %1, %2, %5
    MMX_XSwap wd, %3, %4, %2
    MMX_XSwap dq, %1, %3, %4
    MMX_XSwap dq, %5, %2, %3
%endmacro

; xmm interleave helper for the TRANSPOSE macros below (same pattern as MMX_XSwap)
%macro SSE2_XSawp 4
    movdqa      %4, %2
    punpckl%1   %2, %3
    punpckh%1   %4, %3
%endmacro

; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, xmm3
%macro SSE2_Trans4x4D 5
    SSE2_XSawp dq,  %1, %2, %5
    SSE2_XSawp dq,  %3, %4, %2
    SSE2_XSawp qdq, %1, %3, %4
    SSE2_XSawp qdq, %5, %2, %3
%endmacro

;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
    SSE2_XSawp wd,  %1, %2, %5
    SSE2_XSawp wd,  %3, %4, %2
    SSE2_XSawp dq,  %1, %3, %4
    SSE2_XSawp dq,  %5, %2, %3
    SSE2_XSawp qdq, %1, %5, %2
    SSE2_XSawp qdq, %4, %3, %5
%endmacro

;in:  m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
    movdqa  %9,     %8
    SSE2_XSawp bw,  %1, %2, %8
    SSE2_XSawp bw,  %3, %4, %2
    SSE2_XSawp bw,  %5, %6, %4
    movdqa  %6, %9
    movdqa  %9, %4
    SSE2_XSawp bw,  %7, %6, %4

    SSE2_XSawp wd,  %1, %3, %6
    SSE2_XSawp wd,  %8, %2, %3
    SSE2_XSawp wd,  %5, %7, %2
    movdqa  %7, %9
    movdqa  %9, %3
    SSE2_XSawp wd,  %7, %4, %3

    SSE2_XSawp dq,  %1, %5, %4
    SSE2_XSawp dq,  %6, %2, %5
    SSE2_XSawp dq,  %8, %7, %2
    movdqa  %7, %9
    movdqa  %9, %5
    SSE2_XSawp dq,  %7, %3, %5

    SSE2_XSawp qdq,  %1, %8, %3
    SSE2_XSawp qdq,  %4, %2, %8
    SSE2_XSawp qdq,  %6, %7, %2
    movdqa  %7, %9
    movdqa  %9, %1
    SSE2_XSawp qdq,  %7, %5, %1
    movdqa  %5, %9
%endmacro

;xmm0, xmm6, xmm7, [eax], [ecx]
;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 holds the result
%macro SSE2_LoadDiff8P 5
    movq         %1, %4
    punpcklbw    %1, %3
    movq         %2, %5
    punpcklbw    %2, %3
    psubw        %1, %2
%endmacro

; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
    movdqa  %3, %2
    paddw   %2, %1
    psubw   %1, %3
%endmacro


%macro butterfly_1to16_sse      3       ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
    mov %3h, %3l
    movd %1, e%3x           ; i.e., %1 = eax (=b0)
    pshuflw %2, %1, 00h     ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd %1, %2, 00h      ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro

; broadcast a word (dw) into all 8 word lanes of an xmm register
%macro SSE2_Copy8Times 2
    movd    %1, %2
    punpcklwd %1, %1
    pshufd  %1,     %1,     0
%endmacro

; broadcast a byte (db) into all 16 byte lanes of an xmm register
%macro SSE2_Copy16Times 2
    movd            %1, %2
    pshuflw         %1, %1, 0
    punpcklqdq      %1, %1
    packuswb        %1,     %1
%endmacro



;***********************************************************************
;preprocessor constants
;***********************************************************************
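; The macros below materialize small constants without a memory load:
; pcmpeqw sets every bit of the register, a logical right shift then leaves the
; wanted low bits in each lane, and an optional left shift or pack rescales or
; narrows the result (e.g. all-ones >> 15 = 1 per word, then << 5 = 32 per word).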
;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
    pcmpeqw %1,%1
    psrlw %1,15
    psllw %1,5
%endmacro

;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
    pcmpeqw %1,%1
    psrlw %1,15
%endmacro

;all 0 for xmm and mm
%macro WELS_Zero 1
    pxor %1, %1
%endmacro

;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
    pcmpeqw %1,%1
    psrld %1,31
%endmacro

;db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    pcmpeqw %1,%1
    psrlw %1,15
    packuswb %1,%1
%endmacro

%macro WELS_DW1_VEX 1
    vpcmpeqw %1, %1, %1
    vpsrlw   %1, %1, 15
%endmacro

%macro WELS_DW32_VEX 1
    vpcmpeqw %1, %1, %1
    vpsrlw   %1, %1, 15
    vpsllw   %1, %1,  5
%endmacro

%macro WELS_DW32767_VEX 1
    vpcmpeqw %1, %1, %1
    vpsrlw   %1, %1,  1
%endmacro


;***********************************************************************
; Utility macros for X86_32 PIC support
;***********************************************************************

; Used internally by other macros.
%macro INIT_X86_32_PIC_ 2
%ifdef X86_32_PICASM
    %xdefine pic_ptr %1
    %xdefine pic_ptr_preserve %2
  %if pic_ptr_preserve
    %assign push_num push_num+1
    push            pic_ptr
  %endif
    call            %%get_pc
%%pic_refpoint:
    jmp             %%pic_init_done
%%get_pc:
    mov             pic_ptr, [esp]
    ret
%%pic_init_done:
    %define pic(data_addr) (pic_ptr+(data_addr)-%%pic_refpoint)
%else
    %define pic(data_addr) (data_addr)
%endif
%endmacro

; Get program counter and define a helper macro "pic(addr)" to convert absolute
; addresses to program counter-relative addresses if X86_32_PICASM is defined.
; Otherwise define "pic(addr)" as an identity function.
; %1=register to store PC/EIP in.
%macro INIT_X86_32_PIC 1
    INIT_X86_32_PIC_ %1, 1
%endmacro

; Same as above, but without preserving the value of the register argument.
%macro INIT_X86_32_PIC_NOPRESERVE 1
    INIT_X86_32_PIC_ %1, 0
%endmacro

; Clean up after INIT_X86_32_PIC.
; Restore the register used to hold PC/EIP if applicable, and undefine defines.
%macro DEINIT_X86_32_PIC 0
%ifdef X86_32_PICASM
  %if pic_ptr_preserve
    pop             pic_ptr
    %assign push_num push_num-1
  %endif
    %undef pic_ptr
    %undef pic_ptr_preserve
%endif
    %undef pic
%endmacro

; Same as above, but without undefining. Useful for functions with
; multiple epilogues.
%macro DEINIT_X86_32_PIC_KEEPDEF 0
%ifdef X86_32_PICASM
  %if pic_ptr_preserve
    pop             pic_ptr
    %assign push_num push_num-1
  %endif
%endif
%endmacro

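; A minimal sketch of the PIC helpers above (label, register and section
; choices are hypothetical): load a 16-byte constant through pic() so the
; address stays valid when assembled as position-independent 32-bit code.
%if 0 ; example, never assembled
SECTION .rodata align=16
example_const_w32: times 8 dw 32

SECTION .text
WELS_EXTERN ExamplePicAccess
    %assign push_num 0
    INIT_X86_32_PIC r6              ; r6 holds the reference point; it is pushed and popped for us
    movdqa xmm0, [pic(example_const_w32)]
    DEINIT_X86_32_PIC
    ret
%endif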