# SIMD SSE2 dot product
# Equivalent to the following C code:
# long dotprod(signed short *a,signed short *b,int cnt)
# {
#	long sum = 0;
#	cnt *= 8;
#	while(cnt--)
#		sum += *a++ * *b++;
#	return sum;
# }
# a and b must be 128-bit aligned
# Copyright 2001, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)
#
# Syntax:  GNU as, AT&T operand order (src, dst), 32-bit x86 with SSE2.
# In:      8(%ebp)  = a    pointer to signed 16-bit samples
#          12(%ebp) = b    pointer to signed 16-bit samples
#          16(%ebp) = cnt  number of 128-bit vectors (8 samples each);
#                          cnt <= 0 falls through both loops and returns 0
# Out:     %eax = sum of products, accumulated in 32-bit lanes (paddd/addl,
#          so the total wraps modulo 2^32)
# Saved:   %ebp, %esi, %edi, %ecx, %ebx are pushed on entry and restored
# Clobb:   %xmm0, %xmm1, flags
# Note:    movdqa and the memory operand of pmaddwd both require 16-byte
#          aligned addresses, hence the alignment rule for a and b above.

	.text
	.global	dotprod_sse2_assist
	.type	dotprod_sse2_assist,@function
dotprod_sse2_assist:
	pushl	%ebp
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx
	movl	8(%ebp),%esi		# esi = a
	movl	12(%ebp),%edi		# edi = b
	movl	16(%ebp),%ecx		# ecx = cnt (vectors still to process)
	pxor	%xmm0,%xmm0		# clear running sum (four 32-bit lanes)

# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop.
# ecx is decremented by 4 up front; a negative result means fewer than
# four vectors remain and the tail loop takes over.
	.p2align 4
.Lunrolled:
	subl	$4,%ecx
	jl	.Lunrolled_done

	movdqa	(%esi),%xmm1		# 8 samples of a
	pmaddwd	(%edi),%xmm1		# 4 x (a[2i]*b[2i] + a[2i+1]*b[2i+1])
	paddd	%xmm1,%xmm0

	movdqa	16(%esi),%xmm1
	pmaddwd	16(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	32(%esi),%xmm1
	pmaddwd	32(%edi),%xmm1
	paddd	%xmm1,%xmm0

	movdqa	48(%esi),%xmm1
	addl	$64,%esi		# a += 4 vectors
	pmaddwd	48(%edi),%xmm1		# still uses the old edi
	addl	$64,%edi		# b += 4 vectors
	paddd	%xmm1,%xmm0

	jmp	.Lunrolled
.Lunrolled_done:

	addl	$4,%ecx			# undo the last subtract: ecx = vectors left

# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
# This could be redone as Duff's Device on the unrolled loop above
.Ltail:
	subl	$1,%ecx
	jl	.Ltail_done

	movdqa	(%esi),%xmm1
	addl	$16,%esi		# a += 1 vector
	pmaddwd	(%edi),%xmm1
	addl	$16,%edi		# b += 1 vector
	paddd	%xmm1,%xmm0
	jmp	.Ltail
.Ltail_done:

# Horizontal add of the four 32-bit lanes s3:s2:s1:s0 held in xmm0
	movdqa	%xmm0,%xmm1		# xmm1 = s3:s2:s1:s0
	psrldq	$8,%xmm0		# xmm0 =  0: 0:s3:s2
	paddd	%xmm1,%xmm0		# low two lanes = s1+s3 : s0+s2
	movd	%xmm0,%eax		# eax = s0+s2
	psrldq	$4,%xmm0		# move s1+s3 down to lane 0
	movd	%xmm0,%ebx		# ebx = s1+s3
	addl	%ebx,%eax		# eax = s0+s1+s2+s3

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	movl	%ebp,%esp
	popl	%ebp
	ret
	.size	dotprod_sse2_assist, .-dotprod_sse2_assist