/* Intel SIMD SSE implementation of Viterbi ACS butterflies
   for 256-state (k=9) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   void update_viterbi29_blk_sse(struct v29 *vp,unsigned char syms[],int nbits);
*/

# SSE (64-bit integer SIMD) version
# Requires Pentium III or better
#
# Syntax: GNU as, AT&T operand order (source first, destination last).
# ABI:    32-bit x86, arguments read from the stack relative to %ebp:
#             8(%ebp)  = vp
#            12(%ebp)  = syms   (advanced by 2 bytes per decoded bit)
#            16(%ebp)  = nbits  (used in place as the loop counter)
# Result: %eax = 0 after processing nbits bits, -1 when vp is NULL
#         (the C prototype above declares void, so callers may ignore it).
# Saved:  %ebp, %esi, %edi, %edx, %ebx are pushed on entry and restored.
# Trash:  %eax, %mm0-%mm7, flags.  emms is executed before a normal return.
#
# Register roles inside the per-bit loop:
#   %esi -> old (incoming) path metrics, 256 bytes
#   %edi -> new (outgoing) path metrics, 256 bytes
#   %edx -> decision output, advanced by 32 bytes (256 bits) per decoded bit
#   %ebx =  32-bit decision accumulator (also scratch for the syms pointer)
#   %mm6 =  first symbol replicated into all 8 bytes
#   %mm5 =  second symbol replicated into all 8 bytes
#   %mm7 =  0x1f in all 8 bytes (reloaded every iteration)

# These are offsets into struct v29, defined in viterbi29.h
        .set    DP,512
        .set    OLDMETRICS,516
        .set    NEWMETRICS,520

# Renormalize when the first new path metric exceeds this value
        .set    RENORM_THRESHOLD,50

# ---------------------------------------------------------------------
# butterfly GROUP
# Each invocation of this macro does 8 butterflies in parallel.
# Reads:   %mm5, %mm6, %mm7, %esi, %edi, Branchtab29_sse
# Writes:  16 new path metrics at (16*GROUP)(%edi)
# Updates: %ebx, ORing in 16 decision bits at bit (16*GROUP)&31
# Trashes: %eax, %mm0-%mm4, flags
# ---------------------------------------------------------------------
        .macro  butterfly GROUP
        # compute branch metrics
        movq    Branchtab29_sse+(8*\GROUP),%mm4
        movq    Branchtab29_sse+128+(8*\GROUP),%mm3
        pxor    %mm6,%mm4
        pxor    %mm5,%mm3
        pavgb   %mm3,%mm4               # mm4 contains branch metrics
        psrlw   $3,%mm4
        pand    %mm7,%mm4               # keep 5 bits per byte

        movq    (8*\GROUP)(%esi),%mm0           # Incoming path metric, high bit = 0
        movq    ((8*\GROUP)+128)(%esi),%mm3     # Incoming path metric, high bit = 1
        movq    %mm0,%mm2
        movq    %mm3,%mm1
        paddusb %mm4,%mm0
        paddusb %mm4,%mm3

        # invert branch metrics. This works only because they are 5 bits
        pxor    %mm7,%mm4

        paddusb %mm4,%mm1
        paddusb %mm4,%mm2

        # Find survivors, leave in mm0,2
        pminub  %mm1,%mm0
        pminub  %mm3,%mm2
        # get decisions, leave in mm1,3
        pcmpeqb %mm0,%mm1
        pcmpeqb %mm2,%mm3

        # interleave and store new path metrics from mm0,2
        movq    %mm0,%mm4
        punpckhbw %mm2,%mm0             # interleave second 8 new metrics
        punpcklbw %mm2,%mm4             # interleave first 8 new metrics
        movq    %mm0,(16*\GROUP+8)(%edi)
        movq    %mm4,(16*\GROUP)(%edi)

        # interleave decisions, accumulate into %ebx
        movq    %mm1,%mm4
        punpckhbw %mm3,%mm1
        punpcklbw %mm3,%mm4
        # Due to an error in the Intel instruction set ref (the register
        # fields are swapped), gas assembles pmovmskb incorrectly
        # See http://mail.gnu.org/pipermail/bug-gnu-utils/2000-August/002341.html
        .byte   0x0f,0xd7,0xc1          # pmovmskb %mm1,%eax
        shll    $((16*\GROUP+8)&31),%eax
        orl     %eax,%ebx
        .byte   0x0f,0xd7,0xc4          # pmovmskb %mm4,%eax
        shll    $((16*\GROUP)&31),%eax
        orl     %eax,%ebx
        .endm

# ---------------------------------------------------------------------
# PSUBUSBM REG,MEM
# Saturating unsigned byte subtract of REG from the 8 bytes at MEM,
# result written back to MEM.
# Trashes %mm7
# ---------------------------------------------------------------------
        .macro  PSUBUSBM REG,MEM
        movq    \MEM,%mm7
        psubusb \REG,%mm7
        movq    %mm7,\MEM
        .endm

        .text
        .global update_viterbi29_blk_sse,Branchtab29_sse
        .type   update_viterbi29_blk_sse,@function
        .align  16

update_viterbi29_blk_sse:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %edx
        pushl   %ebx

        movl    8(%ebp),%edx            # edx = vp
        testl   %edx,%edx
        jnz     0f
        # NULL vp: return -1.  The immediate needs the $ prefix; without
        # it the operand is a load from absolute address 0xffffffff.
        movl    $-1,%eax
        jmp     err
0:      movl    OLDMETRICS(%edx),%esi   # esi -> old metrics
        movl    NEWMETRICS(%edx),%edi   # edi -> new metrics
        movl    DP(%edx),%edx           # edx -> decisions

        # ---- top of per-bit loop ----
1:      movl    16(%ebp),%eax           # eax = nbits
        decl    %eax
        jl      2f                      # passed zero, we are done
        movl    %eax,16(%ebp)

        xorl    %eax,%eax
        movl    12(%ebp),%ebx           # ebx = syms
        movb    (%ebx),%al
        movd    %eax,%mm6               # mm6[0] = first symbol
        movb    1(%ebx),%al
        movd    %eax,%mm5               # mm5[0] = second symbol
        addl    $2,%ebx
        movl    %ebx,12(%ebp)           # syms += 2 for the next bit

        punpcklbw %mm6,%mm6             # mm6[1] = mm6[0]
        punpcklbw %mm5,%mm5

        movq    thirtyones,%mm7
        pshufw  $0,%mm6,%mm6            # copy low word to upper 3
        pshufw  $0,%mm5,%mm5
        # mm6 now contains first symbol in each byte, mm5 the second

        # invoke macro 16 times for a total of 128 butterflies;
        # every pair of invocations fills one 32-bit decision word
        xorl    %ebx,%ebx               # clear decisions
        butterfly GROUP=0
        butterfly GROUP=1
        movl    %ebx,(%edx)             # stash decisions 0-31
        xorl    %ebx,%ebx
        butterfly GROUP=2
        butterfly GROUP=3
        movl    %ebx,4(%edx)            # stash decisions 32-63
        xorl    %ebx,%ebx
        butterfly GROUP=4
        butterfly GROUP=5
        movl    %ebx,8(%edx)            # stash decisions 64-95
        xorl    %ebx,%ebx
        butterfly GROUP=6
        butterfly GROUP=7
        movl    %ebx,12(%edx)           # stash decisions 96-127
        xorl    %ebx,%ebx
        butterfly GROUP=8
        butterfly GROUP=9
        movl    %ebx,16(%edx)           # stash decisions 128-159
        xorl    %ebx,%ebx
        butterfly GROUP=10
        butterfly GROUP=11
        movl    %ebx,20(%edx)           # stash decisions 160-191
        xorl    %ebx,%ebx
        butterfly GROUP=12
        butterfly GROUP=13
        movl    %ebx,24(%edx)           # stash decisions 192-223
        xorl    %ebx,%ebx
        butterfly GROUP=14
        butterfly GROUP=15
        movl    %ebx,28(%edx)           # stash decisions 224-255

        addl    $32,%edx                # bump decision pointer

        # see if we have to normalize
        movl    (%edi),%eax             # extract first output metric
        andl    $255,%eax
        cmpl    $RENORM_THRESHOLD,%eax  # is it greater than the threshold?
        movl    $0,%eax                 # mov leaves the flags from cmpl intact
        jle     done                    # No, no need to normalize

        # Normalize by finding smallest metric and subtracting it
        # from all metrics
        movq    (%edi),%mm0
        pminub  8(%edi),%mm0
        pminub  16(%edi),%mm0
        pminub  24(%edi),%mm0
        pminub  32(%edi),%mm0
        pminub  40(%edi),%mm0
        pminub  48(%edi),%mm0
        pminub  56(%edi),%mm0
        pminub  64(%edi),%mm0
        pminub  72(%edi),%mm0
        pminub  80(%edi),%mm0
        pminub  88(%edi),%mm0
        pminub  96(%edi),%mm0
        pminub  104(%edi),%mm0
        pminub  112(%edi),%mm0
        pminub  120(%edi),%mm0
        pminub  128(%edi),%mm0
        pminub  136(%edi),%mm0
        pminub  144(%edi),%mm0
        pminub  152(%edi),%mm0
        pminub  160(%edi),%mm0
        pminub  168(%edi),%mm0
        pminub  176(%edi),%mm0
        pminub  184(%edi),%mm0
        pminub  192(%edi),%mm0
        pminub  200(%edi),%mm0
        pminub  208(%edi),%mm0
        pminub  216(%edi),%mm0
        pminub  224(%edi),%mm0
        pminub  232(%edi),%mm0
        pminub  240(%edi),%mm0
        pminub  248(%edi),%mm0
        # mm0 contains 8 smallest metrics
        # crunch down to single lowest metric
        movq    %mm0,%mm1
        psrlq   $32,%mm0
        pminub  %mm1,%mm0
        movq    %mm0,%mm1
        psrlq   $16,%mm0
        pminub  %mm1,%mm0
        movq    %mm0,%mm1
        psrlq   $8,%mm0
        pminub  %mm1,%mm0
        # NOTE(review): mm1 is not read again before the next butterfly
        # overwrites it, so this reload looks vestigial; kept as-is.
        movq    8(%edi),%mm1            # reload
        punpcklbw %mm0,%mm0             # expand to all 8 bytes
        pshufw  $0,%mm0,%mm0

        # mm0 now contains lowest metric in all 8 bytes
        # subtract it from every output metric
        # (PSUBUSBM trashes %mm7; it is reloaded at the top of the loop)
        PSUBUSBM %mm0,(%edi)
        PSUBUSBM %mm0,8(%edi)
        PSUBUSBM %mm0,16(%edi)
        PSUBUSBM %mm0,24(%edi)
        PSUBUSBM %mm0,32(%edi)
        PSUBUSBM %mm0,40(%edi)
        PSUBUSBM %mm0,48(%edi)
        PSUBUSBM %mm0,56(%edi)
        PSUBUSBM %mm0,64(%edi)
        PSUBUSBM %mm0,72(%edi)
        PSUBUSBM %mm0,80(%edi)
        PSUBUSBM %mm0,88(%edi)
        PSUBUSBM %mm0,96(%edi)
        PSUBUSBM %mm0,104(%edi)
        PSUBUSBM %mm0,112(%edi)
        PSUBUSBM %mm0,120(%edi)
        PSUBUSBM %mm0,128(%edi)
        PSUBUSBM %mm0,136(%edi)
        PSUBUSBM %mm0,144(%edi)
        PSUBUSBM %mm0,152(%edi)
        PSUBUSBM %mm0,160(%edi)
        PSUBUSBM %mm0,168(%edi)
        PSUBUSBM %mm0,176(%edi)
        PSUBUSBM %mm0,184(%edi)
        PSUBUSBM %mm0,192(%edi)
        PSUBUSBM %mm0,200(%edi)
        PSUBUSBM %mm0,208(%edi)
        PSUBUSBM %mm0,216(%edi)
        PSUBUSBM %mm0,224(%edi)
        PSUBUSBM %mm0,232(%edi)
        PSUBUSBM %mm0,240(%edi)
        PSUBUSBM %mm0,248(%edi)

done:
        # swap metrics
        movl    %esi,%eax
        movl    %edi,%esi
        movl    %eax,%edi
        jmp     1b

        # ---- normal exit: all bits processed ----
2:      emms
        movl    8(%ebp),%ebx            # ebx = vp
        # stash metric pointers
        movl    %esi,OLDMETRICS(%ebx)
        movl    %edi,NEWMETRICS(%ebx)
        movl    %edx,DP(%ebx)           # stash incremented value of vp->dp
        xorl    %eax,%eax               # return 0
        # common epilogue; the NULL-vp path arrives here with eax = -1
err:    popl    %ebx
        popl    %edx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

        .data
        .align  8
thirtyones:
        .byte   31,31,31,31,31,31,31,31