; new count bit routine
;	part of this code originates from
;	new GOGO-no-coda (1999, 2000)
;	Copyright (C) 1999 shigeo
;	modified by Keiichi SAKAI
;
; Syntax: NASM, 32-bit x86 with MMX.  Arguments are read from the stack
; (see choose_table_MMX below).  The macros globaldef / externdef /
; segment_data / segment_code / PIC_OFFSETTABLE / PIC_BASE / PIC_EBP_REL /
; pmov and the get_pc.bp helper are not defined here; presumably they come
; from nasm.h -- confirm there.

%include "nasm.h"

	globaldef choose_table_MMX
; NOTE(review): MMX_masking is exported but no definition is visible in
; this file -- confirm it is defined elsewhere or drop the export.
	globaldef MMX_masking

	externdef largetbl
	externdef t1l
	externdef table23
	externdef table56

	segment_data
	align 16
; Packed-word constants used by the ESC (max > 15) path.
D14_14_14_14 dd 0x000E000E, 0x000E000E	; four words of 14: threshold for pcmpgtw
D15_15_15_15 dd 0xfff0fff0, 0xfff0fff0	; added with unsigned saturation to clamp at 15
; pmaddwd multipliers: low word scales the first value of a pair, high word
; (always 1) scales the second, giving index = x*N + y.
mul_add dd 0x00010010, 0x00010010	; x*16 + y
mul_add23 dd 0x00010003, 0x00010003	; x*3 + y
mul_add56 dd 0x00010004, 0x00010004	; x*4 + y

; tableDEF / tableABC: 8-byte entries (two dwords) indexed by x*16+y in
; from3.  The first dword carries two 16-bit sums (high half / low half),
; the second dword a third sum; all three are accumulated with one paddd.
tableDEF
	dd 0x00010003,0x01,0x00050005,0x05,0x00070006,0x07,0x00090008,0x08,0x000a0008, 0x09
	dd 0x000a0009,0x0a,0x000b000a,0x0a,0x000b000a,0x0b,0x000c000a,0x0a,0x000c000b, 0x0b
	dd 0x000c000b,0x0c,0x000d000c,0x0c,0x000d000c,0x0d,0x000d000c,0x0d,0x000e000d, 0x0e
	dd 0x000b000e,0x0e,0x00040005,0x04,0x00060005,0x06,0x00080007,0x08,0x00090008, 0x09
	dd 0x000a0009,0x0a,0x000b0009,0x0a,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0b
	dd 0x000c000b,0x0b,0x000c000b,0x0c,0x000d000c,0x0c,0x000e000c,0x0d,0x000d000c, 0x0e
	dd 0x000e000d,0x0e,0x000b000d,0x0e,0x00070006,0x07,0x00080007,0x08,0x00090007, 0x09
	dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd 0x000d000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000c,0x0d,0x000d000c, 0x0d
	dd 0x000e000d,0x0e,0x000e000d,0x0f,0x000c000d,0x0f,0x00090007,0x08,0x00090008, 0x09
	dd 0x000a0008,0x0a,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd 0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000c,0x0d,0x000e000c, 0x0d
	dd 0x000e000c,0x0d,0x000f000d,0x0e,0x000f000d,0x0f,0x000d000d,0x0f,0x000a0008, 0x09
	dd 0x000a0008,0x09,0x000b0009,0x0b,0x000b0009,0x0b,0x000c000a,0x0c,0x000c000a, 0x0c
	dd 0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0c,0x000e000b,0x0d,0x000e000c, 0x0d
	dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d,0x0f,0x000c000d, 0x10
	dd 0x000a0009,0x0a,0x000a0009,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
	dd 0x000d000a,0x0c,0x000d000b,0x0d,0x000e000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
	dd 0x000e000c,0x0e,0x000f000c,0x0d,0x000f000d,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
	dd 0x000d000e,0x10,0x000b000a,0x0a,0x000b0009,0x0b,0x000b000a,0x0c,0x000c000a, 0x0c
	dd 0x000d000a,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000e000b, 0x0d
	dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
	dd 0x0010000e,0x10,0x000d000e,0x10,0x000b000a,0x0b,0x000b000a,0x0b,0x000c000a, 0x0c
	dd 0x000c000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0d,0x000d000b,0x0e,0x000e000c, 0x0e
	dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0f,0x000f000c,0x0f,0x000f000d, 0x0f
	dd 0x0011000d,0x10,0x0011000d,0x12,0x000d000e,0x12,0x000b000a,0x0a,0x000c000a, 0x0a
	dd 0x000c000a,0x0b,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
	dd 0x000e000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000c,0x0e,0x000f000d, 0x0f
	dd 0x0010000d,0x0f,0x0010000e,0x10,0x0010000e,0x11,0x000d000e,0x11,0x000c000a, 0x0b
	dd 0x000c000a,0x0b,0x000c000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0d,0x000e000b, 0x0d
	dd 0x000e000c,0x0d,0x000f000c,0x0f,0x000f000c,0x0e,0x000f000d,0x0f,0x000f000d, 0x0f
	dd 0x0010000d,0x10,0x000f000d,0x10,0x0010000e,0x10,0x000f000e,0x12,0x000e000e, 0x11
	dd 0x000c000b,0x0b,0x000d000b,0x0c,0x000c000b,0x0c,0x000d000b,0x0d,0x000e000c, 0x0d
	dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0e,0x0010000d, 0x0f
	dd 0x0010000d,0x10,0x0010000d,0x0f,0x0011000d,0x10,0x0011000e,0x11,0x0010000f, 0x12
	dd 0x000d000e,0x13,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b,0x0c,0x000d000b, 0x0d
	dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0e,0x0010000c,0x0e,0x0010000d, 0x0f
	dd 0x0010000d,0x0f,0x0010000d,0x0f,0x0010000d,0x10,0x0010000e,0x11,0x000f000e, 0x11
	dd 0x0010000e,0x11,0x000e000f,0x12,0x000d000c,0x0c,0x000e000c,0x0d,0x000e000b, 0x0d
	dd 0x000e000c,0x0e,0x000e000c,0x0e,0x000f000c,0x0f,0x000f000d,0x0e,0x000f000d, 0x0f
	dd 0x000f000d,0x10,0x0011000d,0x10,0x0010000d,0x11,0x0010000d,0x11,0x0010000e, 0x11
	dd 0x0010000e,0x12,0x0012000f,0x12,0x000e000f,0x12,0x000f000c,0x0d,0x000e000c, 0x0d
	dd 0x000e000c,0x0e,0x000e000c,0x0f,0x000f000c,0x0f,0x000f000d,0x0f,0x0010000d, 0x10
	dd 0x0010000d,0x10,0x0010000d,0x10,0x0012000e,0x10,0x0011000e,0x10,0x0011000e, 0x11
	dd 0x0011000e,0x12,0x0013000e,0x11,0x0011000f,0x12,0x000e000f,0x12,0x000e000d, 0x0e
	dd 0x000f000d,0x0e,0x000d000d,0x0e,0x000e000d,0x0f,0x0010000d,0x0f,0x0010000d, 0x0f
	dd 0x000f000d,0x11,0x0010000d,0x10,0x0010000e,0x10,0x0011000e,0x13,0x0012000e, 0x11
	dd 0x0011000e,0x11,0x0013000f,0x11,0x0011000f,0x13,0x0010000e,0x12,0x000e000f, 0x12
	dd 0x000b000d,0x0d,0x000b000d,0x0e,0x000b000d,0x0f,0x000c000d,0x10,0x000c000d, 0x10
	dd 0x000d000d,0x10,0x000d000d,0x11,0x000d000e,0x10,0x000e000e,0x11,0x000e000e, 0x11
	dd 0x000e000e,0x12,0x000e000e,0x12,0x000e000f,0x15,0x000e000f,0x14,0x000e000f, 0x15
	dd 0x000c000f,0x12

; Same entry layout as tableDEF.  L_case_67 uses it from the start,
; L_case_45 from entry 9 (tableABC+9*8).
tableABC
	dd 0x00020004,0x1,0x00040004,0x4,0x00060006,0x7,0x00080008,0x9,0x00090009,0xa,0x000a000a,0xa
	dd 0x0009000a,0xa,0x000a000a,0xb,0x00000000,0x0,0x00020003,0x1,0x00040004,0x4,0x00070006,0x7
	dd 0x00090007,0x9,0x00090009,0x9,0x000a000a,0xa,0x00000000,0x0,0x00040004,0x4,0x00050005,0x6
	dd 0x00060006,0x8,0x00080007,0x9,0x000a0009,0xa,0x000a0009,0xb,0x0009000a,0xa,0x000a000a,0xa
	dd 0x00000000,0x0,0x00040004,0x4,0x00040005,0x6,0x00060006,0x8,0x000a0007,0x9,0x000a0008,0x9
	dd 0x000a000a,0xa,0x00000000,0x0,0x00060006,0x7,0x00070006,0x8,0x00080007,0x9,0x00090008,0xa
	dd 0x000a0009,0xb,0x000b000a,0xc,0x000a0009,0xb,0x000a000a,0xb,0x00000000,0x0,0x00070005,0x7
	dd 0x00060006,0x7,0x00080007,0x9,0x000a0008,0xa,0x000a0009,0xa,0x000b000a,0xb,0x00000000,0x0
	dd 0x00080007,0x8,0x00080007,0x9,0x00090008,0xa,0x000b0008,0xb,0x000a0009,0xc,0x000c000a,0xc
	dd 0x000a000a,0xb,0x000b000a,0xc,0x00000000,0x0,0x00090007,0x8,0x000a0007,0x9,0x000a0008,0xa
	dd 0x000b0009,0xb,0x000b0009,0xb,0x000c000a,0xb,0x00000000,0x0,0x00090008,0x9,0x000a0008,0xa
	dd 0x000a0009,0xb,0x000b0009,0xc,0x000b000a,0xc,0x000c000a,0xc,0x000b000a,0xc,0x000c000b,0xc
	dd 0x00000000,0x0,0x00090008,0x8,0x00090008,0x9,0x000a0009,0xa,0x000b0009,0xb,0x000c000a,0xb
	dd 0x000c000b,0xc,0x00000000,0x0,0x00090009,0xa,0x000a0009,0xb,0x000b000a,0xc,0x000c000a,0xc
	dd 0x000c000a,0xd,0x000d000b,0xd,0x000c000a,0xc,0x000d000b,0xd,0x00000000,0x0,0x000a0009,0x9
	dd 0x000a0009,0xa,0x000b000a,0xb,0x000b000a,0xc,0x000d000b,0xc,0x000d000b,0xc,0x00000000,0x0
	dd 0x00090009,0x9,0x00090009,0xa,0x00090009,0xb,0x000a000a,0xc,0x000b000a,0xc,0x000c000b,0xc
	dd 0x000c000b,0xd,0x000c000c,0xd,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
	dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x0009000a,0xa,0x0009000a,0xa
	dd 0x000a000a,0xb,0x000b000b,0xc,0x000c000b,0xc,0x000c000b,0xd,0x000c000b,0xd,0x000c000c,0xd
	dd 0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0,0x00000000,0x0
	dd 0x0,0x00000000, 0x0,0x00000000

; 13 entries of 8 bytes, indexed by bsr(max-15).  Each entry is a pmaddwd
; operand: the first dword's word pair weights the ">14" counters for the
; low-16 running sum, the second dword's pair for the high-16 running sum.
linbits32
	dd 0x00040004,0x10001,0x00040004,0x20002,0x00040004,0x30003,0x00040004,0x40004
	dd 0x00050005,0x60006,0x00060006,0x60006,0x00070007,0x80008,0x00080008,0x80008
	dd 0x00090009,0xa000a,0x000b000b,0xa000a,0x000b000b,0xd000d,0x000d000d,0xd000d
	dd 0x000d000d,0xd000d


; 13 words, indexed by bsr(max-15).  The low byte is returned when the
; high-16 sum is not larger than the low-16 sum, the high byte otherwise.
choose_table_H
	dw 0x1810, 0x1811, 0x1812, 0x1813, 0x1914, 0x1a14, 0x1b15, 0x1c15
	dw 0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17

; Dispatch table for max = 0..15: offsets relative to choose_table_MMX so
; the entries stay position independent.
choose_jump_table_L:
	dd table_MMX.L_case_0 - choose_table_MMX
	dd table_MMX.L_case_1 - choose_table_MMX
	dd table_MMX.L_case_2 - choose_table_MMX
	dd table_MMX.L_case_3 - choose_table_MMX
	dd table_MMX.L_case_45 - choose_table_MMX
	dd table_MMX.L_case_45 - choose_table_MMX
	dd table_MMX.L_case_67 - choose_table_MMX
	dd table_MMX.L_case_67 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX
	dd table_MMX.L_case_8_15 - choose_table_MMX

	segment_code
;
; use MMX
;

PIC_OFFSETTABLE

	align 16
;-----------------------------------------------------------------------
; int choose_table(int *ix, int *end, int *s)
;
; In (stack, after the initial push ebp):
;	[esp+8]  = ix   (start of the dword range)
;	[esp+12] = end  (one past the last dword)
;	[esp+16] = s    (bit counter that the chosen table's size is added to)
; Out:	eax = selected table number, or -1 when the maximum value in the
;	range exceeds 8191+15 (in that case *s is overwritten with the max).
;	For max == 0 the function returns 0 and leaves *s untouched.
; Saves/restores ebp, ebx, esi, edi where used; clobbers ecx, edx, the
; MMX registers and flags.  emms is executed on every exit path.
;
; The range is walked with a negative index: ecx/eax = ix - end, counting
; up to zero, with edx = end as the base.  Elements are consumed in pairs
; (8 bytes); an odd number of pairs is handled before the 16-byte loops.
; NOTE(review): an empty range (ix == end) is not handled -- the index
; would start at 0 and the loops would run past end.  Presumably callers
; never pass one; confirm.
;-----------------------------------------------------------------------
choose_table_MMX:
	push ebp
	call get_pc.bp			; presumably loads the PIC base into ebp (nasm.h)
	add ebp, PIC_BASE()

	mov ecx,[esp+8] ;ecx = begin
	mov edx,[esp+12] ;edx = end
	sub ecx,edx ;ecx = begin-end (negative byte offset)
	test ecx,8			; odd number of 8-byte pairs?
	pxor mm0,mm0 ;mm0=[0:0]
	movq mm1,[edx+ecx]		; MMX ops below leave the flags from test/add intact
	jz .lp

	add ecx,8			; first pair already sits in mm1
	jz .exit

	align 4
; Running word-wise unsigned maximum: max(a,b) = a + saturating_sub(b,a).
.lp:
	movq mm4,[edx+ecx]
	movq mm5,[edx+ecx+8]
	add ecx,16
	psubusw mm4,mm0 ; strictly this should be a dword operation,
	psubusw mm5,mm1 ; but no such instruction exists :-p
	paddw mm0,mm4 ; however, the values handled here are at most 8191+15, so words are fine
	paddw mm1,mm5
	jnz .lp				; flags still come from "add ecx,16"
.exit:
	psubusw mm1,mm0 ; this, too, should really be a dword operation
	paddw mm0,mm1			; mm0 = max(mm0, mm1)

	movq mm4,mm0
	punpckhdq mm4,mm4
	psubusw mm4,mm0 ; this, too, should really be a dword operation
	paddw mm0,mm4			; low dword = max(low dword, high dword)
	movd eax,mm0			; eax = max value in the range

	cmp eax,15
	ja .with_ESC
	lea ecx,[PIC_EBP_REL(choose_table_MMX)]
	add ecx,[PIC_EBP_REL(choose_jump_table_L+eax*4)]
	jmp ecx				; dispatch on max = 0..15

; max > 8191+15: store max in *s and return -1.
.with_ESC1:
	emms
	mov ecx, [esp+16] ; *s
	mov [ecx], eax
	or eax,-1
	pop ebp
	ret

; 15 < max <= 8191+15: tables with linbits.
.with_ESC:
	cmp eax, 8191+15
	ja .with_ESC1

	sub eax,15
	push ebx
	push esi
	bsr eax, eax			; eax = index into linbits32 / choose_table_H
%assign _P 4*2				; bytes pushed since "push ebp" (ebx, esi)
	movq mm5, [PIC_EBP_REL(D15_15_15_15)]
	movq mm6, [PIC_EBP_REL(D14_14_14_14)]
	movq mm3, [PIC_EBP_REL(mul_add)]

	mov ecx, [esp+_P+8] ; = ix
; mov edx, [esp+_P+12] ; = end	(edx still holds end from above)
	sub ecx, edx

	xor esi, esi ; sum = 0
	test ecx, 8
	pxor mm7, mm7 ; linbits_sum: number of values greater than 14
	jz .H_dual_lp1

; odd number of pairs: handle the first pair alone
	movq mm0, [edx+ecx]
	add ecx,8			; ZF from here is consumed by "jz .H_dual_exit" below
	packssdw mm0,mm7
	movq mm2, mm0
	paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
	pcmpgtw mm2, mm6 ; greater than 14?
	psubw mm7, mm2 ; if greater than 14: linbits_sum++;
	pmaddwd mm0, mm3 ; {0, 0, y, x}*{1, 16, 1, 16}
	movd ebx, mm0
	; the 0xfff0 bias makes each word (value-16), hence the (16*16+16)*4 offset
	mov esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]

	jz .H_dual_exit

	align 4
.H_dual_lp1:
	movq mm0, [edx+ecx]
	movq mm1, [edx+ecx+8]
	packssdw mm0,mm1
	movq mm2, mm0
	paddusw mm0, mm5 ; mm0 = min(ix, 15)+0xfff0
	pcmpgtw mm2, mm6 ; greater than 14?
	pmaddwd mm0, mm3 ; {y, x, y, x}*{1, 16, 1, 16}
	movd ebx, mm0
	punpckhdq mm0,mm0
	add esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
	movd ebx, mm0
	add esi, [PIC_EBP_REL(largetbl+ebx*4+(16*16+16)*4)]
	add ecx, 16
	psubw mm7, mm2 ; if greater than 14: linbits_sum++;
	jnz .H_dual_lp1			; flags from "add ecx, 16" (psubw leaves them alone)

.H_dual_exit:
	; fold the four word counters into two and duplicate them in both dwords
	pmov mm1,mm7
	punpckhdq mm7,mm7
	paddd mm7,mm1
	punpckldq mm7,mm7

	pmaddwd mm7, [PIC_EBP_REL(linbits32+eax*8)] ; linbits
	mov ax, [PIC_EBP_REL(choose_table_H+eax*2)]	; upper half of eax is zero (bsr result <= 12)

	movd ecx, mm7
	punpckhdq mm7,mm7
	movd edx,mm7
	emms
	shl edx, 16
	add ecx, edx			; pack both linbits costs like the largetbl sums

	add ecx, esi

	pop esi
	pop ebx

	mov edx, ecx
	and ecx, 0xffff ; ecx = sum2
	shr edx, 16 ; edx = sum

	cmp edx, ecx
	jle .chooseE_s1
	mov edx, ecx			; sum2 is smaller: take it ...
	shr eax, 8			; ... together with the high-byte table number
.chooseE_s1:
	mov ecx, [esp+16] ; *s
	and eax, 0xff
	add [ecx], edx
	pop ebp
	ret

; max == 0: return eax (= 0) without touching *s.
table_MMX.L_case_0:
	emms
	pop ebp
	ret

; max == 1: add t1l[2*x + y] to *s for every pair, return 1.
table_MMX.L_case_1:
	emms
	mov eax, [esp+16] ; *s
	mov ecx, [esp+8] ; *ix
	sub ecx, edx			; edx still holds end
	push ebx
.lp:
	mov ebx, [edx+ecx]
	add ebx, ebx
	add ebx, [edx+ecx+4]
	movzx ebx, byte [PIC_EBP_REL(ebx+t1l)]
	add [eax], ebx
	add ecx, 8
	jnz .lp
	pop ebx
	mov eax, 1
	pop ebp
	ret

; max 4..15: three candidate tables.  Each case pushes the lowest candidate
; table number and loads the table base into ecx, then joins from3.
table_MMX.L_case_45:
	push dword 7
	lea ecx, [PIC_EBP_REL(tableABC+9*8)]
	jmp from3

table_MMX.L_case_67:
	push dword 10
	lea ecx, [PIC_EBP_REL(tableABC)]
	jmp from3

table_MMX.L_case_8_15:
	push dword 13
	lea ecx, [PIC_EBP_REL(tableDEF)]
; In: ecx = table base, [esp] = base table number, edx = end.
; Returns base table number + 0, 1 or 2 and adds the smallest sum to *s.
from3:
	mov eax,[esp+12] ;eax = *begin
; mov edx,[esp+16] ;edx = *end	(edx still holds end)

	push ebx
	sub eax, edx

	movq mm5,[PIC_EBP_REL(mul_add)]
	pxor mm2,mm2 ;mm2 = sum

	test eax, 8
	jz .choose3_lp1
; odd length
	movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
	add eax,8			; ZF from here is consumed by "jz .choose3_exit" below
	packssdw mm0,mm2

	pmaddwd mm0,mm5
	movd ebx,mm0

	movq mm2, [ecx+ebx*8]

	jz .choose3_exit

	align 4
.choose3_lp1
	movq mm0,[edx+eax]
	movq mm1,[edx+eax+8]
	add eax,16
	packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
	pmaddwd mm0,mm5
	movd ebx,mm0
	punpckhdq mm0,mm0
	paddd mm2, [ecx+ebx*8]
	movd ebx,mm0
	paddd mm2, [ecx+ebx*8]
	jnz .choose3_lp1		; flags still come from "add eax,16"
.choose3_exit
; xor eax,eax			(not needed: eax is already 0 when the loop ends)
	movd ebx, mm2
	punpckhdq mm2,mm2
	mov ecx, ebx
	and ecx, 0xffff ; ecx = sum2
	shr ebx, 16 ; ebx = sum1
	movd edx, mm2 ; edx = sum

	cmp edx, ebx
	jle .choose3_s1
	mov edx, ebx
	inc eax				; eax = 1: second candidate
.choose3_s1:
	emms
	pop ebx
	cmp edx, ecx
	jle .choose3_s2
	mov edx, ecx
	mov eax, 2			; third candidate
.choose3_s2:
	pop ecx				; base table number pushed by the case stub
	add eax, ecx
	mov ecx, [esp+16] ; *s
	add [ecx], edx
	pop ebp
	ret

; max 2 or 3: two candidate tables.  Each case pushes the lower candidate
; table number, loads the table base into ecx and the index multiplier into
; mm5, then joins from2.
table_MMX.L_case_2:
	push dword 2
	lea ecx,[PIC_EBP_REL(table23)]
	pmov mm5,[PIC_EBP_REL(mul_add23)]
	jmp from2
table_MMX.L_case_3:
	push dword 5
	lea ecx,[PIC_EBP_REL(table56)]
	pmov mm5,[PIC_EBP_REL(mul_add56)]
; In: ecx = table base (dword entries, two 16-bit sums each), mm5 = index
; multiplier, [esp] = base table number, edx = end.
; Returns base table number + 0 or 1 and adds the smaller sum to *s.
from2:
	mov eax,[esp+12] ;eax = *begin
; mov edx,[esp+16] ;edx = *end	(edx still holds end)
	push ebx
	push edi

	sub eax, edx
	xor edi, edi			; edi = packed running sums
	test eax, 8
	jz .choose2_lp1
; odd length
	movq mm0,[edx+eax] ;mm0 = ix[0] | ix[1]
	pxor mm2,mm2 ;mm2 = sum
	packssdw mm0,mm2

	pmaddwd mm0,mm5
	movd ebx,mm0

	mov edi, [ecx+ebx*4]

	add eax,8
	jz .choose2_exit

	align 4
.choose2_lp1
	movq mm0,[edx+eax]
	movq mm1,[edx+eax+8]
	packssdw mm0,mm1 ;mm0 = ix[0]|ix[1]|ix[2]|ix[3]
	pmaddwd mm0,mm5
	movd ebx,mm0
	punpckhdq mm0,mm0
	add edi, [ecx+ebx*4]
	movd ebx, mm0
	add edi, [ecx+ebx*4]
	add eax,16
	jnc .choose2_lp1		; the negative index carries out exactly when it reaches 0
.choose2_exit
	mov ecx, edi
	pop edi
	pop ebx
	pop eax ; table num.
	emms

	mov edx, ecx
	and ecx, 0xffff ; ecx = sum2
	shr edx, 16 ; edx = sum1

	cmp edx, ecx
	jle .choose2_s1
	mov edx, ecx
	inc eax				; second candidate is cheaper
.choose2_s1:
	mov ecx, [esp+16] ; *s
	add [ecx], edx
	pop ebp
	ret

	end