; vim:filetype=nasm ts=8

; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "nasm.h"

        data_section

cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

        code_section

; **********************************************************************
;
; FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov
;
; 32-bit x86 implementation, using MMX and CMOVcc instructions, of the
; following C routine:
;
; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
; {
;       FLAC__int32 last_error_0 = data[-1];
;       FLAC__int32 last_error_1 = data[-1] - data[-2];
;       FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
;       FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
;       FLAC__int32 error, save;
;       FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
;       unsigned i, order;
;
;       for(i = 0; i < data_len; i++) {
;               error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
;               error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
;               error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
;               error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
;               error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
;       }
;
;       if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
;               order = 0;
;       else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
;               order = 1;
;       else if(total_error_2 < min(total_error_3, total_error_4))
;               order = 2;
;       else if(total_error_3 < total_error_4)
;               order = 3;
;       else
;               order = 4;
;
;       /* for each k in 0..4: */
;       residual_bits_per_sample[k] = (FLAC__float)((data_len > 0 && total_error_k > 0) ? log(M_LN2 * (FLAC__double)total_error_k / (FLAC__double)data_len) / M_LN2 : 0.0);
;
;       return order;
; }
;
; In:    three stack arguments (see the ARG_* offsets below)
; Out:   eax = order
; Saved: ebp, ebx, esi, edi are pushed on entry and popped before ret
; Uses:  eax, ecx, edx, mm0-mm7, the x87 register stack, 16 bytes of stack
;        scratch space; emms is executed before any x87 instruction
; Reads: data[-4] through data[data_len - 1] when data_len != 0
; NOTE:  data_len is converted with a signed 32-bit fild, so it is assumed
;        to be below 2^31
;
; **********************************************************************

; Stack layout once the prologue has run:
;   4 (return address) + 4*4 (pushed registers) + SCRATCH_BYTES = 36 bytes
;   sit between esp and the first argument.
%define SCRATCH_BYTES   16      ; qword [esp] is the staging slot for x87 loads
%define ARG_DATA        36      ; [esp + 36] == data[]
%define ARG_DATA_LEN    40      ; [esp + 40] == data_len
%define ARG_RBPS        44      ; [esp + 44] == residual_bits_per_sample[]

; STORE_RBPS total_reg, byte_offset [, load_base]
;
; Emits the code that fills one residual_bits_per_sample[] slot.
;   total_reg   32-bit register holding the unsigned total_error_k
;   byte_offset offset of the slot from the array base held in ebx
;   load_base   when non-zero, ebx is loaded with the array pointer only
;               after total_reg has been consumed (needed for slot 0,
;               whose total arrives in ebx itself); defaults to 0
; Expects eax == 0 and ST0 == data_len on entry; ST0 == data_len on exit.
; Writes the dword at [esp] and [esp + 4]; changes the flags.
%macro STORE_RBPS 2-3 0
        test    %1, %1
        jz      %%zero_total
        fld1                                    ; ST = 1.0 data_len
        mov     [esp], %1
        mov     [esp + 4], eax                  ; qword [esp] = (FLAC__uint64)total
%if %3
        mov     ebx, [esp + ARG_RBPS]
%endif
        fild    qword [esp]                     ; ST = total 1.0 data_len
        fdiv    st2                             ; ST = total/data_len 1.0 data_len
        fldln2                                  ; ST = ln2 total/data_len 1.0 data_len
        fmulp   st1                             ; ST = ln2*total/data_len 1.0 data_len
        fyl2x                                   ; ST = 1.0*log2(ln2*total/data_len) data_len
        fstp    dword [ebx + %2]                ; store the slot; ST = data_len
        jmp     short %%stored
%%zero_total:
%if %3
        mov     ebx, [esp + ARG_RBPS]
%endif
        mov     [ebx + %2], eax                 ; eax == 0, the bit pattern of 0.0f
%%stored:
%endmacro

        ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

        push    ebp
        push    ebx
        push    esi
        push    edi
        sub     esp, byte SCRATCH_BYTES

        ; Register roles inside the sample loop (written high:low dword):
        ;   ebx  address of the sample about to be read
        ;   ecx  samples still to process
        ;   mm0  total_error_1:total_error_0
        ;   mm1  total_error_2:total_error_3
        ;   mm2  (unused):total_error_4
        ;   mm3  last_error_1:last_error_0
        ;   mm4  last_error_2:last_error_3
        ;   mm5, mm6, mm7  temporaries
        ; ebp receives the chosen order after the loop.

        mov     ecx, [esp + ARG_DATA_LEN]
        test    ecx, ecx
        jz      near .no_samples                ; nothing to scan

        ; Build the four last_error values from data[-1] .. data[-4].
        mov     ebx, [esp + ARG_DATA]
        movd    mm3, [ebx - 4]                  ; mm3 = 0:data[-1]  (last_error_0)
        movd    mm2, [ebx - 8]                  ; mm2 = 0:data[-2]
        movd    mm1, [ebx - 12]                 ; mm1 = 0:data[-3]
        movd    mm0, [ebx - 16]                 ; mm0 = 0:data[-4]
        movq    mm5, mm3
        psubd   mm5, mm2                        ; mm5 = 0:last_error_1
        punpckldq mm3, mm5                      ; mm3 = last_error_1:last_error_0
        psubd   mm2, mm1                        ; mm2 = 0:(data[-2] - data[-3])
        psubd   mm5, mm2                        ; mm5 = 0:last_error_2
        movq    mm4, mm5
        psubd   mm4, mm2                        ; last_error_2 - (data[-2] - data[-3])
        paddd   mm4, mm1                        ; last_error_2 - (data[-2] - 2*data[-3])
        psubd   mm4, mm0                        ; mm4 = 0:last_error_3
        punpckldq mm4, mm5                      ; mm4 = last_error_2:last_error_3
        pxor    mm0, mm0                        ; clear all five running totals
        pxor    mm1, mm1
        pxor    mm2, mm2

        ALIGN 16
.next_sample:
        movd    mm7, [ebx]                      ; mm7 = 0:error_0
        add     ebx, byte 4
        movq    mm6, mm7
        psubd   mm7, mm3                        ; low dword = error_1
        punpckldq mm6, mm7                      ; mm6 = error_1:error_0
        movq    mm5, mm6
        movq    mm7, mm6
        psubd   mm5, mm3                        ; high dword = error_2
        movq    mm3, mm6                        ; last_error_1:last_error_0 for the next pass
        psrad   mm6, 31                         ; per-lane sign mask
        pxor    mm7, mm6
        psubd   mm7, mm6                        ; mm7 = abs(error_1):abs(error_0)
        paddd   mm0, mm7
        movq    mm6, mm5                        ; keep error_2 in the high dword
        psubd   mm5, mm4                        ; high dword = error_3
        punpckhdq mm5, mm6                      ; mm5 = error_2:error_3
        movq    mm7, mm5
        movq    mm6, mm5
        psubd   mm5, mm4                        ; low dword = error_4
        movq    mm4, mm6                        ; last_error_2:last_error_3 for the next pass
        psrad   mm6, 31
        pxor    mm7, mm6
        psubd   mm7, mm6                        ; mm7 = abs(error_2):abs(error_3)
        paddd   mm1, mm7
        movq    mm6, mm5
        psrad   mm5, 31
        pxor    mm6, mm5
        psubd   mm6, mm5                        ; low dword = abs(error_4)
        paddd   mm2, mm6

        dec     ecx
        jnz     short .next_sample

        ; Move the five totals into general-purpose registers.
        movq    mm3, mm0
        movd    edi, mm2                        ; edi = total_error_4
        movd    esi, mm1                        ; esi = total_error_3
        movd    eax, mm0                        ; eax = total_error_0
        punpckhdq mm1, mm1
        punpckhdq mm3, mm3
        movd    edx, mm1                        ; edx = total_error_2
        movd    ecx, mm3                        ; ecx = total_error_1

        ; Order selection from the C reference.  eax carries the running
        ; minimum and ebx the candidate order.  Both cmovs in each group use
        ; the flags of the preceding cmp; a tie (cmovbe) moves to the higher
        ; order, matching the strict '<' tests in the C code.
        xor     ebx, ebx
        xor     ebp, ebp                        ; order = 0
        inc     ebx
        cmp     ecx, eax
        cmovb   eax, ecx
        cmovbe  ebp, ebx                        ; order = 1
        inc     ebx
        cmp     edx, eax
        cmovb   eax, edx
        cmovbe  ebp, ebx                        ; order = 2
        inc     ebx
        cmp     esi, eax
        cmovb   eax, esi
        cmovbe  ebp, ebx                        ; order = 3
        inc     ebx
        cmp     edi, eax
        cmovb   eax, edi
        cmovbe  ebp, ebx                        ; order = 4
        movd    ebx, mm0                        ; eax was overwritten: refetch total_error_0
        emms                                    ; done with MMX; x87 code follows

        ; residual_bits_per_sample[0..4], one slot per total.
        xor     eax, eax                        ; high half of the zero-extended totals, and 0.0f
        fild    dword [esp + ARG_DATA_LEN]      ; ST = data_len (signed load, see NOTE above)
        STORE_RBPS ebx, 0, 1
        STORE_RBPS ecx, 4
        STORE_RBPS edx, 8
        STORE_RBPS esi, 12
        STORE_RBPS edi, 16
        fstp    st0                             ; discard data_len; x87 stack is empty again
        jmp     short .return

.no_samples:
        ; data_len == 0: every residual_bits_per_sample[] slot is 0.0 and,
        ; as in the C reference, the order falls through to 4.
        xor     ebp, ebp
        mov     edi, [esp + ARG_RBPS]
        mov     [edi], ebp
        mov     [edi + 4], ebp
        mov     [edi + 8], ebp
        mov     [edi + 12], ebp
        mov     [edi + 16], ebp
        add     ebp, byte 4                     ; order = 4

.return:
        mov     eax, ebp                        ; return order
        add     esp, byte SCRATCH_BYTES
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; NOTE(review): bare 'end' token kept from the original file; presumably it
; is handled by nasm.h or accepted by NASM as a plain label -- confirm.
end
; ELF builds only: emit an empty, non-allocated .note.GNU-stack section
; (the GNU toolchain convention for marking an object as not needing an
; executable stack).
%ifdef OBJ_FORMAT_elf
        section .note.GNU-stack noalloc
%endif