1; vim:filetype=nasm ts=8 2 3; libFLAC - Free Lossless Audio Codec library 4; Copyright (C) 2001-2009 Josh Coalson 5; Copyright (C) 2011-2016 Xiph.Org Foundation 6; 7; Redistribution and use in source and binary forms, with or without 8; modification, are permitted provided that the following conditions 9; are met: 10; 11; - Redistributions of source code must retain the above copyright 12; notice, this list of conditions and the following disclaimer. 13; 14; - Redistributions in binary form must reproduce the above copyright 15; notice, this list of conditions and the following disclaimer in the 16; documentation and/or other materials provided with the distribution. 17; 18; - Neither the name of the Xiph.org Foundation nor the names of its 19; contributors may be used to endorse or promote products derived from 20; this software without specific prior written permission. 21; 22; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR 26; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 27; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 28; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 30; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 34%include "nasm.h" 35 36 data_section 37 38cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 39cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old 40cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old 41cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old 42cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old 43cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 44cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx 45cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 46cglobal FLAC__lpc_restore_signal_asm_ia32 47cglobal FLAC__lpc_restore_signal_asm_ia32_mmx 48cglobal FLAC__lpc_restore_signal_wide_asm_ia32 49 50 code_section 51 52; ********************************************************************** 53; 54; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]) 55; { 56; FLAC__real d; 57; unsigned sample, coeff; 58; const unsigned limit = data_len - lag; 59; 60; FLAC__ASSERT(lag > 0); 61; FLAC__ASSERT(lag <= data_len); 62; 63; for(coeff = 0; coeff < lag; coeff++) 64; autoc[coeff] = 0.0; 65; for(sample = 0; sample <= limit; sample++) { 66; d = data[sample]; 67; for(coeff = 0; coeff < lag; coeff++) 68; autoc[coeff] += d * data[sample+coeff]; 69; } 70; for(; sample < data_len; sample++) { 71; d = data[sample]; 72; for(coeff = 0; coeff < data_len - sample; coeff++) 73; autoc[coeff] += d * data[sample+coeff]; 74; } 75; } 76; 77 ALIGN 16 78cident FLAC__lpc_compute_autocorrelation_asm_ia32 79 ;[esp + 28] == autoc[] 80 ;[esp + 24] == lag 81 ;[esp + 20] == data_len 82 ;[esp + 16] == data[] 83 84 ;ASSERT(lag > 0) 85 ;ASSERT(lag <= 33) 86 ;ASSERT(lag <= data_len) 87 88.begin: 89 push esi 90 push edi 91 push ebx 92 93 ; for(coeff = 0; coeff < lag; coeff++) 94 ; autoc[coeff] = 0.0; 95 mov edi, [esp + 28] ; edi == autoc 96 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write 97 xor eax, eax 98 rep 
stosd 99 100 ; const unsigned limit = data_len - lag; 101 mov eax, [esp + 24] ; eax == lag 102 mov ecx, [esp + 20] 103 sub ecx, eax ; ecx == limit 104 105 mov edi, [esp + 28] ; edi == autoc 106 mov esi, [esp + 16] ; esi == data 107 inc ecx ; we are looping <= limit so we add one to the counter 108 109 ; for(sample = 0; sample <= limit; sample++) { 110 ; d = data[sample]; 111 ; for(coeff = 0; coeff < lag; coeff++) 112 ; autoc[coeff] += d * data[sample+coeff]; 113 ; } 114 fld dword [esi] ; ST = d <- data[sample] 115 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 116 lea edx, [eax + eax*2] 117 neg edx 118 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] 119 call .mov_eip_to_ebx 120.get_eip1: 121 add edx, ebx 122 inc edx ; compensate for the shorter opcode on the last iteration 123 inc edx ; compensate for the shorter opcode on the last iteration 124 inc edx ; compensate for the shorter opcode on the last iteration 125 cmp eax, 33 126 jne .loop1_start 127 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration 128.loop1_start: 129 jmp edx 130 131.mov_eip_to_ebx: 132 mov ebx, [esp] 133 ret 134 135 fld st0 ; ST = d d 136 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! 137 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! 138 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! 
139 fld st0 ; ST = d d 140 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 141 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 142 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 143 fld st0 ; ST = d d 144 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 145 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 146 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 147 fld st0 ; ST = d d 148 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 149 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 150 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 151 fld st0 ; ST = d d 152 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 153 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 154 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 155 fld st0 ; ST = d d 156 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 157 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 158 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 159 fld st0 ; ST = d d 160 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 161 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 162 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 163 fld st0 ; ST = d d 164 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 165 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d 166 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 167 fld st0 ; ST = d d 168 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 169 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 170 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 171 fld st0 ; ST = d d 172 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 173 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 174 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 175 fld st0 ; ST = d d 176 fmul dword [esi + (22*4)] ; ST = 
d*data[sample+22] d 177 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 178 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 179 fld st0 ; ST = d d 180 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 181 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 182 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 183 fld st0 ; ST = d d 184 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 185 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 186 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 187 fld st0 ; ST = d d 188 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 189 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 190 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 191 fld st0 ; ST = d d 192 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 193 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 194 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 195 fld st0 ; ST = d d 196 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 197 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 198 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 199 fld st0 ; ST = d d 200 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 201 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 202 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d 203 fld st0 ; ST = d d 204 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 205 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 206 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 207 fld st0 ; ST = d d 208 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 209 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 210 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 211 fld st0 ; ST = d d 212 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 213 fadd dword [edi + (13*4)] ; ST = 
autoc[13]+d*data[sample+13] d 214 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 215 fld st0 ; ST = d d 216 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 217 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 218 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 219 fld st0 ; ST = d d 220 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 221 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 222 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 223 fld st0 ; ST = d d 224 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 225 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 226 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 227 fld st0 ; ST = d d 228 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 229 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 230 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 231 fld st0 ; ST = d d 232 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 233 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 234 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 235 fld st0 ; ST = d d 236 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 237 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 238 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 239 fld st0 ; ST = d d 240 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d 241 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 242 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 243 fld st0 ; ST = d d 244 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 245 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 246 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 247 fld st0 ; ST = d d 248 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 249 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 250 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST 
= d 251 fld st0 ; ST = d d 252 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 253 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 254 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 255 fld st0 ; ST = d d 256 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 257 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 258 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 259 fld st0 ; ST = d d 260 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 261 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 262 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 263 fld st0 ; ST = d d 264 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 265 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 266 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 267.jumper1_0: 268 269 fstp st0 ; pop d, ST = empty 270 add esi, byte 4 ; sample++ 271 dec ecx 272 jz .loop1_end 273 fld dword [esi] ; ST = d <- data[sample] 274 jmp edx 275.loop1_end: 276 277 ; for(; sample < data_len; sample++) { 278 ; d = data[sample]; 279 ; for(coeff = 0; coeff < data_len - sample; coeff++) 280 ; autoc[coeff] += d * data[sample+coeff]; 281 ; } 282 mov ecx, [esp + 24] ; ecx <- lag 283 dec ecx ; ecx <- lag - 1 284 jz near .end ; skip loop if 0 (i.e. 
lag == 1) 285 286 fld dword [esi] ; ST = d <- data[sample] 287 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through 288 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) 289 lea edx, [eax + eax*2] 290 neg edx 291 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] 292 call .mov_eip_to_ebx 293.get_eip2: 294 add edx, ebx 295 inc edx ; compensate for the shorter opcode on the last iteration 296 inc edx ; compensate for the shorter opcode on the last iteration 297 inc edx ; compensate for the shorter opcode on the last iteration 298 jmp edx 299 300 fld st0 ; ST = d d 301 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d 302 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d 303 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d 304 fld st0 ; ST = d d 305 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d 306 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d 307 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d 308 fld st0 ; ST = d d 309 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d 310 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d 311 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d 312 fld st0 ; ST = d d 313 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d 314 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d 315 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d 316 fld st0 ; ST = d d 317 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d 318 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d 319 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d 320 fld st0 ; ST = d d 321 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d 322 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d 323 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d 324 fld st0 ; ST = d d 325 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d 326 fadd dword [edi + (25*4)] ; ST = 
autoc[25]+d*data[sample+25] d 327 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d 328 fld st0 ; ST = d d 329 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d 330 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d 331 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d 332 fld st0 ; ST = d d 333 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d 334 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d 335 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d 336 fld st0 ; ST = d d 337 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d 338 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d 339 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d 340 fld st0 ; ST = d d 341 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d 342 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d 343 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d 344 fld st0 ; ST = d d 345 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d 346 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d 347 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d 348 fld st0 ; ST = d d 349 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d 350 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d 351 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d 352 fld st0 ; ST = d d 353 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d 354 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d 355 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d 356 fld st0 ; ST = d d 357 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d 358 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d 359 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d 360 fld st0 ; ST = d d 361 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d 362 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d 363 fstp dword [edi + (16*4)] ; 
autoc[16]+=d*data[sample+16] ST = d 364 fld st0 ; ST = d d 365 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d 366 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d 367 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d 368 fld st0 ; ST = d d 369 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d 370 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d 371 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d 372 fld st0 ; ST = d d 373 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d 374 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d 375 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d 376 fld st0 ; ST = d d 377 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d 378 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d 379 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d 380 fld st0 ; ST = d d 381 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d 382 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d 383 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d 384 fld st0 ; ST = d d 385 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d 386 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d 387 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d 388 fld st0 ; ST = d d 389 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d 390 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d 391 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d 392 fld st0 ; ST = d d 393 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d 394 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d 395 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d 396 fld st0 ; ST = d d 397 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d 398 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d 399 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d 400 fld st0 ; ST = d d 401 fmul dword [esi 
+ ( 6*4)] ; ST = d*data[sample+6] d 402 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d 403 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d 404 fld st0 ; ST = d d 405 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d 406 fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d 407 fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d 408 fld st0 ; ST = d d 409 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d 410 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d 411 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d 412 fld st0 ; ST = d d 413 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d 414 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d 415 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d 416 fld st0 ; ST = d d 417 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d 418 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d 419 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d 420 fld st0 ; ST = d d 421 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d 422 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d 423 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d 424 fld st0 ; ST = d d 425 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! 426 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 427 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! 
.jumper2_0:

	fstp	st0				; pop d, ST = empty
	add	esi, byte 4			; sample++
	dec	ecx
	jz	.loop2_end
	add	edx, byte 11			; adjust our inner loop counter by adjusting the jump target
	fld	dword [esi]			; ST = d <- data[sample]
	jmp	edx
.loop2_end:

.end:
	pop	ebx
	pop	edi
	pop	esi
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 4)
	;ASSERT(lag <= data_len)

	; SSE variant for lag <= 4: keeps the last 4 samples in xmm2 as a sliding
	; window and the 4 running autocorrelation sums in xmm5.

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps	xmm5, xmm5

	mov	edx, [esp + 8]			; edx == data_len
	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
.warmup:					; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	add	eax, 4
	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]
	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	movss	xmm2, xmm0
	mulps	xmm0, xmm2			; xmm0 = xmm0 * xmm2
	addps	xmm5, xmm0			; xmm5 += xmm0 * xmm2
	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [esp + 16]			; edx == autoc
	movups	[edx], xmm5

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 8)
	;ASSERT(lag <= data_len)

	; SSE variant for lag <= 8: sliding window in xmm3:xmm2, sums in xmm6:xmm5.

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps	xmm5, xmm5
	xorps	xmm6, xmm6

	mov	edx, [esp + 8]			; edx == data_len
	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	movaps	xmm1, xmm0			; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	mulps	xmm0, xmm2
	mulps	xmm1, xmm3			; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps	xmm5, xmm0
	addps	xmm6, xmm1			; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	; here we reorder the instructions; see the (#) indexes for a logical order
	shufps	xmm2, xmm2, 93h			; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
	add	eax, 4				; (0)
	shufps	xmm3, xmm3, 93h			; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps	xmm0, xmm0, 0			; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
	movss	xmm3, xmm2			; (5)
	movaps	xmm1, xmm0			; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
	movss	xmm2, xmm0			; (6)
	mulps	xmm1, xmm3			; (8)
	mulps	xmm0, xmm2			; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
	addps	xmm6, xmm1			; (10)
	addps	xmm5, xmm0			; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [esp + 16]			; edx == autoc
	movups	[edx], xmm5
	movups	[edx + 16], xmm6

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12_old
	;[esp + 16] == autoc[]
	;[esp + 12] == lag
	;[esp + 8] == data_len
	;[esp + 4] == data[]

	;ASSERT(lag > 0)
	;ASSERT(lag <= 12)
	;ASSERT(lag <= data_len)

	; SSE variant for lag <= 12: sliding window in xmm4:xmm3:xmm2, running
	; sums in xmm7:xmm6:xmm5.

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps	xmm5, xmm5
	xorps	xmm6, xmm6
	xorps	xmm7, xmm7

	mov	edx, [esp + 8]			; edx == data_len
	mov	eax, [esp + 4]			; eax == &data[sample] <- &data[0]

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm2, xmm0			; xmm2 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
.warmup:					; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
	movaps	xmm1, xmm0
	mulps	xmm1, xmm2
	addps	xmm5, xmm1
	movaps	xmm1, xmm0
	mulps	xmm1, xmm3
	addps	xmm6, xmm1
	mulps	xmm0, xmm4
	addps	xmm7, xmm0			; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	add	eax, 4
	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2 left by one float
	shufps	xmm2, xmm2, 93h			; 93h=2-1-0-3 => xmm2 gets rotated left by one float
	shufps	xmm3, xmm3, 93h			; 93h=2-1-0-3 => xmm3 gets rotated left by one float
	shufps	xmm4, xmm4, 93h			; 93h=2-1-0-3 => xmm4 gets rotated left by one float
	movss	xmm4, xmm3
	movss	xmm3, xmm2
	movss	xmm2, xmm0

	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
	movaps	xmm1, xmm0
	mulps	xmm1, xmm2
	addps	xmm5, xmm1
	movaps	xmm1, xmm0
	mulps	xmm1, xmm3
	addps	xmm6, xmm1
	mulps	xmm0, xmm4
	addps	xmm7, xmm0

	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [esp + 16]			; edx == autoc
	movups	[edx], xmm5
	movups	[edx + 16], xmm6
	movups	[edx + 32], xmm7

.end:
	ret

	ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_16_old
	;[ebp + 20] == autoc[]
	;[ebp + 16] == lag
	;[ebp + 12] == data_len
	;[ebp + 8] == data[]
	;[esp] == __m128
	;[esp + 16] == __m128

	; SSE variant for lag <= 16: window in xmm4:xmm3:xmm2:xmm1; the first two
	; groups of sums live in xmm6:xmm5 and the last two are spilled to two
	; aligned __m128 stack slots at [esp] and [esp + 16].

	push	ebp
	mov	ebp, esp
	and	esp, -16			; stack realign for SSE instructions 'movaps' and 'addps'
	sub	esp, 32

	;ASSERT(lag > 0)
	;ASSERT(lag <= 16)
	;ASSERT(lag <= data_len)
	;ASSERT(data_len > 0)

	; for(coeff = 0; coeff < lag; coeff++)
	;   autoc[coeff] = 0.0;
	xorps	xmm5, xmm5
	xorps	xmm6, xmm6
	movaps	[esp], xmm5
	movaps	[esp + 16], xmm6

	mov	edx, [ebp + 12]			; edx == data_len
	mov	eax, [ebp + 8]			; eax == &data[sample] <- &data[0]

	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[0]
	add	eax, 4
	movaps	xmm1, xmm0			; xmm1 = 0,0,0,data[0]
	shufps	xmm0, xmm0, 0			; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
	xorps	xmm2, xmm2			; xmm2 = 0,0,0,0
	xorps	xmm3, xmm3			; xmm3 = 0,0,0,0
	xorps	xmm4, xmm4			; xmm4 = 0,0,0,0
	movaps	xmm7, xmm0
	mulps	xmm7, xmm1
	addps	xmm5, xmm7
	dec	edx
	jz	.loop_end
	ALIGN 16
.loop_start:
	; start by reading the next sample
	movss	xmm0, [eax]			; xmm0 = 0,0,0,data[sample]
	add	eax, 4
	shufps	xmm0, xmm0, 0			; xmm0 = data[sample],data[sample],data[sample],data[sample]

	; shift xmm4:xmm3:xmm2:xmm1 left by one float
	shufps	xmm1, xmm1, 93h
	shufps	xmm2, xmm2, 93h
	shufps	xmm3, xmm3, 93h
	shufps	xmm4, xmm4, 93h
	movss	xmm4, xmm3
	movss	xmm3, xmm2
	movss	xmm2, xmm1
	movss	xmm1, xmm0

	; xmmB:xmmA:xmm6:xmm5 += xmm0:xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2:xmm1
	movaps	xmm7, xmm0
	mulps	xmm7, xmm1
	addps	xmm5, xmm7
	movaps	xmm7, xmm0
	mulps	xmm7, xmm2
	addps	xmm6, xmm7
	movaps	xmm7, xmm0
	mulps	xmm7, xmm3
	mulps	xmm0, xmm4
	addps	xmm7, [esp]
	addps	xmm0, [esp + 16]
	movaps	[esp], xmm7
	movaps	[esp + 16], xmm0

	dec	edx
	jnz	.loop_start
.loop_end:
	; store autoc
	mov	edx, [ebp + 20]			; edx == autoc
	movups	[edx], xmm5
	movups	[edx + 16], xmm6
	movaps	xmm5, [esp]
	movaps	xmm6, [esp + 16]
	movups	[edx + 32], xmm5
	movups	[edx + 48], xmm6
.end:
	mov	esp, ebp
	pop	ebp
	ret

;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
;   sum = 0;
;   for(j = 0; j < order; j++)
;     sum += qlp_coeff[j] * data[i-j-1];
;   residual[i] = data[i] - (sum >> lp_quantization);
; }
;
	ALIGN 16
cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
	;[esp + 40]	residual[]
	;[esp + 36]	lp_quantization
	;[esp + 32]	order
	;[esp + 28]	qlp_coeff[]
	;[esp + 24]	data_len
	;[esp + 20]	data[]

	;ASSERT(order > 0)

	push	ebp
	push	ebx
	push	esi
	push	edi

	mov	esi, [esp + 20]			; esi = data[]
	mov	edi, [esp + 40]			; edi = residual[]
	mov	eax, [esp + 32]			; eax = order
	mov	ebx, [esp + 24]			; ebx = data_len

	test	ebx, ebx
	jz	near .end			; do nothing if data_len == 0
.begin:
	cmp	eax, byte 1
	jg	short .i_1more

	; special case: order == 1
	mov	ecx, [esp + 28]
	mov	edx, [ecx]			; edx = qlp_coeff[0]
	mov	eax, [esi - 4]			; eax = data[-1]
	mov	ecx, [esp + 36]			; cl = lp_quantization
	ALIGN	16
.i_1_loop_i:
	imul	eax, edx
	sar	eax, cl
	neg	eax
	add	eax, [esi]
	mov	[edi], eax
	mov	eax, [esi]
	add	edi, byte 4
	add	esi, byte 4
	dec	ebx
	jnz	.i_1_loop_i

	jmp	.end

.i_1more:
	cmp	eax, byte 32			; for order <= 32 there is a faster routine
	jbe	short .i_32

	; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
	ALIGN 16
.i_32more_loop_i:
	xor	ebp, ebp			; ebp = sum
	mov	ecx, [esp + 32]
	mov	edx, ecx
	shl	edx, 2
	add	edx, [esp + 28]			; edx = &qlp_coeff[order]
	neg	ecx				; ecx = -order, counts up to 0
	ALIGN	16
.i_32more_loop_j:
	sub	edx, byte 4
	mov	eax, [edx]
	imul	eax, [esi + 4 * ecx]
	add	ebp, eax
	inc	ecx
	jnz	short .i_32more_loop_j

	mov	ecx, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl
	neg	ebp
	add	ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi], ebp
	add	esi, byte 4
	add	edi, byte 4

	dec	ebx
	jnz	.i_32more_loop_i

	jmp	.end

.mov_eip_to_eax:
	; helper: returns the caller's EIP in eax, for position-independent
	; jump-target computation
	mov	eax, [esp]
	ret

.i_32:
	; order <= 32: jump into the unrolled multiply-accumulate sequence below
	; so that exactly 'order' of its 32 iterations execute per output sample.
	; NOTE(review): byte-size dependent — each iteration is 9 bytes; see the
	; compensations below.  Do not alter the unrolled body.
	sub	edi, esi			; edi <- residual - data, so [edi + esi] addresses residual[i]
	neg	eax
	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
	call	.mov_eip_to_eax
.get_eip0:
	add	edx, eax
	inc	edx				; compensate for the shorter opcode on the last iteration
	mov	eax, [esp + 28]			; eax = qlp_coeff[]
	xor	ebp, ebp			; ebp = sum
	jmp	edx

	mov	ecx, [eax + 124]
	imul	ecx, [esi - 128]
	add	ebp, ecx
	mov	ecx, [eax + 120]
	imul	ecx, [esi - 124]
	add	ebp, ecx
	mov	ecx, [eax + 116]
	imul	ecx, [esi - 120]
	add	ebp, ecx
	mov	ecx, [eax + 112]
	imul	ecx, [esi - 116]
	add	ebp, ecx
	mov	ecx, [eax + 108]
	imul	ecx, [esi - 112]
	add	ebp, ecx
	mov	ecx, [eax + 104]
	imul	ecx, [esi - 108]
	add	ebp, ecx
	mov	ecx, [eax + 100]
	imul	ecx, [esi - 104]
	add	ebp, ecx
	mov	ecx, [eax + 96]
	imul	ecx, [esi - 100]
	add	ebp, ecx
	mov	ecx, [eax + 92]
	imul	ecx, [esi - 96]
	add	ebp, ecx
	mov	ecx, [eax + 88]
	imul	ecx, [esi - 92]
	add	ebp, ecx
	mov	ecx, [eax + 84]
	imul	ecx, [esi - 88]
	add	ebp, ecx
	mov	ecx, [eax + 80]
	imul	ecx, [esi - 84]
	add	ebp, ecx
	mov	ecx, [eax + 76]
	imul	ecx, [esi - 80]
	add	ebp, ecx
	mov	ecx, [eax + 72]
	imul	ecx, [esi - 76]
	add	ebp, ecx
	mov	ecx, [eax + 68]
	imul	ecx, [esi - 72]
	add	ebp, ecx
	mov	ecx, [eax + 64]
	imul	ecx, [esi - 68]
	add	ebp, ecx
	mov	ecx, [eax + 60]
	imul	ecx, [esi - 64]
	add	ebp, ecx
	mov	ecx, [eax + 56]
	imul	ecx, [esi - 60]
	add	ebp, ecx
	mov	ecx, [eax + 52]
	imul	ecx, [esi - 56]
	add	ebp, ecx
	mov	ecx, [eax + 48]
	imul	ecx, [esi - 52]
	add	ebp, ecx
	mov	ecx, [eax + 44]
	imul	ecx, [esi - 48]
	add	ebp, ecx
	mov	ecx, [eax + 40]
	imul	ecx, [esi - 44]
	add	ebp, ecx
	mov	ecx, [eax + 36]
	imul	ecx, [esi - 40]
	add	ebp, ecx
	mov	ecx, [eax + 32]
	imul	ecx, [esi - 36]
	add	ebp, ecx
	mov	ecx, [eax + 28]
	imul	ecx, [esi - 32]
	add	ebp, ecx
	mov	ecx, [eax + 24]
	imul	ecx, [esi - 28]
	add	ebp, ecx
	mov	ecx, [eax + 20]
	imul	ecx, [esi - 24]
	add	ebp, ecx
	mov	ecx, [eax + 16]
	imul	ecx, [esi - 20]
	add	ebp, ecx
	mov	ecx, [eax + 12]
	imul	ecx, [esi - 16]
	add	ebp, ecx
	mov	ecx, [eax + 8]
	imul	ecx, [esi - 12]
	add	ebp, ecx
	mov	ecx, [eax + 4]
	imul	ecx, [esi - 8]
	add	ebp, ecx
	mov	ecx, [eax]			; there is one byte missing
	imul	ecx, [esi - 4]
	add	ebp, ecx
.jumper_0:

	mov	ecx, [esp + 36]			; cl = lp_quantization
	sar	ebp, cl
	neg	ebp
	add	ebp, [esi]			; ebp = data[i] - (sum >> lp_quantization)
	mov	[edi + esi], ebp		; residual[i] (edi holds residual - data)
	add	esi, byte 4

	dec	ebx
	jz	short .end
	xor	ebp, ebp
	jmp	edx

.end:
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
; the channel and qlp_coeffs must be <= 16. Especially note that this routine
; cannot be used for side-channel coded 16bps channels since the effective bps
; is 17.
941 ALIGN 16 942cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx 943 ;[esp + 40] residual[] 944 ;[esp + 36] lp_quantization 945 ;[esp + 32] order 946 ;[esp + 28] qlp_coeff[] 947 ;[esp + 24] data_len 948 ;[esp + 20] data[] 949 950 ;ASSERT(order > 0) 951 952 push ebp 953 push ebx 954 push esi 955 push edi 956 957 mov esi, [esp + 20] ; esi = data[] 958 mov edi, [esp + 40] ; edi = residual[] 959 mov eax, [esp + 32] ; eax = order 960 mov ebx, [esp + 24] ; ebx = data_len 961 962 test ebx, ebx 963 jz near .end ; do nothing if data_len == 0 964 dec ebx 965 test ebx, ebx 966 jz near .last_one 967 968 mov edx, [esp + 28] ; edx = qlp_coeff[] 969 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization 970 mov ebp, esp 971 972 and esp, 0xfffffff8 973 974 xor ecx, ecx 975.copy_qlp_loop: 976 push word [edx + 4 * ecx] 977 inc ecx 978 cmp ecx, eax 979 jnz short .copy_qlp_loop 980 981 and ecx, 0x3 982 test ecx, ecx 983 je short .za_end 984 sub ecx, byte 4 985.za_loop: 986 push word 0 987 inc eax 988 inc ecx 989 jnz short .za_loop 990.za_end: 991 992 movq mm5, [esp + 2 * eax - 8] 993 movd mm4, [esi - 16] 994 punpckldq mm4, [esi - 12] 995 movd mm0, [esi - 8] 996 punpckldq mm0, [esi - 4] 997 packssdw mm4, mm0 998 999 cmp eax, byte 4 1000 jnbe short .mmx_4more 1001 1002 ALIGN 16 1003.mmx_4_loop_i: 1004 movd mm1, [esi] 1005 movq mm3, mm4 1006 punpckldq mm1, [esi + 4] 1007 psrlq mm4, 16 1008 movq mm0, mm1 1009 psllq mm0, 48 1010 por mm4, mm0 1011 movq mm2, mm4 1012 psrlq mm4, 16 1013 pxor mm0, mm0 1014 punpckhdq mm0, mm1 1015 pmaddwd mm3, mm5 1016 pmaddwd mm2, mm5 1017 psllq mm0, 16 1018 por mm4, mm0 1019 movq mm0, mm3 1020 punpckldq mm3, mm2 1021 punpckhdq mm0, mm2 1022 paddd mm3, mm0 1023 psrad mm3, mm6 1024 psubd mm1, mm3 1025 movd [edi], mm1 1026 punpckhdq mm1, mm1 1027 movd [edi + 4], mm1 1028 1029 add edi, byte 8 1030 add esi, byte 8 1031 1032 sub ebx, 2 1033 jg .mmx_4_loop_i 1034 jmp .mmx_end 1035 1036.mmx_4more: 1037 shl eax, 2 1038 neg eax 1039 add eax, byte 16 1040 
1041 ALIGN 16 1042.mmx_4more_loop_i: 1043 movd mm1, [esi] 1044 punpckldq mm1, [esi + 4] 1045 movq mm3, mm4 1046 psrlq mm4, 16 1047 movq mm0, mm1 1048 psllq mm0, 48 1049 por mm4, mm0 1050 movq mm2, mm4 1051 psrlq mm4, 16 1052 pxor mm0, mm0 1053 punpckhdq mm0, mm1 1054 pmaddwd mm3, mm5 1055 pmaddwd mm2, mm5 1056 psllq mm0, 16 1057 por mm4, mm0 1058 1059 mov ecx, esi 1060 add ecx, eax 1061 mov edx, esp 1062 1063 ALIGN 16 1064.mmx_4more_loop_j: 1065 movd mm0, [ecx - 16] 1066 movd mm7, [ecx - 8] 1067 punpckldq mm0, [ecx - 12] 1068 punpckldq mm7, [ecx - 4] 1069 packssdw mm0, mm7 1070 pmaddwd mm0, [edx] 1071 punpckhdq mm7, mm7 1072 paddd mm3, mm0 1073 movd mm0, [ecx - 12] 1074 punpckldq mm0, [ecx - 8] 1075 punpckldq mm7, [ecx] 1076 packssdw mm0, mm7 1077 pmaddwd mm0, [edx] 1078 paddd mm2, mm0 1079 1080 add edx, byte 8 1081 add ecx, byte 16 1082 cmp ecx, esi 1083 jnz .mmx_4more_loop_j 1084 1085 movq mm0, mm3 1086 punpckldq mm3, mm2 1087 punpckhdq mm0, mm2 1088 paddd mm3, mm0 1089 psrad mm3, mm6 1090 psubd mm1, mm3 1091 movd [edi], mm1 1092 punpckhdq mm1, mm1 1093 movd [edi + 4], mm1 1094 1095 add edi, byte 8 1096 add esi, byte 8 1097 1098 sub ebx, 2 1099 jg near .mmx_4more_loop_i 1100 1101.mmx_end: 1102 emms 1103 mov esp, ebp 1104.last_one: 1105 mov eax, [esp + 32] 1106 inc ebx 1107 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin 1108 1109.end: 1110 pop edi 1111 pop esi 1112 pop ebx 1113 pop ebp 1114 ret 1115 1116; ********************************************************************** 1117; 1118; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) 1119; { 1120; unsigned i, j; 1121; FLAC__int32 sum; 1122; 1123; FLAC__ASSERT(order > 0); 1124; 1125; for(i = 0; i < data_len; i++) { 1126; sum = 0; 1127; for(j = 0; j < order; j++) 1128; sum += qlp_coeff[j] * data[i-j-1]; 1129; data[i] = residual[i] + (sum >> lp_quantization); 1130; } 1131; } 
1132 ALIGN 16 1133cident FLAC__lpc_restore_signal_asm_ia32 1134 ;[esp + 40] data[] 1135 ;[esp + 36] lp_quantization 1136 ;[esp + 32] order 1137 ;[esp + 28] qlp_coeff[] 1138 ;[esp + 24] data_len 1139 ;[esp + 20] residual[] 1140 1141 ;ASSERT(order > 0) 1142 1143 push ebp 1144 push ebx 1145 push esi 1146 push edi 1147 1148 mov esi, [esp + 20] ; esi = residual[] 1149 mov edi, [esp + 40] ; edi = data[] 1150 mov eax, [esp + 32] ; eax = order 1151 mov ebx, [esp + 24] ; ebx = data_len 1152 1153 test ebx, ebx 1154 jz near .end ; do nothing if data_len == 0 1155 1156.begin: 1157 cmp eax, byte 1 1158 jg short .x87_1more 1159 1160 mov ecx, [esp + 28] 1161 mov edx, [ecx] 1162 mov eax, [edi - 4] 1163 mov ecx, [esp + 36] 1164 ALIGN 16 1165.x87_1_loop_i: 1166 imul eax, edx 1167 sar eax, cl 1168 add eax, [esi] 1169 mov [edi], eax 1170 add esi, byte 4 1171 add edi, byte 4 1172 dec ebx 1173 jnz .x87_1_loop_i 1174 1175 jmp .end 1176 1177.x87_1more: 1178 cmp eax, byte 32 ; for order <= 32 there is a faster routine 1179 jbe short .x87_32 1180 1181 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 1182 ALIGN 16 1183.x87_32more_loop_i: 1184 xor ebp, ebp 1185 mov ecx, [esp + 32] 1186 mov edx, ecx 1187 shl edx, 2 1188 add edx, [esp + 28] 1189 neg ecx 1190 ALIGN 16 1191.x87_32more_loop_j: 1192 sub edx, byte 4 1193 mov eax, [edx] 1194 imul eax, [edi + 4 * ecx] 1195 add ebp, eax 1196 inc ecx 1197 jnz short .x87_32more_loop_j 1198 1199 mov ecx, [esp + 36] 1200 sar ebp, cl 1201 add ebp, [esi] 1202 mov [edi], ebp 1203 add edi, byte 4 1204 add esi, byte 4 1205 1206 dec ebx 1207 jnz .x87_32more_loop_i 1208 1209 jmp .end 1210 1211.mov_eip_to_eax: 1212 mov eax, [esp] 1213 ret 1214 1215.x87_32: 1216 sub esi, edi 1217 neg eax 1218 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] 1219 call .mov_eip_to_eax 1220.get_eip0: 1221 add edx, eax 1222 inc edx ; compensate for the shorter opcode on the last iteration 1223 mov eax, [esp + 28] ; eax = qlp_coeff[] 1224 xor ebp, ebp 1225 
jmp edx 1226 1227 mov ecx, [eax + 124] ; ecx = qlp_coeff[31] 1228 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32] 1229 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32] 1230 mov ecx, [eax + 120] ; ecx = qlp_coeff[30] 1231 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31] 1232 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31] 1233 mov ecx, [eax + 116] ; ecx = qlp_coeff[29] 1234 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30] 1235 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30] 1236 mov ecx, [eax + 112] ; ecx = qlp_coeff[28] 1237 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29] 1238 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29] 1239 mov ecx, [eax + 108] ; ecx = qlp_coeff[27] 1240 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28] 1241 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28] 1242 mov ecx, [eax + 104] ; ecx = qlp_coeff[26] 1243 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27] 1244 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27] 1245 mov ecx, [eax + 100] ; ecx = qlp_coeff[25] 1246 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26] 1247 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26] 1248 mov ecx, [eax + 96] ; ecx = qlp_coeff[24] 1249 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25] 1250 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25] 1251 mov ecx, [eax + 92] ; ecx = qlp_coeff[23] 1252 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24] 1253 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24] 1254 mov ecx, [eax + 88] ; ecx = qlp_coeff[22] 1255 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23] 1256 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23] 1257 mov ecx, [eax + 84] ; ecx = qlp_coeff[21] 1258 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22] 1259 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22] 1260 mov ecx, [eax + 80] ; ecx = qlp_coeff[20] 1261 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21] 1262 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21] 1263 mov ecx, [eax + 76] ; 
ecx = qlp_coeff[19] 1264 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20] 1265 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20] 1266 mov ecx, [eax + 72] ; ecx = qlp_coeff[18] 1267 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19] 1268 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19] 1269 mov ecx, [eax + 68] ; ecx = qlp_coeff[17] 1270 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18] 1271 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18] 1272 mov ecx, [eax + 64] ; ecx = qlp_coeff[16] 1273 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17] 1274 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17] 1275 mov ecx, [eax + 60] ; ecx = qlp_coeff[15] 1276 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16] 1277 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16] 1278 mov ecx, [eax + 56] ; ecx = qlp_coeff[14] 1279 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15] 1280 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15] 1281 mov ecx, [eax + 52] ; ecx = qlp_coeff[13] 1282 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14] 1283 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14] 1284 mov ecx, [eax + 48] ; ecx = qlp_coeff[12] 1285 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13] 1286 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13] 1287 mov ecx, [eax + 44] ; ecx = qlp_coeff[11] 1288 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12] 1289 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12] 1290 mov ecx, [eax + 40] ; ecx = qlp_coeff[10] 1291 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11] 1292 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11] 1293 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] 1294 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10] 1295 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10] 1296 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] 1297 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9] 1298 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9] 1299 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] 1300 imul ecx, [edi - 32] ; ecx = 
qlp_coeff[ 7] * data[i- 8] 1301 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8] 1302 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] 1303 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7] 1304 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7] 1305 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] 1306 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6] 1307 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6] 1308 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] 1309 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5] 1310 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5] 1311 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] 1312 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4] 1313 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4] 1314 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] 1315 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3] 1316 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3] 1317 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] 1318 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2] 1319 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2] 1320 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction) 1321 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1] 1322 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1] 1323.jumper_0: 1324 1325 mov ecx, [esp + 36] 1326 sar ebp, cl ; ebp = (sum >> lp_quantization) 1327 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) 1328 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) 1329 add edi, byte 4 1330 1331 dec ebx 1332 jz short .end 1333 xor ebp, ebp 1334 jmp edx 1335 1336.end: 1337 pop edi 1338 pop esi 1339 pop ebx 1340 pop ebp 1341 ret 1342 1343; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for 1344; the channel and qlp_coeffs must be <= 16. Especially note that this routine 1345; cannot be used for side-channel coded 16bps channels since the effective bps 1346; is 17. 
1347; WATCHOUT: this routine requires that each data array have a buffer of up to 1348; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each 1349; channel n, data[n][-1] through data[n][-3] should be accessible and zero. 1350 ALIGN 16 1351cident FLAC__lpc_restore_signal_asm_ia32_mmx 1352 ;[esp + 40] data[] 1353 ;[esp + 36] lp_quantization 1354 ;[esp + 32] order 1355 ;[esp + 28] qlp_coeff[] 1356 ;[esp + 24] data_len 1357 ;[esp + 20] residual[] 1358 1359 ;ASSERT(order > 0) 1360 1361 push ebp 1362 push ebx 1363 push esi 1364 push edi 1365 1366 mov esi, [esp + 20] 1367 mov edi, [esp + 40] 1368 mov eax, [esp + 32] 1369 mov ebx, [esp + 24] 1370 1371 test ebx, ebx 1372 jz near .end ; do nothing if data_len == 0 1373 cmp eax, byte 4 1374 jb near FLAC__lpc_restore_signal_asm_ia32.begin 1375 1376 mov edx, [esp + 28] 1377 movd mm6, [esp + 36] 1378 mov ebp, esp 1379 1380 and esp, 0xfffffff8 1381 1382 xor ecx, ecx 1383.copy_qlp_loop: 1384 push word [edx + 4 * ecx] 1385 inc ecx 1386 cmp ecx, eax 1387 jnz short .copy_qlp_loop 1388 1389 and ecx, 0x3 1390 test ecx, ecx 1391 je short .za_end 1392 sub ecx, byte 4 1393.za_loop: 1394 push word 0 1395 inc eax 1396 inc ecx 1397 jnz short .za_loop 1398.za_end: 1399 1400 movq mm5, [esp + 2 * eax - 8] 1401 movd mm4, [edi - 16] 1402 punpckldq mm4, [edi - 12] 1403 movd mm0, [edi - 8] 1404 punpckldq mm0, [edi - 4] 1405 packssdw mm4, mm0 1406 1407 cmp eax, byte 4 1408 jnbe short .mmx_4more 1409 1410 ALIGN 16 1411.mmx_4_loop_i: 1412 movq mm7, mm4 1413 pmaddwd mm7, mm5 1414 movq mm0, mm7 1415 punpckhdq mm7, mm7 1416 paddd mm7, mm0 1417 psrad mm7, mm6 1418 movd mm1, [esi] 1419 paddd mm7, mm1 1420 movd [edi], mm7 1421 psllq mm7, 48 1422 psrlq mm4, 16 1423 por mm4, mm7 1424 1425 add esi, byte 4 1426 add edi, byte 4 1427 1428 dec ebx 1429 jnz .mmx_4_loop_i 1430 jmp .mmx_end 1431.mmx_4more: 1432 shl eax, 2 1433 neg eax 1434 add eax, byte 16 1435 ALIGN 16 1436.mmx_4more_loop_i: 1437 mov ecx, edi 1438 add ecx, eax 1439 mov edx, 
esp 1440 1441 movq mm7, mm4 1442 pmaddwd mm7, mm5 1443 1444 ALIGN 16 1445.mmx_4more_loop_j: 1446 movd mm0, [ecx - 16] 1447 punpckldq mm0, [ecx - 12] 1448 movd mm1, [ecx - 8] 1449 punpckldq mm1, [ecx - 4] 1450 packssdw mm0, mm1 1451 pmaddwd mm0, [edx] 1452 paddd mm7, mm0 1453 1454 add edx, byte 8 1455 add ecx, byte 16 1456 cmp ecx, edi 1457 jnz .mmx_4more_loop_j 1458 1459 movq mm0, mm7 1460 punpckhdq mm7, mm7 1461 paddd mm7, mm0 1462 psrad mm7, mm6 1463 movd mm1, [esi] 1464 paddd mm7, mm1 1465 movd [edi], mm7 1466 psllq mm7, 48 1467 psrlq mm4, 16 1468 por mm4, mm7 1469 1470 add esi, byte 4 1471 add edi, byte 4 1472 1473 dec ebx 1474 jnz short .mmx_4more_loop_i 1475.mmx_end: 1476 emms 1477 mov esp, ebp 1478 1479.end: 1480 pop edi 1481 pop esi 1482 pop ebx 1483 pop ebp 1484 ret 1485 1486 1487; ********************************************************************** 1488; 1489;void FLAC__lpc_compute_residual_from_qlp_coefficients_wide(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 1490; { 1491; unsigned i, j; 1492; FLAC__int64 sum; 1493; 1494; FLAC__ASSERT(order > 0); 1495; 1496; for(i = 0; i < data_len; i++) { 1497; sum = 0; 1498; for(j = 0; j < order; j++) 1499; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; 1500; residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); 1501; } 1502; } 1503 ALIGN 16 1504cident FLAC__lpc_compute_residual_from_qlp_coefficients_wide_asm_ia32 1505 ;[esp + 40] residual[] 1506 ;[esp + 36] lp_quantization 1507 ;[esp + 32] order 1508 ;[esp + 28] qlp_coeff[] 1509 ;[esp + 24] data_len 1510 ;[esp + 20] data[] 1511 1512 ;ASSERT(order > 0) 1513 ;ASSERT(order <= 32) 1514 ;ASSERT(lp_quantization <= 31) 1515 1516 push ebp 1517 push ebx 1518 push esi 1519 push edi 1520 1521 mov ebx, [esp + 24] ; ebx = data_len 1522 test ebx, ebx 1523 jz near .end ; do nothing if data_len == 0 1524 1525.begin: 1526 mov eax, [esp + 32] ; eax = order 1527 cmp eax, 1 1528 jg 
short .i_32 1529 1530 mov esi, [esp + 40] ; esi = residual[] 1531 mov edi, [esp + 20] ; edi = data[] 1532 mov ecx, [esp + 28] ; ecx = qlp_coeff[] 1533 mov ebp, [ecx] ; ebp = qlp_coeff[0] 1534 mov eax, [edi - 4] ; eax = data[-1] 1535 mov ecx, [esp + 36] ; cl = lp_quantization 1536 ALIGN 16 1537.i_1_loop_i: 1538 imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] 1539 shrd eax, edx, cl ; 0 <= lp_quantization <= 15 1540 neg eax 1541 add eax, [edi] 1542 mov [esi], eax 1543 mov eax, [edi] 1544 add esi, 4 1545 add edi, 4 1546 dec ebx 1547 jnz .i_1_loop_i 1548 jmp .end 1549 1550.mov_eip_to_eax: 1551 mov eax, [esp] 1552 ret 1553 1554.i_32: ; eax = order 1555 neg eax 1556 add eax, eax 1557 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] 1558 call .mov_eip_to_eax 1559.get_eip0: 1560 add ebp, eax 1561 inc ebp ; compensate for the shorter opcode on the last iteration 1562 1563 mov ebx, [esp + 28] ; ebx = qlp_coeff[] 1564 mov edi, [esp + 20] ; edi = data[] 1565 sub [esp + 40], edi ; residual[] -= data[] 1566 1567 xor ecx, ecx 1568 xor esi, esi 1569 jmp ebp 1570 1571;eax = -- 1572;edx = -- 1573;ecx = 0 1574;esi = 0 1575; 1576;ebx = qlp_coeff[] 1577;edi = data[] 1578;ebp = @address 1579 1580 mov eax, [ebx + 124] ; eax = qlp_coeff[31] 1581 imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] 1582 add ecx, eax 1583 adc esi, edx ; sum += qlp_coeff[31] * data[i-32] 1584 1585 mov eax, [ebx + 120] ; eax = qlp_coeff[30] 1586 imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] 1587 add ecx, eax 1588 adc esi, edx ; sum += qlp_coeff[30] * data[i-31] 1589 1590 mov eax, [ebx + 116] 1591 imul dword [edi - 120] 1592 add ecx, eax 1593 adc esi, edx 1594 1595 mov eax, [ebx + 112] 1596 imul dword [edi - 116] 1597 add ecx, eax 1598 adc esi, edx 1599 1600 mov eax, [ebx + 108] 1601 imul dword [edi - 112] 1602 add ecx, eax 1603 adc esi, edx 1604 1605 mov eax, [ebx + 104] 1606 imul dword [edi - 108] 1607 add ecx, eax 1608 adc esi, edx 1609 1610 mov eax, [ebx + 100] 1611 imul 
dword [edi - 104] 1612 add ecx, eax 1613 adc esi, edx 1614 1615 mov eax, [ebx + 96] 1616 imul dword [edi - 100] 1617 add ecx, eax 1618 adc esi, edx 1619 1620 mov eax, [ebx + 92] 1621 imul dword [edi - 96] 1622 add ecx, eax 1623 adc esi, edx 1624 1625 mov eax, [ebx + 88] 1626 imul dword [edi - 92] 1627 add ecx, eax 1628 adc esi, edx 1629 1630 mov eax, [ebx + 84] 1631 imul dword [edi - 88] 1632 add ecx, eax 1633 adc esi, edx 1634 1635 mov eax, [ebx + 80] 1636 imul dword [edi - 84] 1637 add ecx, eax 1638 adc esi, edx 1639 1640 mov eax, [ebx + 76] 1641 imul dword [edi - 80] 1642 add ecx, eax 1643 adc esi, edx 1644 1645 mov eax, [ebx + 72] 1646 imul dword [edi - 76] 1647 add ecx, eax 1648 adc esi, edx 1649 1650 mov eax, [ebx + 68] 1651 imul dword [edi - 72] 1652 add ecx, eax 1653 adc esi, edx 1654 1655 mov eax, [ebx + 64] 1656 imul dword [edi - 68] 1657 add ecx, eax 1658 adc esi, edx 1659 1660 mov eax, [ebx + 60] 1661 imul dword [edi - 64] 1662 add ecx, eax 1663 adc esi, edx 1664 1665 mov eax, [ebx + 56] 1666 imul dword [edi - 60] 1667 add ecx, eax 1668 adc esi, edx 1669 1670 mov eax, [ebx + 52] 1671 imul dword [edi - 56] 1672 add ecx, eax 1673 adc esi, edx 1674 1675 mov eax, [ebx + 48] 1676 imul dword [edi - 52] 1677 add ecx, eax 1678 adc esi, edx 1679 1680 mov eax, [ebx + 44] 1681 imul dword [edi - 48] 1682 add ecx, eax 1683 adc esi, edx 1684 1685 mov eax, [ebx + 40] 1686 imul dword [edi - 44] 1687 add ecx, eax 1688 adc esi, edx 1689 1690 mov eax, [ebx + 36] 1691 imul dword [edi - 40] 1692 add ecx, eax 1693 adc esi, edx 1694 1695 mov eax, [ebx + 32] 1696 imul dword [edi - 36] 1697 add ecx, eax 1698 adc esi, edx 1699 1700 mov eax, [ebx + 28] 1701 imul dword [edi - 32] 1702 add ecx, eax 1703 adc esi, edx 1704 1705 mov eax, [ebx + 24] 1706 imul dword [edi - 28] 1707 add ecx, eax 1708 adc esi, edx 1709 1710 mov eax, [ebx + 20] 1711 imul dword [edi - 24] 1712 add ecx, eax 1713 adc esi, edx 1714 1715 mov eax, [ebx + 16] 1716 imul dword [edi - 20] 1717 add ecx, eax 1718 adc 
esi, edx 1719 1720 mov eax, [ebx + 12] 1721 imul dword [edi - 16] 1722 add ecx, eax 1723 adc esi, edx 1724 1725 mov eax, [ebx + 8] 1726 imul dword [edi - 12] 1727 add ecx, eax 1728 adc esi, edx 1729 1730 mov eax, [ebx + 4] 1731 imul dword [edi - 8] 1732 add ecx, eax 1733 adc esi, edx 1734 1735 mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) 1736 imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] 1737 add ecx, eax 1738 adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] 1739 1740.jumper_0: 1741 mov edx, ecx 1742;esi:edx = sum 1743 mov ecx, [esp + 36] ; cl = lp_quantization 1744 shrd edx, esi, cl ; edx = (sum >> lp_quantization) 1745;eax = -- 1746;ecx = -- 1747;edx = sum >> lp_q 1748;esi = -- 1749 neg edx ; edx = -(sum >> lp_quantization) 1750 mov eax, [esp + 40] ; residual[] - data[] 1751 add edx, [edi] ; edx = data[i] - (sum >> lp_quantization) 1752 mov [edi + eax], edx 1753 add edi, 4 1754 1755 dec dword [esp + 24] 1756 jz short .end 1757 xor ecx, ecx 1758 xor esi, esi 1759 jmp ebp 1760 1761.end: 1762 pop edi 1763 pop esi 1764 pop ebx 1765 pop ebp 1766 ret 1767 1768; ********************************************************************** 1769; 1770; void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]) 1771; { 1772; unsigned i, j; 1773; FLAC__int64 sum; 1774; 1775; FLAC__ASSERT(order > 0); 1776; 1777; for(i = 0; i < data_len; i++) { 1778; sum = 0; 1779; for(j = 0; j < order; j++) 1780; sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1]; 1781; data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization); 1782; } 1783; } 1784 ALIGN 16 1785cident FLAC__lpc_restore_signal_wide_asm_ia32 1786 ;[esp + 40] data[] 1787 ;[esp + 36] lp_quantization 1788 ;[esp + 32] order 1789 ;[esp + 28] qlp_coeff[] 1790 ;[esp + 24] data_len 1791 ;[esp + 20] residual[] 1792 1793 ;ASSERT(order > 0) 1794 ;ASSERT(order <= 32) 1795 
;ASSERT(lp_quantization <= 31) 1796 1797 push ebp 1798 push ebx 1799 push esi 1800 push edi 1801 1802 mov ebx, [esp + 24] ; ebx = data_len 1803 test ebx, ebx 1804 jz near .end ; do nothing if data_len == 0 1805 1806.begin: 1807 mov eax, [esp + 32] ; eax = order 1808 cmp eax, 1 1809 jg short .x87_32 1810 1811 mov esi, [esp + 20] ; esi = residual[] 1812 mov edi, [esp + 40] ; edi = data[] 1813 mov ecx, [esp + 28] ; ecx = qlp_coeff[] 1814 mov ebp, [ecx] ; ebp = qlp_coeff[0] 1815 mov eax, [edi - 4] ; eax = data[-1] 1816 mov ecx, [esp + 36] ; cl = lp_quantization 1817 ALIGN 16 1818.x87_1_loop_i: 1819 imul ebp ; edx:eax = qlp_coeff[0] * (FLAC__int64)data[i-1] 1820 shrd eax, edx, cl ; 0 <= lp_quantization <= 15 1821; 1822 add eax, [esi] 1823 mov [edi], eax 1824; 1825 add esi, 4 1826 add edi, 4 1827 dec ebx 1828 jnz .x87_1_loop_i 1829 jmp .end 1830 1831.mov_eip_to_eax: 1832 mov eax, [esp] 1833 ret 1834 1835.x87_32: ; eax = order 1836 neg eax 1837 add eax, eax 1838 lea ebp, [eax + eax * 4 + .jumper_0 - .get_eip0] 1839 call .mov_eip_to_eax 1840.get_eip0: 1841 add ebp, eax 1842 inc ebp ; compensate for the shorter opcode on the last iteration 1843 1844 mov ebx, [esp + 28] ; ebx = qlp_coeff[] 1845 mov edi, [esp + 40] ; esi = data[] 1846 sub [esp + 20], edi ; residual[] -= data[] 1847 1848 xor ecx, ecx 1849 xor esi, esi 1850 jmp ebp 1851 1852;eax = -- 1853;edx = -- 1854;ecx = 0 1855;esi = 0 1856; 1857;ebx = qlp_coeff[] 1858;edi = data[] 1859;ebp = @address 1860 1861 mov eax, [ebx + 124] ; eax = qlp_coeff[31] 1862 imul dword [edi - 128] ; edx:eax = qlp_coeff[31] * data[i-32] 1863 add ecx, eax 1864 adc esi, edx ; sum += qlp_coeff[31] * data[i-32] 1865 1866 mov eax, [ebx + 120] ; eax = qlp_coeff[30] 1867 imul dword [edi - 124] ; edx:eax = qlp_coeff[30] * data[i-31] 1868 add ecx, eax 1869 adc esi, edx ; sum += qlp_coeff[30] * data[i-31] 1870 1871 mov eax, [ebx + 116] 1872 imul dword [edi - 120] 1873 add ecx, eax 1874 adc esi, edx 1875 1876 mov eax, [ebx + 112] 1877 imul dword [edi - 
116] 1878 add ecx, eax 1879 adc esi, edx 1880 1881 mov eax, [ebx + 108] 1882 imul dword [edi - 112] 1883 add ecx, eax 1884 adc esi, edx 1885 1886 mov eax, [ebx + 104] 1887 imul dword [edi - 108] 1888 add ecx, eax 1889 adc esi, edx 1890 1891 mov eax, [ebx + 100] 1892 imul dword [edi - 104] 1893 add ecx, eax 1894 adc esi, edx 1895 1896 mov eax, [ebx + 96] 1897 imul dword [edi - 100] 1898 add ecx, eax 1899 adc esi, edx 1900 1901 mov eax, [ebx + 92] 1902 imul dword [edi - 96] 1903 add ecx, eax 1904 adc esi, edx 1905 1906 mov eax, [ebx + 88] 1907 imul dword [edi - 92] 1908 add ecx, eax 1909 adc esi, edx 1910 1911 mov eax, [ebx + 84] 1912 imul dword [edi - 88] 1913 add ecx, eax 1914 adc esi, edx 1915 1916 mov eax, [ebx + 80] 1917 imul dword [edi - 84] 1918 add ecx, eax 1919 adc esi, edx 1920 1921 mov eax, [ebx + 76] 1922 imul dword [edi - 80] 1923 add ecx, eax 1924 adc esi, edx 1925 1926 mov eax, [ebx + 72] 1927 imul dword [edi - 76] 1928 add ecx, eax 1929 adc esi, edx 1930 1931 mov eax, [ebx + 68] 1932 imul dword [edi - 72] 1933 add ecx, eax 1934 adc esi, edx 1935 1936 mov eax, [ebx + 64] 1937 imul dword [edi - 68] 1938 add ecx, eax 1939 adc esi, edx 1940 1941 mov eax, [ebx + 60] 1942 imul dword [edi - 64] 1943 add ecx, eax 1944 adc esi, edx 1945 1946 mov eax, [ebx + 56] 1947 imul dword [edi - 60] 1948 add ecx, eax 1949 adc esi, edx 1950 1951 mov eax, [ebx + 52] 1952 imul dword [edi - 56] 1953 add ecx, eax 1954 adc esi, edx 1955 1956 mov eax, [ebx + 48] 1957 imul dword [edi - 52] 1958 add ecx, eax 1959 adc esi, edx 1960 1961 mov eax, [ebx + 44] 1962 imul dword [edi - 48] 1963 add ecx, eax 1964 adc esi, edx 1965 1966 mov eax, [ebx + 40] 1967 imul dword [edi - 44] 1968 add ecx, eax 1969 adc esi, edx 1970 1971 mov eax, [ebx + 36] 1972 imul dword [edi - 40] 1973 add ecx, eax 1974 adc esi, edx 1975 1976 mov eax, [ebx + 32] 1977 imul dword [edi - 36] 1978 add ecx, eax 1979 adc esi, edx 1980 1981 mov eax, [ebx + 28] 1982 imul dword [edi - 32] 1983 add ecx, eax 1984 adc esi, 
edx 1985 1986 mov eax, [ebx + 24] 1987 imul dword [edi - 28] 1988 add ecx, eax 1989 adc esi, edx 1990 1991 mov eax, [ebx + 20] 1992 imul dword [edi - 24] 1993 add ecx, eax 1994 adc esi, edx 1995 1996 mov eax, [ebx + 16] 1997 imul dword [edi - 20] 1998 add ecx, eax 1999 adc esi, edx 2000 2001 mov eax, [ebx + 12] 2002 imul dword [edi - 16] 2003 add ecx, eax 2004 adc esi, edx 2005 2006 mov eax, [ebx + 8] 2007 imul dword [edi - 12] 2008 add ecx, eax 2009 adc esi, edx 2010 2011 mov eax, [ebx + 4] 2012 imul dword [edi - 8] 2013 add ecx, eax 2014 adc esi, edx 2015 2016 mov eax, [ebx] ; eax = qlp_coeff[ 0] (NOTE: one byte missing from instruction) 2017 imul dword [edi - 4] ; edx:eax = qlp_coeff[ 0] * data[i- 1] 2018 add ecx, eax 2019 adc esi, edx ; sum += qlp_coeff[ 0] * data[i- 1] 2020 2021.jumper_0: 2022 mov edx, ecx 2023;esi:edx = sum 2024 mov ecx, [esp + 36] ; cl = lp_quantization 2025 shrd edx, esi, cl ; edx = (sum >> lp_quantization) 2026;eax = -- 2027;ecx = -- 2028;edx = sum >> lp_q 2029;esi = -- 2030; 2031 mov eax, [esp + 20] ; residual[] - data[] 2032 add edx, [edi + eax] ; edx = residual[i] + (sum >> lp_quantization) 2033 mov [edi], edx ; data[i] = residual[i] + (sum >> lp_quantization) 2034 add edi, 4 2035 2036 dec dword [esp + 24] 2037 jz short .end 2038 xor ecx, ecx 2039 xor esi, esi 2040 jmp ebp 2041 2042.end: 2043 pop edi 2044 pop esi 2045 pop ebx 2046 pop ebp 2047 ret 2048 2049; end 2050