1 /*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /* ---- includes ----------------------------------------------------------- */
18
19 #include "b_BasicEm/Math.h"
20 #include "b_BasicEm/Functions.h"
21
22 /* ---- related objects --------------------------------------------------- */
23
24 /* ---- typedefs ----------------------------------------------------------- */
25
26 /* ---- constants ---------------------------------------------------------- */
27
28 /* ------------------------------------------------------------------------- */
29
30 /* ========================================================================= */
31 /* */
32 /* ---- \ghd{ external functions } ----------------------------------------- */
33 /* */
34 /* ========================================================================= */
35
36 #if defined( HW_SSE2 )
37 extern int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
38 extern int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA );
39 #endif
40
41 #if defined( HW_FR71 )
42 int32 bbs_dotProduct_fr71( const int16* vec1A, const int16* vec2A, uint32 sizeA );
43 #endif
44
45 /* ------------------------------------------------------------------------- */
46
bbs_sqrt32(uint32 valA)47 uint16 bbs_sqrt32( uint32 valA )
48 {
49 uint32 rootL = 0;
50 uint32 expL = 0;
51 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
52 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
53 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
54 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
55 switch( expL >> 1 )
56 {
57 case 15: rootL += ( ( rootL + 0x8000 ) * ( rootL + 0x8000 ) <= valA ) << 15;
58 case 14: rootL += ( ( rootL + 0x4000 ) * ( rootL + 0x4000 ) <= valA ) << 14;
59 case 13: rootL += ( ( rootL + 0x2000 ) * ( rootL + 0x2000 ) <= valA ) << 13;
60 case 12: rootL += ( ( rootL + 0x1000 ) * ( rootL + 0x1000 ) <= valA ) << 12;
61 case 11: rootL += ( ( rootL + 0x0800 ) * ( rootL + 0x0800 ) <= valA ) << 11;
62 case 10: rootL += ( ( rootL + 0x0400 ) * ( rootL + 0x0400 ) <= valA ) << 10;
63 case 9: rootL += ( ( rootL + 0x0200 ) * ( rootL + 0x0200 ) <= valA ) << 9;
64 case 8: rootL += ( ( rootL + 0x0100 ) * ( rootL + 0x0100 ) <= valA ) << 8;
65 case 7: rootL += ( ( rootL + 0x0080 ) * ( rootL + 0x0080 ) <= valA ) << 7;
66 case 6: rootL += ( ( rootL + 0x0040 ) * ( rootL + 0x0040 ) <= valA ) << 6;
67 case 5: rootL += ( ( rootL + 0x0020 ) * ( rootL + 0x0020 ) <= valA ) << 5;
68 case 4: rootL += ( ( rootL + 0x0010 ) * ( rootL + 0x0010 ) <= valA ) << 4;
69 case 3: rootL += ( ( rootL + 0x0008 ) * ( rootL + 0x0008 ) <= valA ) << 3;
70 case 2: rootL += ( ( rootL + 0x0004 ) * ( rootL + 0x0004 ) <= valA ) << 2;
71 case 1: rootL += ( ( rootL + 0x0002 ) * ( rootL + 0x0002 ) <= valA ) << 1;
72 case 0: rootL += ( ( rootL + 0x0001 ) * ( rootL + 0x0001 ) <= valA ) << 0;
73 }
74
75 return ( uint16 )rootL;
76 }
77
78 /* ------------------------------------------------------------------------- */
79
bbs_sqrt16(uint16 valA)80 uint8 bbs_sqrt16( uint16 valA )
81 {
82 uint16 rootL = 0;
83 rootL += ( ( rootL + 0x80 ) * ( rootL + 0x80 ) <= valA ) << 7;
84 rootL += ( ( rootL + 0x40 ) * ( rootL + 0x40 ) <= valA ) << 6;
85 rootL += ( ( rootL + 0x20 ) * ( rootL + 0x20 ) <= valA ) << 5;
86 rootL += ( ( rootL + 0x10 ) * ( rootL + 0x10 ) <= valA ) << 4;
87 rootL += ( ( rootL + 0x08 ) * ( rootL + 0x08 ) <= valA ) << 3;
88 rootL += ( ( rootL + 0x04 ) * ( rootL + 0x04 ) <= valA ) << 2;
89 rootL += ( ( rootL + 0x02 ) * ( rootL + 0x02 ) <= valA ) << 1;
90 rootL += ( ( rootL + 0x01 ) * ( rootL + 0x01 ) <= valA ) << 0;
91 return ( uint8 )rootL;
92 }
93
94 /* ------------------------------------------------------------------------- */
95
96 /* table of sqrt and slope values */
97 const uint32 bbs_fastSqrt32_tableG[] =
98 {
99 268435456, 1016, 272596992, 1000, 276692992, 987, 280735744, 972,
100 284717056, 959, 288645120, 946, 292519936, 933, 296341504, 922,
101 300118016, 910, 303845376, 899, 307527680, 889, 311169024, 878,
102 314765312, 869, 318324736, 858, 321839104, 850, 325320704, 840,
103 328761344, 832, 332169216, 824, 335544320, 815, 338882560, 807,
104 342188032, 799, 345460736, 792, 348704768, 785, 351920128, 777,
105 355102720, 771, 358260736, 764, 361390080, 757, 364490752, 751,
106 367566848, 745, 370618368, 739, 373645312, 732, 376643584, 727,
107 379621376, 722, 382578688, 715, 385507328, 711, 388419584, 705,
108 391307264, 700, 394174464, 695, 397021184, 689, 399843328, 686,
109 402653184, 680, 405438464, 675, 408203264, 672, 410955776, 666,
110 413683712, 663, 416399360, 658, 419094528, 653, 421769216, 650,
111 424431616, 646, 427077632, 641, 429703168, 638, 432316416, 634,
112 434913280, 630, 437493760, 627, 440061952, 622, 442609664, 620,
113 445149184, 615, 447668224, 613, 450179072, 609, 452673536, 605,
114 455151616, 602, 457617408, 600, 460075008, 595, 462512128, 593,
115 464941056, 590, 467357696, 587, 469762048, 583, 472150016, 581,
116 474529792, 578, 476897280, 575, 479252480, 572, 481595392, 569,
117 483926016, 567, 486248448, 564, 488558592, 561, 490856448, 559,
118 493146112, 556, 495423488, 553, 497688576, 552, 499949568, 548,
119 502194176, 546, 504430592, 544, 506658816, 541, 508874752, 539,
120 511082496, 537, 513282048, 534, 515469312, 533, 517652480, 529,
121 519819264, 528, 521981952, 526, 524136448, 523, 526278656, 521,
122 528412672, 519, 530538496, 517, 532656128, 515, 534765568, 514
123 };
124
bbs_fastSqrt32(uint32 valA)125 uint16 bbs_fastSqrt32( uint32 valA )
126 {
127 uint32 expL = 0;
128 uint32 valL;
129 uint32 offsL;
130 uint32 indexL = 0;
131
132 if( valA == 0 ) return 0;
133
134 /* compute closest even size exponent of valA */
135 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
136 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
137 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
138 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
139
140 valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
141 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 12 ) ) >> 13;
142 indexL = ( valL >> 24 ) & 0xFE;
143
144 return ( bbs_fastSqrt32_tableG[ indexL ] + offsL * bbs_fastSqrt32_tableG[ indexL + 1 ] ) >> ( 28 - ( expL >> 1 ) );
145 }
146
147 /* ------------------------------------------------------------------------- */
148
149 /* table of 1/sqrt (1.31) and negative slope (.15) values
150 referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
151 const uint32 bbs_invSqrt32_tableG[] =
152 {
153 2147483648u, 1001, 2114682880, 956, 2083356672, 915, 2053373952, 877,
154 2024636416, 840, 1997111296, 808, 1970634752, 776, 1945206784, 746,
155 1920761856, 720, 1897168896, 693, 1874460672, 669, 1852538880, 646,
156 1831370752, 625, 1810890752, 604, 1791098880, 584, 1771962368, 567,
157 1753382912, 548, 1735426048, 533, 1717960704, 516, 1701052416, 502,
158 1684602880, 487, 1668644864, 474, 1653112832, 461, 1638006784, 448,
159 1623326720, 436, 1609039872, 426, 1595080704, 414, 1581514752, 404,
160 1568276480, 394, 1555365888, 384, 1542782976, 375, 1530494976, 367,
161 1518469120, 357, 1506770944, 350, 1495302144, 342, 1484095488, 334,
162 1473150976, 327, 1462435840, 320, 1451950080, 313, 1441693696, 307,
163 1431633920, 300, 1421803520, 294, 1412169728, 289, 1402699776, 282,
164 1393459200, 277, 1384382464, 272, 1375469568, 266, 1366753280, 262,
165 1358168064, 257, 1349746688, 251, 1341521920, 248, 1333395456, 243,
166 1325432832, 238, 1317634048, 235, 1309933568, 230, 1302396928, 227,
167 1294958592, 222, 1287684096, 219, 1280507904, 216, 1273430016, 211,
168 1266515968, 209, 1259667456, 205, 1252950016, 202, 1246330880, 198,
169 1239842816, 196, 1233420288, 192, 1227128832, 190, 1220902912, 187,
170 1214775296, 184, 1208745984, 181, 1202814976, 179, 1196949504, 176,
171 1191182336, 173, 1185513472, 171, 1179910144, 169, 1174372352, 166,
172 1168932864, 164, 1163558912, 162, 1158250496, 160, 1153007616, 157,
173 1147863040, 155, 1142784000, 154, 1137737728, 151, 1132789760, 149,
174 1127907328, 148, 1123057664, 145, 1118306304, 144, 1113587712, 142,
175 1108934656, 140, 1104347136, 138, 1099825152, 137, 1095335936, 135,
176 1090912256, 134, 1086521344, 131, 1082228736, 131, 1077936128, 128
177 };
178
bbs_invSqrt32(uint32 valA)179 uint32 bbs_invSqrt32( uint32 valA )
180 {
181
182 uint32 expL = 0;
183 uint32 valL;
184 uint32 offsL;
185 uint32 indexL = 0;
186
187 if( valA == 0U ) return 0x80000000;
188
189 /* compute closest even size exponent of valA */
190 expL += ( ( valA >> ( expL + 0x10 ) ) != 0 ) << 4;
191 expL += ( ( valA >> ( expL + 0x08 ) ) != 0 ) << 3;
192 expL += ( ( valA >> ( expL + 0x04 ) ) != 0 ) << 2;
193 expL += ( ( valA >> ( expL + 0x02 ) ) != 0 ) << 1;
194
195 valL = ( ( valA << ( 30 - expL ) ) - 1073741824 ); /* ( 1 << 30 ) */
196 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 9 ) ) >> 10;
197 indexL = ( valL >> 24 ) & 0xFE;
198
199 return ( bbs_invSqrt32_tableG[ indexL ] - offsL * bbs_invSqrt32_tableG[ indexL + 1 ] ) >> ( expL >> 1 );
200 }
201
202 /* ------------------------------------------------------------------------- */
203
204 /* table of 1/( x + 1 ) (2.30) and negative slope (.14) values
205 referenced in b_GaborCueEm/focusDispAsm.s55, do not rename or remove ! */
206 const int32 bbs_inv32_tableG[] =
207 {
208 1073741824, 1986, 1041203200, 1870, 1010565120, 1762, 981696512, 1664,
209 954433536, 1575, 928628736, 1491, 904200192, 1415, 881016832, 1345,
210 858980352, 1278, 838041600, 1218, 818085888, 1162, 799047680, 1108,
211 780894208, 1059, 763543552, 1013, 746946560, 970, 731054080, 930,
212 715816960, 891, 701218816, 856, 687194112, 823, 673710080, 791,
213 660750336, 761, 648282112, 732, 636289024, 706, 624721920, 681,
214 613564416, 657, 602800128, 635, 592396288, 613, 582352896, 592,
215 572653568, 573, 563265536, 554, 554188800, 537, 545390592, 520,
216 };
217
bbs_inv32(int32 valA)218 int32 bbs_inv32( int32 valA )
219 {
220
221 uint32 expL = 0;
222 int32 signL = ( ( valA >> 30 ) & 0xFFFFFFFE ) + 1;
223 int32 valL = signL * valA;
224 int32 offsL;
225 uint32 indexL = 0;
226
227 if( valL <= ( int32 ) 1 ) return 0x40000000 * signL;
228
229 /* compute size exponent of valL */
230 expL += ( ( valL >> ( expL + 0x10 ) ) != 0 ) << 4;
231 expL += ( ( valL >> ( expL + 0x08 ) ) != 0 ) << 3;
232 expL += ( ( valL >> ( expL + 0x04 ) ) != 0 ) << 2;
233 expL += ( ( valL >> ( expL + 0x02 ) ) != 0 ) << 1;
234 expL += ( ( valL >> ( expL + 0x01 ) ) != 0 );
235
236 valL = ( ( valL << ( 30 - expL ) ) - 1073741824 ); /*( 1U << 30 )*/
237 offsL = ( ( valL & 0x01FFFFFF ) + ( 1 << 10 ) ) >> 11;
238 indexL = ( valL >> 24 ) & 0xFE;
239
240 return signL * ( ( ( ( bbs_inv32_tableG[ indexL ] - offsL * bbs_inv32_tableG[ indexL + 1 ] ) >> ( expL - 1 ) ) + 1 ) >> 1 );
241 }
242
243 /* ------------------------------------------------------------------------- */
244
bbs_intLog2(uint32 valA)245 uint32 bbs_intLog2( uint32 valA )
246 {
247 uint32 expL = 0;
248 expL += 0x10 * ( ( valA >> ( expL + 0x10 ) ) != 0 );
249 expL += 0x08 * ( ( valA >> ( expL + 0x08 ) ) != 0 );
250 expL += 0x04 * ( ( valA >> ( expL + 0x04 ) ) != 0 );
251 expL += 0x02 * ( ( valA >> ( expL + 0x02 ) ) != 0 );
252 expL += 0x01 * ( ( valA >> ( expL + 0x01 ) ) != 0 );
253 return expL;
254 }
255
256 /* ------------------------------------------------------------------------- */
257
258 const uint32 bbs_pow2M1_tableG[] =
259 {
260 0, 713, 46769127, 721, 94047537, 729, 141840775, 737,
261 190154447, 745, 238994221, 753, 288365825, 761, 338275050, 769,
262 388727751, 778, 439729846, 786, 491287318, 795, 543406214, 803,
263 596092647, 812, 649352798, 821, 703192913, 830, 757619310, 839,
264 812638371, 848, 868256550, 857, 924480371, 867, 981316430, 876,
265 1038771393, 886, 1096851999, 895, 1155565062, 905, 1214917468, 915,
266 1274916179, 925, 1335568233, 935, 1396880745, 945, 1458860907, 956,
267 1521515988, 966, 1584853338, 976, 1648880387, 987, 1713604645, 998,
268 1779033703, 1009, 1845175238, 1020, 1912037006, 1031, 1979626852, 1042,
269 2047952702, 1053, 2117022572, 1065, 2186844564u, 1077, 2257426868u, 1088,
270 2328777762u, 1100, 2400905617u, 1112, 2473818892u, 1124, 2547526141u, 1136,
271 2622036010u, 1149, 2697357237u, 1161, 2773498659u, 1174, 2850469207u, 1187,
272 2928277909u, 1200, 3006933892u, 1213, 3086446383u, 1226, 3166824708u, 1239,
273 3248078296u, 1253, 3330216677u, 1266, 3413249486u, 1280, 3497186464u, 1294,
274 3582037455u, 1308, 3667812413u, 1323, 3754521399u, 1337, 3842174584u, 1352,
275 3930782250u, 1366, 4020354790u, 1381, 4110902710u, 1396, 4202436633u, 1411
276 };
277
bbs_pow2M1(uint32 valA)278 uint32 bbs_pow2M1( uint32 valA )
279 {
280 uint32 offsL = ( valA & 0x03FFFFFF ) >> 10;
281 uint16 indexL = ( ( valA & 0xFC000000 ) >> 26 ) << 1;
282 return bbs_pow2M1_tableG[ indexL ] + offsL * bbs_pow2M1_tableG[ indexL + 1 ];
283 }
284
285 /* ------------------------------------------------------------------------- */
286
bbs_pow2(int32 valA)287 uint32 bbs_pow2( int32 valA )
288 {
289 int32 shiftL = 16 - ( valA >> 27 );
290 uint32 offsL = ( uint32 )( valA << 5 );
291 if( shiftL == 32 ) return 1;
292 return ( 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
293 }
294
295 /* ------------------------------------------------------------------------- */
296
bbs_exp(int32 valA)297 uint32 bbs_exp( int32 valA )
298 {
299 int32 adjustedL;
300 int32 shiftL;
301 int32 offsL;
302
303 /* check boundaries to avoid overflow */
304 if( valA < -1488522236 )
305 {
306 return 0;
307 }
308 else if( valA > 1488522236 )
309 {
310 return 0xFFFFFFFF;
311 }
312
313 /* multily valA with 1/ln(2) in order to use function 2^x instead of e^x */
314 adjustedL = ( valA >> 16 ) * 94548 + ( ( ( ( ( uint32 )valA ) & 0x0FFFF ) * 47274 ) >> 15 );
315
316 shiftL = 16 - ( adjustedL >> 27 );
317 if( shiftL == 32 ) return 1;
318 offsL = ( uint32 )( adjustedL << 5 );
319 return ( ( int32 ) 1 << ( 32 - shiftL ) ) + ( bbs_pow2M1( offsL ) >> shiftL );
320 }
321
322 /* ------------------------------------------------------------------------- */
323
bbs_satS16(int32 valA)324 int16 bbs_satS16( int32 valA )
325 {
326 if( valA > 32767 ) return 32767;
327 if( valA < -32768 ) return -32768;
328 return valA;
329 }
330
331 /* ------------------------------------------------------------------------- */
332
333 #if defined( HW_i586 ) || defined( HW_i686 )
334
335 /* Windows section */
336 #if defined( WIN32 ) && !defined( WIN64 )
337
338 /* disable warning "no return value"*/
339 #pragma warning( disable : 4035 )
340
341 /**
342 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
343 */
bbs_dotProduct_intelMMX16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)344 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
345 {
346 __asm
347 {
348 push esi
349 push edi
350
351 mov eax, vec1A
352 mov ebx, vec2A
353
354 mov ecx, sizeA
355
356 pxor mm4, mm4
357 pxor mm6, mm6
358
359 pxor mm7, mm7
360 shr ecx, 4
361
362 inner_loop_start:
363 movq mm0, 0[eax]
364 paddd mm7, mm4
365
366 movq mm1, 0[ebx]
367 paddd mm7, mm6
368
369 movq mm2, 8[eax]
370 pmaddwd mm0, mm1
371
372 movq mm3, 8[ebx]
373
374 movq mm4, 16[eax]
375 pmaddwd mm2, mm3
376
377 movq mm5, 16[ebx]
378 paddd mm7, mm0
379
380 movq mm6, 24[eax]
381 pmaddwd mm4, mm5
382
383 pmaddwd mm6, 24[ebx]
384 paddd mm7, mm2
385
386 add eax, 32
387 add ebx, 32
388
389 dec ecx
390 jnz inner_loop_start
391
392 paddd mm7, mm4
393
394 paddd mm7, mm6
395
396 movq mm0, mm7
397
398 psrlq mm0, 32
399
400 paddd mm7, mm0
401
402 movd eax, mm7
403
404 emms
405 pop edi
406 pop esi
407 }
408 }
409
410 #pragma warning( default : 4035 )
411
412 /* gcc compiler section */
413 #elif defined( epl_LINUX ) || defined( CYGWIN )
414
415 /**
416 * computes a fast dot product using intel MMX, sizeA must be multiple of 16 and >0
417 */
bbs_dotProduct_intelMMX16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)418 int32 bbs_dotProduct_intelMMX16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
419 {
420 int32 resultL;
421
422 __asm__ __volatile__(
423
424 "movl %1,%%eax\n\t"
425 "movl %2,%%ebx\n\t"
426
427 "movl %3,%%ecx\n\t"
428
429 "pxor %%mm4,%%mm4\n\t"
430 "pxor %%mm6,%%mm6\n\t"
431
432 "pxor %%mm7, %%mm7\n\t"
433 "shrl $4, %%ecx\n\t"
434
435 "\n1:\t"
436 "movq 0(%%eax),%%mm0\n\t"
437 "paddd %%mm4,%%mm7\n\t"
438
439 "movq 0( %%ebx ),%%mm1\n\t"
440 "paddd %%mm6,%%mm7\n\t"
441
442 "movq 8( %%eax ),%%mm2\n\t"
443 "pmaddwd %%mm1,%%mm0\n\t"
444
445 "movq 8( %%ebx ),%%mm3\n\t"
446
447 "movq 16( %%eax ),%%mm4\n\t"
448 "pmaddwd %%mm3,%%mm2\n\t"
449
450 "movq 16( %%ebx ),%%mm5\n\t"
451 "paddd %%mm0,%%mm7\n\t"
452
453 "movq 24( %%eax ),%%mm6\n\t"
454 "pmaddwd %%mm5,%%mm4\n\t"
455
456 "pmaddwd 24( %%ebx ),%%mm6\n\t"
457 "paddd %%mm2,%%mm7\n\t"
458
459 "addl $32,%%eax\n\t"
460 "addl $32,%%ebx\n\t"
461
462 "decl %%ecx\n\t"
463 "jnz 1b\n\t"
464
465 "paddd %%mm4,%%mm7\n\t"
466 "paddd %%mm6,%%mm7\n\t"
467
468 "movq %%mm7,%%mm0\n\t"
469
470 "psrlq $32,%%mm0\n\t"
471
472 "paddd %%mm0,%%mm7\n\t"
473
474 "movd %%mm7,%0\n\t"
475
476 "emms\n\t"
477
478 : "=&g" ( resultL )
479 : "g" ( vec1A ), "g" ( vec2A ), "g" ( sizeA )
480 : "si", "di", "ax", "bx", "cx", "st", "memory" );
481
482 return resultL;
483 }
484
485 #endif /* epl_LINUX, CYGWIN */
486
487 #endif /* HW_i586 || HW_i686 */
488
489 /* ------------------------------------------------------------------------- */
490
491 #ifdef HW_TMS320C6x
492 /**
493 * Calls fast assembler version of dotproduct for DSP.
494 * dotProduct_C62x is implemented in file dotprod.asm and expects input vectors
495 * of even length.
496 */
bbs_dotProduct_dsp(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)497 int32 bbs_dotProduct_dsp( const int16* vec1A, const int16* vec2A, uint32 sizeA )
498 {
499 if( sizeA & 1 )
500 {
501 int32 resultL;
502 resultL = dotProduct_C62x( vec1A, vec2A, sizeA - 1 );
503 return resultL + ( int32 ) *( vec1A + sizeA - 1 ) * *( vec2A + sizeA - 1 );
504 }
505 else
506 {
507 return dotProduct_C62x( vec1A, vec2A, sizeA );
508 }
509 }
510 #endif /* HW_TMS320C6x */
511
512 /* ------------------------------------------------------------------------- */
513
514 /* 16 dot product for the PS2/EE processor */
515 /* input vectors MUST be 128 bit aligned ! */
516
517 #if defined( epl_LINUX ) && defined( HW_EE )
518
bbs_dotProduct_EE(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)519 int32 bbs_dotProduct_EE( const int16* vec1A, const int16* vec2A, uint32 sizeA )
520 {
521 int32 resultL = 0,
522 iL = sizeA >> 3,
523 jL = sizeA - ( iL << 3 );
524
525 if( iL > 0 )
526 {
527 /* multiply-add elements of input vectors in sets of 8 */
528 int32 accL[ 4 ], t1L, t2L, t3L;
529 asm volatile (
530 "pxor %4, %2, %2\n\t" /* reset 8 accumulators (LO and HI register) to 0 */
531 "pmtlo %4\n\t"
532 "pmthi %4\n\t"
533
534 "\n__begin_loop:\t"
535
536 "lq %2,0(%0)\n\t" /* load 8 pairs of int16 */
537 "lq %3,0(%1)\n\t"
538
539 "addi %0,%0,16\n\t" /* vec1L += 16 */
540 "addi %1,%1,16\n\t" /* vec2L += 16 */
541 "addi %7,%7,-1\n\t" /* iL-- */
542
543 "pmaddh %4,%2,%3\n\t" /* parallel multiply-add of 8 pairs of int16 */
544
545 "bgtzl %7,__begin_loop\n\t" /* if iL > 0 goto _begin_loop */
546
547 "pmflo %2\n\t" /* parallel add 8 accumulators , store remaining 4 accumulators in tmpL */
548 "pmfhi %3\n\t"
549 "paddw %4,%2,%3\n\t"
550 "sq %4,0(%8)\n\t"
551 : "=r" ( vec1A ), "=r" ( vec2A ), "=r" ( t1L ), "=r" ( t2L ), "=r" ( t3L )
552 : "0" ( vec1A ), "1" ( vec2A ), "r" ( iL ), "r" ( accL )
553 : "memory" );
554
555 /* add 4 parallel accumulators */
556 resultL += accL[ 0 ] + accL[ 1 ] + accL[ 2 ] + accL[ 3 ];
557 }
558
559 /* multiply-add remaining elements of input vectors */
560 for( ; jL--; ) resultL += ( int32 ) *vec1A++ * *vec2A++;
561
562 return resultL;
563 }
564
565 #endif
566
567 /* ------------------------------------------------------------------------- */
568
569 #if defined( HW_ARMv5TE )
570
571 /* fast 16 dot product for ARM9E cores (DSP extensions).
572 * input vectors must be 32 bit aligned
573 */
bbs_dotProduct_arm9e(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)574 int32 bbs_dotProduct_arm9e( const int16* vec1A, const int16* vec2A, uint32 sizeA )
575 {
576 int32 accuL = 0;
577
578 int32* v1PtrL = ( int32* )vec1A;
579 int32* v2PtrL = ( int32* )vec2A;
580
581 for( ; sizeA >= 4; sizeA -= 4 )
582 {
583 __asm {
584 smlabb accuL, *v1PtrL, *v2PtrL, accuL;
585 smlatt accuL, *v1PtrL, *v2PtrL, accuL;
586 }
587 v1PtrL++; v2PtrL++;
588 __asm {
589 smlabb accuL, *v1PtrL, *v2PtrL, accuL;
590 smlatt accuL, *v1PtrL, *v2PtrL, accuL;
591 }
592 v1PtrL++; v2PtrL++;
593 }
594
595 vec1A = ( int16* )v1PtrL;
596 vec2A = ( int16* )v2PtrL;
597
598 /* multiply-add remaining elements of input vectors */
599 for( ; sizeA > 0; sizeA-- ) accuL += ( int32 )*vec1A++ * *vec2A++;
600
601 return accuL;
602 }
603
604 #endif
605
606 /* ------------------------------------------------------------------------- */
607
608 /**
609 * Computes a fast dot product using standard C
610 */
bbs_dotProduct_stdc(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)611 int32 bbs_dotProduct_stdc( const int16* vec1A, const int16* vec2A, uint32 sizeA )
612 {
613 int32 accuL = 0;
614
615 for( ; sizeA >= 8; sizeA -= 8 )
616 {
617 accuL += ( int32 ) *vec1A * *vec2A;
618 accuL += ( int32 ) *( vec1A + 1 ) * *( vec2A + 1 );
619 accuL += ( int32 ) *( vec1A + 2 ) * *( vec2A + 2 );
620 accuL += ( int32 ) *( vec1A + 3 ) * *( vec2A + 3 );
621
622 accuL += ( int32 ) *( vec1A + 4 ) * *( vec2A + 4 );
623 accuL += ( int32 ) *( vec1A + 5 ) * *( vec2A + 5 );
624 accuL += ( int32 ) *( vec1A + 6 ) * *( vec2A + 6 );
625 accuL += ( int32 ) *( vec1A + 7 ) * *( vec2A + 7 );
626
627 vec1A += 8;
628 vec2A += 8;
629 }
630
631 for( ; sizeA; sizeA-- ) accuL += ( int32 ) *vec1A++ * *vec2A++;
632
633 return accuL;
634 }
635
636 /* ------------------------------------------------------------------------- */
637
bbs_dotProductInt16(const int16 * vec1A,const int16 * vec2A,uint32 sizeA)638 int32 bbs_dotProductInt16( const int16* vec1A, const int16* vec2A, uint32 sizeA )
639 {
640 /* PC */
641 #if ( defined( HW_i586 ) || defined( HW_i686 ) )
642
643 #if defined( HW_SSE2 )
644 uint32 size16L = sizeA & 0xfffffff0;
645 if( size16L )
646 {
647 if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
648 {
649 return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
650 }
651 else
652 {
653 return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
654 }
655 }
656 #elif !defined( WIN64 )
657 /* MMX version (not supported by 64-bit compiler) */
658 uint32 size16L = sizeA & 0xfffffff0;
659 if( size16L )
660 {
661 if( sizeA == size16L )
662 {
663 return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L );
664 }
665 return bbs_dotProduct_intelMMX16( vec1A, vec2A, size16L )
666 + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
667 } /* if( size16L ) */
668 #endif
669
670 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
671
672 /* Playstation 2 */
673 #elif defined( HW_EE ) && defined( epl_LINUX )
674
675 if( ( (uint32)vec1A & 0xF ) == 0 && ( (uint32)vec2A & 0xF ) == 0 )
676 {
677 return bbs_dotProduct_EE( vec1A, vec2A, sizeA );
678 }
679 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
680
681 /* ARM9E */
682 #elif defined( HW_ARMv5TE )
683
684 return bbs_dotProduct_arm9e( vec1A, vec2A, sizeA );
685
686 /* TI C6000 */
687 #elif defined( HW_TMS320C6x )
688
689 return bbs_dotProduct_dsp( vec1A, vec2A, sizeA );
690
691 #elif defined( HW_FR71 )
692
693 uint32 size16L = sizeA & 0xfffffff0;
694 if( size16L )
695 {
696 if( sizeA == size16L )
697 {
698 return bbs_dotProduct_fr71( vec1A, vec2A, size16L );
699 }
700 return bbs_dotProduct_fr71( vec1A, vec2A, size16L )
701 + bbs_dotProduct_stdc( vec1A + size16L, vec2A + size16L, sizeA - size16L );
702 }
703
704 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
705
706 #endif
707
708 return bbs_dotProduct_stdc( vec1A, vec2A, sizeA );
709 }
710
711 /* ------------------------------------------------------------------------- */
712
713 /* table of fermi and slope values (result: 2.30; offset: .12)
714 referenced in b_NeuralNetEm/FastMlpNet.c, not not rename or remove */
715 const uint32 bbs_fermi_tableG[] =
716 {
717 45056, 8, 77824, 13, 131072, 21, 217088, 34,
718 356352, 57, 589824, 94, 974848, 155, 1609728, 255,
719 2654208, 418, 4366336, 688, 7184384, 1126, 11796480, 1834,
720 19308544, 2970, 31473664, 4748, 50921472, 7453, 81448960, 11363,
721 127991808, 16573, 195874816, 22680, 288772096, 28469, 405381120, 32102,
722 536870912, 32101, 668356608, 28469, 784965632, 22680, 877862912, 16573,
723 945745920, 11363, 992288768, 7453, 1022816256, 4748, 1042264064, 2970,
724 1054429184, 1834, 1061941248, 1126, 1066553344, 688, 1069371392, 418,
725 1071083520, 255, 1072128000, 155, 1072762880, 94, 1073147904, 57,
726 1073381376, 34, 1073520640, 21, 1073606656, 13, 1073659904, 8,
727 };
728
bbs_fermi(int32 valA)729 int32 bbs_fermi( int32 valA )
730 {
731 uint32 indexL = ( ( valA >> 15 ) + 20 ) << 1;
732 uint32 offsL = ( ( valA & 0x00007FFF ) + 4 ) >> 3;
733 if( valA < -655360 ) return 1;
734 if( valA >= 655360 ) return 1073741824 - 1; /* ( 1 << 30 ) */
735 return ( bbs_fermi_tableG[ indexL ] + offsL * bbs_fermi_tableG[ indexL + 1 ] );
736 }
737
738 /* ------------------------------------------------------------------------- */
739
bbs_uint32ReduceToNBits(uint32 * argPtrA,int32 * bbpPtrA,uint32 nBitsA)740 void bbs_uint32ReduceToNBits( uint32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
741 {
742 int32 posHighestBitL = bbs_intLog2( *argPtrA ) + 1;
743 int32 shiftL = posHighestBitL - nBitsA;
744 if( shiftL > 0 )
745 {
746 ( *argPtrA ) >>= shiftL;
747 ( *bbpPtrA ) -= shiftL;
748 }
749 }
750
751 /* ------------------------------------------------------------------------- */
752
bbs_int32ReduceToNBits(int32 * argPtrA,int32 * bbpPtrA,uint32 nBitsA)753 void bbs_int32ReduceToNBits( int32* argPtrA, int32* bbpPtrA, uint32 nBitsA )
754 {
755 int32 posHighestBitL = bbs_intLog2( bbs_abs( *argPtrA ) ) + 1;
756 int32 shiftL = posHighestBitL - nBitsA;
757 if( shiftL > 0 )
758 {
759 ( *argPtrA ) >>= shiftL;
760 ( *bbpPtrA ) -= shiftL;
761 }
762 }
763
764 /* ------------------------------------------------------------------------- */
765
bbs_convertU32(uint32 srcA,int32 srcBbpA,int32 dstBbpA)766 uint32 bbs_convertU32( uint32 srcA, int32 srcBbpA, int32 dstBbpA )
767 {
768 if( dstBbpA >= srcBbpA )
769 {
770 uint32 shiftL = dstBbpA - srcBbpA;
771 if( srcA > ( ( uint32 )0xFFFFFFFF >> shiftL ) )
772 {
773 /* overflow */
774 return ( uint32 )0xFFFFFFFF;
775 }
776 else
777 {
778 return srcA << shiftL;
779 }
780 }
781 else
782 {
783 uint32 shiftL = srcBbpA - dstBbpA;
784 uint32 addL = 1L << ( shiftL - 1 );
785 if( srcA + addL < addL )
786 {
787 /* rounding would cause overflow */
788 return srcA >> shiftL;
789 }
790 else
791 {
792 return ( srcA + addL ) >> shiftL;
793 }
794 }
795 }
796
797 /* ------------------------------------------------------------------------- */
798
bbs_convertS32(int32 srcA,int32 srcBbpA,int32 dstBbpA)799 int32 bbs_convertS32( int32 srcA, int32 srcBbpA, int32 dstBbpA )
800 {
801 if( dstBbpA >= srcBbpA )
802 {
803 uint32 shiftL = ( uint32 )( dstBbpA - srcBbpA );
804 if( srcA > ( ( int32 )0x7FFFFFFF >> shiftL ) )
805 {
806 /* overflow */
807 return ( uint32 )0x7FFFFFFF;
808 }
809 else if( srcA < ( ( int32 )0x80000000 >> shiftL ) )
810 {
811 /* underflow */
812 return ( int32 )0x80000000;
813 }
814 else
815 {
816 return srcA << shiftL;
817 }
818 }
819 else
820 {
821 uint32 shiftL = ( uint32 )( srcBbpA - dstBbpA );
822 int32 addL = 1L << ( shiftL - 1 );
823 if( srcA + addL < addL )
824 {
825 /* rounding would cause overflow */
826 return srcA >> shiftL;
827 }
828 else
829 {
830 return ( srcA + addL ) >> shiftL;
831 }
832 }
833 }
834
835 /* ------------------------------------------------------------------------- */
836
bbs_vecPowerFlt16(const int16 * xA,int16 nxA)837 int32 bbs_vecPowerFlt16( const int16 *xA, int16 nxA )
838 {
839 /* #if defined( HW_TMS320C5x )
840 uint32 rL;
841 power( ( int16* ) xA, ( int32* ) &rL, nxA ); // does not work properly in DSPLib version 2.20.02
842 return ( rL >> 1 );
843 #else*/
844 /* needs to be optimized */
845 int32 rL = 0;
846 for( ; nxA--; )
847 {
848 rL += ( int32 ) *xA * *xA;
849 xA++;
850 }
851 return rL;
852 /* #endif */
853 }
854
855 /* ------------------------------------------------------------------------- */
856
bbs_mulU32(uint32 v1A,uint32 v2A,uint32 * manPtrA,int32 * expPtrA)857 void bbs_mulU32( uint32 v1A, uint32 v2A, uint32* manPtrA, int32* expPtrA )
858 {
859 uint32 log1L = bbs_intLog2( v1A );
860 uint32 log2L = bbs_intLog2( v2A );
861
862 if( log1L + log2L < 32 )
863 {
864 *manPtrA = v1A * v2A;
865 *expPtrA = 0;
866 }
867 else
868 {
869 uint32 v1L = v1A;
870 uint32 v2L = v2A;
871 uint32 exp1L = 0;
872 uint32 exp2L = 0;
873 if( log1L > 15 && log2L > 15 )
874 {
875 exp1L = log1L - 15;
876 exp2L = log2L - 15;
877 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
878 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
879 }
880 else if( log1L > 15 )
881 {
882 exp1L = log1L + log2L - 31;
883 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
884 }
885 else
886 {
887 exp2L = log1L + log2L - 31;
888 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
889 }
890
891 *manPtrA = v1L * v2L;
892 *expPtrA = exp1L + exp2L;
893 }
894 }
895
896 /* ------------------------------------------------------------------------- */
897
bbs_mulS32(int32 v1A,int32 v2A,int32 * manPtrA,int32 * expPtrA)898 void bbs_mulS32( int32 v1A, int32 v2A, int32* manPtrA, int32* expPtrA )
899 {
900 uint32 log1L = bbs_intLog2( v1A > 0 ? v1A : -v1A );
901 uint32 log2L = bbs_intLog2( v2A > 0 ? v2A : -v2A );
902
903 if( log1L + log2L < 30 )
904 {
905 *manPtrA = v1A * v2A;
906 *expPtrA = 0;
907 }
908 else
909 {
910 int32 v1L = v1A;
911 int32 v2L = v2A;
912 int32 exp1L = 0;
913 int32 exp2L = 0;
914 if( log1L > 14 && log2L > 14 )
915 {
916 exp1L = log1L - 14;
917 exp2L = log2L - 14;
918 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
919 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
920 }
921 else if( log1L > 14 )
922 {
923 exp1L = log1L + log2L - 29;
924 v1L = ( ( v1L >> ( exp1L - 1 ) ) + 1 ) >> 1;
925 }
926 else
927 {
928 exp2L = log1L + log2L - 29;
929 v2L = ( ( v2L >> ( exp2L - 1 ) ) + 1 ) >> 1;
930 }
931
932 *manPtrA = v1L * v2L;
933 *expPtrA = exp1L + exp2L;
934 }
935 }
936
937 /* ------------------------------------------------------------------------- */
938
bbs_vecSqrNorm32(const int32 * vecA,uint32 sizeA,uint32 * manPtrA,uint32 * expPtrA)939 void bbs_vecSqrNorm32( const int32* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
940 {
941 uint32 sumL = 0;
942 int32 sumExpL = 0;
943
944 uint32 iL;
945 for( iL = 0; iL < sizeA; iL++ )
946 {
947 int32 vL = vecA[ iL ];
948 int32 logL = bbs_intLog2( vL > 0 ? vL : -vL );
949 int32 expL = ( logL > 14 ) ? logL - 14 : 0;
950 uint32 prdL;
951
952 if( expL >= 1 )
953 {
954 vL = ( ( vL >> ( expL - 1 ) ) + 1 ) >> 1;
955 }
956 else
957 {
958 vL = vL >> expL;
959 }
960
961 prdL = vL * vL;
962 expL <<= 1; /* now exponent of product */
963
964 if( sumExpL > expL )
965 {
966 uint32 shrL = sumExpL - expL;
967 prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
968 }
969 else if( expL > sumExpL )
970 {
971 uint32 shrL = expL - sumExpL;
972 sumL = ( ( sumL >> ( shrL - 1 ) ) + 1 ) >> 1;
973 sumExpL += shrL;
974 }
975
976 sumL += prdL;
977
978 if( sumL > 0x80000000 )
979 {
980 sumL = ( sumL + 1 ) >> 1;
981 sumExpL++;
982 }
983 }
984
985 /* make exponent even */
986 if( ( sumExpL & 1 ) != 0 )
987 {
988 sumL = ( sumL + 1 ) >> 1;
989 sumExpL++;
990 }
991
992 if( manPtrA != NULL ) *manPtrA = sumL;
993 if( expPtrA != NULL ) *expPtrA = sumExpL;
994 }
995
996 /* ------------------------------------------------------------------------- */
997
bbs_vecSqrNorm16(const int16 * vecA,uint32 sizeA,uint32 * manPtrA,uint32 * expPtrA)998 void bbs_vecSqrNorm16( const int16* vecA, uint32 sizeA, uint32* manPtrA, uint32* expPtrA )
999 {
1000 uint32 sumL = 0;
1001 int32 sumExpL = 0;
1002
1003 uint32 iL;
1004 for( iL = 0; iL < sizeA; iL++ )
1005 {
1006 int32 vL = vecA[ iL ];
1007 uint32 prdL = vL * vL;
1008
1009 if( sumExpL > 0 )
1010 {
1011 uint32 shrL = sumExpL;
1012 prdL = ( ( prdL >> ( shrL - 1 ) ) + 1 ) >> 1;
1013 }
1014
1015 sumL += prdL;
1016
1017 if( sumL > 0x80000000 )
1018 {
1019 sumL = ( sumL + 1 ) >> 1;
1020 sumExpL++;
1021 }
1022 }
1023
1024 /* make exponent even */
1025 if( ( sumExpL & 1 ) != 0 )
1026 {
1027 sumL = ( sumL + 1 ) >> 1;
1028 sumExpL++;
1029 }
1030
1031 if( manPtrA != NULL ) *manPtrA = sumL;
1032 if( expPtrA != NULL ) *expPtrA = sumExpL;
1033 }
1034
1035 /* ------------------------------------------------------------------------- */
1036
bbs_vecNorm16(const int16 * vecA,uint32 sizeA)1037 uint32 bbs_vecNorm16( const int16* vecA, uint32 sizeA )
1038 {
1039 uint32 manL;
1040 uint32 expL;
1041 bbs_vecSqrNorm16( vecA, sizeA, &manL, &expL );
1042 manL = bbs_sqrt32( manL );
1043 return manL << ( expL >> 1 );
1044 }
1045
1046 /* ------------------------------------------------------------------------- */
1047
bbs_matMultiplyFlt16(const int16 * x1A,int16 row1A,int16 col1A,const int16 * x2A,int16 col2A,int16 * rA)1048 void bbs_matMultiplyFlt16( const int16 *x1A, int16 row1A, int16 col1A, const int16 *x2A, int16 col2A, int16 *rA )
1049 {
1050 #if defined( HW_TMS320C5x )
1051 /* operands need to be in internal memory for mmul*/
1052 if( x1A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE ||
1053 x2A > ( int16* ) bbs_C5X_INTERNAL_MEMORY_SIZE )
1054 {
1055 int16 iL,jL,kL;
1056 int16 *ptr1L, *ptr2L;
1057 int32 sumL;
1058
1059 for( iL = 0; iL < row1A; iL++ )
1060 {
1061 for( jL = 0; jL < col2A; jL++ )
1062 {
1063 ptr1L = ( int16* ) x1A + iL * col1A;
1064 ptr2L = ( int16* ) x2A + jL;
1065 sumL = 0;
1066 for( kL = 0; kL < col1A; kL++ )
1067 {
1068 sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1069 ptr2L += col2A;
1070 }
1071 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1072 }
1073 }
1074 }
1075 else mmul( ( int16* ) x1A, row1A, col1A, ( int16* ) x2A, col1A, col2A, rA );
1076
1077 #elif defined( HW_ARMv4 ) || defined( HW_ARMv5TE )
1078
1079 int32 iL, jL, kL;
1080 int16 *ptr1L, *ptr2L;
1081 int32 sumL;
1082 for( iL = 0; iL < row1A; iL++ )
1083 {
1084 for( jL = 0; jL < col2A; jL++ )
1085 {
1086 ptr1L = ( int16* ) x1A + iL * col1A;
1087 ptr2L = ( int16* ) x2A + jL;
1088 sumL = 0;
1089 for( kL = col1A; kL >= 4; kL -= 4 )
1090 {
1091 sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1092 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1093 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1094 sumL += ( ( int32 ) *ptr1L++ * *( ptr2L += col2A ) );
1095 ptr2L += col2A;
1096 }
1097 for( ; kL > 0; kL-- )
1098 {
1099 sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1100 ptr2L += col2A;
1101 }
1102 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1103 }
1104 }
1105 #else
1106 /* needs to be optimized */
1107 int16 iL,jL,kL;
1108 int16 *ptr1L, *ptr2L;
1109 int32 sumL;
1110
1111 for( iL = 0; iL < row1A; iL++ )
1112 {
1113 for( jL = 0; jL < col2A; jL++ )
1114 {
1115 ptr1L = ( int16* ) x1A + iL * col1A;
1116 ptr2L = ( int16* ) x2A + jL;
1117 sumL = 0;
1118 for( kL = 0; kL < col1A; kL++ )
1119 {
1120 sumL += ( ( int32 ) *ptr1L++ * *ptr2L );
1121 ptr2L += col2A;
1122 }
1123 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1124 }
1125 }
1126 #endif
1127 }
1128
1129 /* ------------------------------------------------------------------------- */
1130
bbs_matMultiplyTranspFlt16(const int16 * x1A,int16 row1A,int16 col1A,const int16 * x2A,int16 col2A,int16 * rA)1131 void bbs_matMultiplyTranspFlt16( const int16 *x1A, int16 row1A, int16 col1A,
1132 const int16 *x2A, int16 col2A, int16 *rA )
1133 {
1134 const int16* ptr1L = x1A;
1135
1136 int32 iL;
1137 for( iL = row1A; iL > 0 ; iL-- )
1138 {
1139 int32 jL;
1140 const int16* ptr2L = x2A;
1141 for( jL = col2A; jL > 0 ; jL-- )
1142 {
1143 int32 kL;
1144 int32 sumL = 0;
1145 for( kL = col1A >> 2; kL > 0; kL-- )
1146 {
1147 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1148 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1149 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1150 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1151 }
1152 for( kL = col1A & 3; kL > 0; kL-- )
1153 {
1154 sumL += ( ( int32 ) *ptr1L++ * *ptr2L++ );
1155 }
1156 *rA++ = ( sumL + ( 1 << 14 ) ) >> 15; /* round result to 1.15 */
1157 ptr1L -= col1A;
1158 }
1159 ptr1L += col1A;
1160 }
1161 }
1162
1163 /* ------------------------------------------------------------------------- */
1164
1165
1166 #ifndef mtrans
bbs_matTrans(int16 * xA,int16 rowA,int16 colA,int16 * rA)1167 uint16 bbs_matTrans( int16 *xA, int16 rowA, int16 colA, int16 *rA )
1168 {
1169 /* needs to be optimized */
1170 int16 iL;
1171 for( iL = colA; iL--; )
1172 {
1173 int16* sL = xA++;
1174 int16 jL;
1175 for( jL = rowA; jL--; )
1176 {
1177 *rA++ = *sL;
1178 sL += colA;
1179 }
1180 }
1181 return 0;
1182 }
1183 #endif
1184
1185 /* ------------------------------------------------------------------------- */
1186 #ifndef atan2_16
bbs_atan2(int16 nomA,int16 denomA)1187 int16 bbs_atan2( int16 nomA, int16 denomA )
1188 {
1189 int16 phL, argL;
1190
1191 if( nomA == denomA ) return 8192;
1192 argL = ( ( int32 ) nomA << 15 ) / denomA;
1193
1194 /* 0.318253*2 x 20857 .15
1195 +0.003314*2 x^2 217 .15
1196 -0.130908*2 x^3 -8580 .15
1197 +0.068542*2 x^4 4491 .15
1198 -0.009159*2 x^5 -600 .15 */
1199
1200 phL = -600;
1201 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 4481;
1202 phL = ( ( ( int32 ) phL * argL ) >> 15 ) - 8580;
1203 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 217;
1204 phL = ( ( ( int32 ) phL * argL ) >> 15 ) + 20857;
1205 phL = ( ( int32 ) phL * argL ) >> 15;
1206
1207 return phL >> 1; /* /2 */
1208 }
1209
1210 /* needs to be optimized */
bbs_vecPhase(int16 * reA,int16 * imA,int16 * phaseA,uint16 sizeA)1211 uint16 bbs_vecPhase( int16 *reA, int16 *imA, int16 *phaseA, uint16 sizeA )
1212 {
1213 for( ; sizeA--; )
1214 {
1215 int16 reL = *reA++;
1216 int16 imL = *imA++;
1217 int16 phL = 0;
1218
1219 if( reL < 0 )
1220 {
1221 reL = -reL;
1222 if( imL < 0 )
1223 {
1224 imL = -imL;
1225 if( reL > imL )
1226 {
1227 phL = -32768 + bbs_atan2( imL, reL );
1228 }
1229 else
1230 {
1231 phL = -16384 - bbs_atan2( reL, imL );
1232 }
1233 }
1234 else
1235 {
1236 if( reL > imL )
1237 {
1238 phL = -( -32768 + bbs_atan2( imL, reL ) );
1239 }
1240 else
1241 {
1242 if( imL == 0 ) phL = 0;
1243 else phL = 16384 + bbs_atan2( reL, imL );
1244 }
1245 }
1246 }
1247 else
1248 {
1249 if( imL < 0 )
1250 {
1251 imL = -imL;
1252 if( reL > imL )
1253 {
1254 phL = -bbs_atan2( imL, reL );
1255 }
1256 else
1257 {
1258 phL = -16384 + bbs_atan2( reL, imL );
1259 }
1260 }
1261 else
1262 {
1263 if( reL > imL )
1264 {
1265 phL = bbs_atan2( imL, reL );
1266 }
1267 else
1268 {
1269 if( imL == 0 ) phL = 0;
1270 else phL = 16384 - bbs_atan2( reL, imL );
1271 }
1272 }
1273 }
1274
1275 *phaseA++ = phL;
1276 }
1277 return 0;
1278 }
1279
1280 #endif
1281
1282 /* ------------------------------------------------------------------------- */
1283