1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cutils/bitops.h> /* for popcount() */
18 #include <audio_utils/primitives.h>
19 #include "private/private.h"
20
/*
 * Pack interleaved stereo accumulator values into 32-bit stereo frames.
 *
 * Every input value is shifted right by 12 bits and passed through
 * clamp16().  The first value of each pair (left) is stored in the low
 * 16 bits of the output word, the second (right) in the high 16 bits.
 *
 * out:   receives 'pairs' packed frames
 * sums:  2 * 'pairs' interleaved input values
 * pairs: number of stereo frames to convert
 */
void ditherAndClamp(int32_t *out, const int32_t *sums, size_t pairs)
{
    for (; pairs > 0; --pairs) {
        const int32_t l = clamp16(*sums++ >> 12);
        const int32_t r = clamp16(*sums++ >> 12);
        /*
         * Multiply instead of shifting: left-shifting a negative value is
         * undefined behavior in C.  NOTE(review): assumes clamp16() keeps
         * its result within the int16_t range so the product fits int32_t.
         */
        *out++ = (r * (1 << 16)) | (l & 0xFFFF);
    }
}
29
/*
 * Convert 'count' 32-bit samples to 16-bit samples: each input is shifted
 * right by 12 bits and the result is passed through clamp16().
 */
void memcpy_to_i16_from_q4_27(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16(src[i] >> 12);
    }
}
36
/*
 * Convert 'count' unsigned 8-bit samples (centered on 0x80) to signed
 * 16-bit samples by removing the 0x80 offset and scaling by 256.
 *
 * Samples are processed from last to first; with that order every input
 * byte is read before an output write can reach it when dst and src start
 * at the same address.
 */
void memcpy_to_i16_from_u8(int16_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        /*
         * (sample - 0x80) lies in [-128, 127].  Multiply by 256 rather than
         * shifting left by 8: left-shifting a negative value is undefined.
         */
        *--dst = (int16_t)((*--src - 0x80) * 256);
    }
}
45
/*
 * Convert 'count' signed 16-bit samples to unsigned 8-bit samples: keep the
 * upper byte of each sample and re-center it on 0x80.
 */
void memcpy_to_u8_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        const int upper = src[i] >> 8;
        dst[i] = upper + 0x80;
    }
}
52
/* Convert 'count' float samples to 8-bit samples using clamp8_from_float(). */
void memcpy_to_u8_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp8_from_float(src[i]);
    }
}
59
/*
 * Convert 'count' signed 32-bit samples to signed 16-bit samples by keeping
 * the upper 16 bits of each input (right shift by 16).
 */
void memcpy_to_i16_from_i32(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = src[i] >> 16;
    }
}
66
/* Convert 'count' float samples to 16-bit samples using clamp16_from_float(). */
void memcpy_to_i16_from_float(int16_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16_from_float(src[i]);
    }
}
73
/* Convert 'count' 32-bit samples to float using float_from_q4_27(). */
void memcpy_to_float_from_q4_27(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_q4_27(src[i]);
    }
}
80
/*
 * Convert 'count' signed 16-bit samples to float using float_from_i16().
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_float_from_i16(float *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        dst[i] = float_from_i16(src[i]);
    }
}
89
/*
 * Convert 'count' unsigned 8-bit samples to float using float_from_u8().
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_float_from_u8(float *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        dst[i] = float_from_u8(src[i]);
    }
}
98
/*
 * Convert 'count' packed 3-byte samples to float using float_from_p24(),
 * which is handed a pointer to the first byte of each sample.
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_float_from_p24(float *dst, const uint8_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        dst[i] = float_from_p24(src + 3 * i);
    }
}
108
/*
 * Convert 'count' packed 3-byte samples to 16-bit samples by combining two
 * of the three bytes of each sample; which two depends on HAVE_BIG_ENDIAN
 * (bytes 0 and 1 when set, bytes 1 and 2 otherwise).
 */
void memcpy_to_i16_from_p24(int16_t *dst, const uint8_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, src += 3) {
#if HAVE_BIG_ENDIAN
        dst[i] = src[1] | (src[0] << 8);
#else
        dst[i] = src[1] | (src[2] << 8);
#endif
    }
}
120
/*
 * Convert 'count' packed 3-byte samples to signed 32-bit samples.  The three
 * bytes fill the upper 24 bits of the output; the low 8 bits are zero.  Byte
 * order within a sample is selected by HAVE_BIG_ENDIAN.
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_i32_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
        /*
         * Assemble in uint32_t: a byte >= 0x80 promoted to int and shifted
         * left by 24 overflows signed int, which is undefined behavior.
         */
#if HAVE_BIG_ENDIAN
        const uint32_t bits = ((uint32_t)src[2] << 8) | ((uint32_t)src[1] << 16) |
                ((uint32_t)src[0] << 24);
#else
        const uint32_t bits = ((uint32_t)src[0] << 8) | ((uint32_t)src[1] << 16) |
                ((uint32_t)src[2] << 24);
#endif
        *--dst = (int32_t)bits;
    }
}
134
/*
 * Convert 'count' signed 16-bit samples to packed 3-byte samples.  The 16
 * input bits become the two most significant bytes of the output sample and
 * the least significant byte is zero; byte order follows HAVE_BIG_ENDIAN.
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_p24_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (size_t i = count; i-- > 0; ) {
        const int16_t sample = src[i];
        uint8_t *packed = dst + 3 * i;
#if HAVE_BIG_ENDIAN
        packed[0] = sample >> 8;
        packed[1] = sample;
        packed[2] = 0;
#else
        packed[0] = 0;
        packed[1] = sample;
        packed[2] = sample >> 8;
#endif
    }
}
153
/*
 * Convert 'count' float samples to packed 3-byte samples.  Each input goes
 * through clamp24_from_float() and the low 24 bits of the result are stored
 * in the byte order selected by HAVE_BIG_ENDIAN.
 */
void memcpy_to_p24_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = clamp24_from_float(src[i]);

#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
170
/*
 * Convert 'count' 32-bit samples to packed 3-byte samples.  Each input goes
 * through clamp24_from_q8_23() and the low 24 bits of the result are stored
 * in the byte order selected by HAVE_BIG_ENDIAN.
 */
void memcpy_to_p24_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = clamp24_from_q8_23(src[i]);

#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
187
/*
 * Convert 'count' signed 32-bit samples to packed 3-byte samples by keeping
 * the upper 24 bits of each input (right shift by 8), stored in the byte
 * order selected by HAVE_BIG_ENDIAN.
 */
void memcpy_to_p24_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i, dst += 3) {
        const int32_t ival = src[i] >> 8;

#if HAVE_BIG_ENDIAN
        dst[0] = ival >> 16;
        dst[1] = ival >> 8;
        dst[2] = ival;
#else
        dst[0] = ival;
        dst[1] = ival >> 8;
        dst[2] = ival >> 16;
#endif
    }
}
204
/*
 * Convert 'count' signed 16-bit samples to 32-bit samples by scaling each
 * input by 256 (equivalent to an 8-bit left shift).
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_q8_23_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        /* Multiply, not shift: left-shifting a negative value is undefined. */
        *--dst = (int32_t)*--src * 256;
    }
}
213
/* Convert 'count' float samples to 32-bit samples using clamp24_from_float(). */
void memcpy_to_q8_23_from_float_with_clamp(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp24_from_float(src[i]);
    }
}
220
/*
 * Convert 'count' packed 3-byte samples to 32-bit samples.  The three bytes
 * form the low 24 bits of the result and the most significant byte is
 * sign-extended; byte order follows HAVE_BIG_ENDIAN.
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_q8_23_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
        /*
         * The sign-carrying byte is scaled by multiplication: left-shifting
         * a negative value is undefined behavior in C.
         */
#if HAVE_BIG_ENDIAN
        *--dst = ((int8_t)src[0] * (1 << 16)) | (src[1] << 8) | src[2];
#else
        *--dst = ((int8_t)src[2] * (1 << 16)) | (src[1] << 8) | src[0];
#endif
    }
}
234
/* Convert 'count' float samples to 32-bit samples using clampq4_27_from_float(). */
void memcpy_to_q4_27_from_float(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clampq4_27_from_float(src[i]);
    }
}
241
/*
 * Convert 'count' 32-bit samples to 16-bit samples: each input is shifted
 * right by 8 bits and the result is passed through clamp16().
 */
void memcpy_to_i16_from_q8_23(int16_t *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16(src[i] >> 8);
    }
}
248
/* Convert 'count' 32-bit samples to float using float_from_q8_23(). */
void memcpy_to_float_from_q8_23(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_q8_23(src[i]);
    }
}
255
/*
 * Convert 'count' unsigned 8-bit samples (centered on 0x80) to signed 32-bit
 * samples by removing the 0x80 offset and scaling by 2^24.
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_i32_from_u8(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        /*
         * (sample - 0x80) lies in [-128, 127], so the product fits int32_t.
         * Multiply, not shift: left-shifting a negative value is undefined.
         */
        *--dst = ((int32_t)(*--src) - 0x80) * (1 << 24);
    }
}
264
/*
 * Convert 'count' signed 16-bit samples to signed 32-bit samples by scaling
 * each input by 65536 (equivalent to a 16-bit left shift).
 *
 * Samples are processed from last to first; with that order every input is
 * read before a (wider) output write can reach it when dst and src start at
 * the same address.
 */
void memcpy_to_i32_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        /* Multiply, not shift: left-shifting a negative value is undefined. */
        *--dst = (int32_t)*--src * (1 << 16);
    }
}
273
/* Convert 'count' float samples to 32-bit samples using clamp32_from_float(). */
void memcpy_to_i32_from_float(int32_t *dst, const float *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp32_from_float(src[i]);
    }
}
280
/* Convert 'count' signed 32-bit samples to float using float_from_i32(). */
void memcpy_to_float_from_i32(float *dst, const int32_t *src, size_t count)
{
    for (size_t i = 0; i < count; ++i) {
        dst[i] = float_from_i32(src[i]);
    }
}
287
memcpy_to_float_from_float_with_clamping(float * dst,const float * src,size_t count,float absMax)288 void memcpy_to_float_from_float_with_clamping(float *dst, const float *src, size_t count,
289 float absMax) {
290 // Note: using NEON intrinsics (vminq_f32, vld1q_f32...) did NOT accelerate
291 // the function when benchmarked. The compiler already vectorize using FMINNM f32x4 & similar.
292 // Note: clamping induce a ~20% overhead compared to memcpy for count in [64, 512]
293 // See primitives_benchmark
294 for (; count > 0; --count) {
295 const float sample = *src++;
296 *dst++ = fmax(-absMax, fmin(absMax, sample));
297 }
298 }
299
/*
 * Downmix 'count' interleaved stereo 16-bit frames to mono.  Each output is
 * the 32-bit sum of the two channel samples shifted right by one bit.
 */
void downmix_to_mono_i16_from_stereo_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (size_t frame = 0; frame < count; ++frame) {
        const int32_t first = src[2 * frame];
        const int32_t second = src[2 * frame + 1];
        dst[frame] = (int16_t)((first + second) >> 1);
    }
}
307
/*
 * Upmix 'count' mono 16-bit samples to interleaved stereo by writing every
 * sample to both channels of its frame.
 *
 * Frames are processed from last to first; with that order every input is
 * read before an output write can reach it when dst and src start at the
 * same address.
 */
void upmix_to_stereo_i16_from_mono_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (size_t frame = count; frame-- > 0; ) {
        const int16_t mono = src[frame];
        dst[2 * frame] = mono;
        dst[2 * frame + 1] = mono;
    }
}
319
/*
 * Downmix 'frames' interleaved stereo float frames to mono.  Each output is
 * the average of the two channel samples.
 */
void downmix_to_mono_float_from_stereo_float(float *dst, const float *src, size_t frames)
{
    for (; frames > 0; --frames) {
        /* 0.5f keeps the arithmetic in float; a bare 0.5 is a double constant. */
        *dst++ = (src[0] + src[1]) * 0.5f;
        src += 2;
    }
}
327
/*
 * Upmix 'frames' mono float samples to interleaved stereo by writing every
 * sample to both channels of its frame.
 *
 * Frames are processed from last to first; with that order every input is
 * read before an output write can reach it when dst and src start at the
 * same address.
 */
void upmix_to_stereo_float_from_mono_float(float *dst, const float *src, size_t frames)
{
    for (size_t frame = frames; frame-- > 0; ) {
        const float mono = src[frame];
        dst[2 * frame] = mono;
        dst[2 * frame + 1] = mono;
    }
}
339
/* Return how many of the 'count' 32-bit samples are non-zero. */
size_t nonZeroMono32(const int32_t *samples, size_t count)
{
    size_t tally = 0;
    for (size_t i = 0; i < count; ++i) {
        if (samples[i] != 0) {
            ++tally;
        }
    }
    return tally;
}
348
/* Return how many of the 'count' 16-bit samples are non-zero. */
size_t nonZeroMono16(const int16_t *samples, size_t count)
{
    size_t tally = 0;
    for (size_t i = 0; i < count; ++i) {
        if (samples[i] != 0) {
            ++tally;
        }
    }
    return tally;
}
357
/*
 * Return how many of the 'count' interleaved stereo 32-bit frames have at
 * least one non-zero channel sample.
 */
size_t nonZeroStereo32(const int32_t *frames, size_t count)
{
    size_t tally = 0;
    for (size_t i = 0; i < count; ++i) {
        const int32_t first = frames[2 * i];
        const int32_t second = frames[2 * i + 1];
        if (first != 0 || second != 0) {
            ++tally;
        }
    }
    return tally;
}
367
/*
 * Return how many of the 'count' interleaved stereo 16-bit frames have at
 * least one non-zero channel sample.
 */
size_t nonZeroStereo16(const int16_t *frames, size_t count)
{
    size_t tally = 0;
    for (size_t i = 0; i < count; ++i) {
        const int16_t first = frames[2 * i];
        const int16_t second = frames[2 * i + 1];
        if (first != 0 || second != 0) {
            ++tally;
        }
    }
    return tally;
}
377
/*
 * C macro to do channel mask copying independent of dst/src sample type.
 *
 * For every frame, the set bits of (dmask | smask) are visited from lowest
 * to highest:
 *   - bit in both masks:  the next source sample is copied to dst
 *   - bit in dmask only:  'zero' is stored to dst
 *   - bit in smask only:  the source sample is skipped
 *
 * Pass only plain variables or constants as arguments: dst, src and count
 * are modified by the macro (count ends at 0) and the other arguments are
 * evaluated many times.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is a single
 * statement, safe inside unbraced if/else.
 */
#define copy_frame_by_mask(dst, dmask, src, smask, count, zero) \
do { \
    uint32_t bit, ormask; \
    for (; (count) > 0; --(count)) { \
        ormask = (dmask) | (smask); \
        while (ormask) { \
            bit = ormask & -ormask; /* get lowest bit */ \
            ormask ^= bit; /* remove lowest bit */ \
            if ((dmask) & bit) { \
                *(dst)++ = (smask) & bit ? *(src)++ : (zero); \
            } else { /* source channel only */ \
                ++(src); \
            } \
        } \
    } \
} while (0)
398
memcpy_by_channel_mask(void * dst,uint32_t dst_mask,const void * src,uint32_t src_mask,size_t sample_size,size_t count)399 void memcpy_by_channel_mask(void *dst, uint32_t dst_mask,
400 const void *src, uint32_t src_mask, size_t sample_size, size_t count)
401 {
402 #if 0
403 /* alternate way of handling memcpy_by_channel_mask by using the idxary */
404 int8_t idxary[32];
405 uint32_t src_channels = popcount(src_mask);
406 uint32_t dst_channels =
407 memcpy_by_index_array_initialization(idxary, 32, dst_mask, src_mask);
408
409 memcpy_by_idxary(dst, dst_channels, src, src_channels, idxary, sample_size, count);
410 #else
411 if (dst_mask == src_mask) {
412 memcpy(dst, src, sample_size * popcount(dst_mask) * count);
413 return;
414 }
415 switch (sample_size) {
416 case 1: {
417 uint8_t *udst = (uint8_t*)dst;
418 const uint8_t *usrc = (const uint8_t*)src;
419
420 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
421 } break;
422 case 2: {
423 uint16_t *udst = (uint16_t*)dst;
424 const uint16_t *usrc = (const uint16_t*)src;
425
426 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
427 } break;
428 case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
429 uint8x3_t *udst = (uint8x3_t*)dst;
430 const uint8x3_t *usrc = (const uint8x3_t*)src;
431 static const uint8x3_t zero; /* tricky - we use this to zero out a sample */
432
433 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, zero);
434 } break;
435 case 4: {
436 uint32_t *udst = (uint32_t*)dst;
437 const uint32_t *usrc = (const uint32_t*)src;
438
439 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
440 } break;
441 default:
442 abort(); /* illegal value */
443 break;
444 }
445 #endif
446 }
447
/*
 * C macro to do copying by index array, to rearrange samples
 * within a frame. This is independent of src/dst sample type.
 *
 * For each of 'count' frames, output channel i receives (src)[idxary[i]],
 * or 'zero' when idxary[i] is negative; src then advances by src_channels.
 *
 * Pass only plain variables or constants as arguments: dst, src and count
 * are modified by the macro (count ends at 0) and the other arguments are
 * evaluated many times.
 *
 * Wrapped in do { } while (0) so an invocation followed by ';' is a single
 * statement, safe inside unbraced if/else.
 */
#define copy_frame_by_idx(dst, dst_channels, src, src_channels, idxary, count, zero) \
do { \
    unsigned i; \
    int index; \
    for (; (count) > 0; --(count)) { \
        for (i = 0; i < (dst_channels); ++i) { \
            index = (idxary)[i]; \
            *(dst)++ = index < 0 ? (zero) : (src)[index]; \
        } \
        (src) += (src_channels); \
    } \
} while (0)
465
memcpy_by_index_array(void * dst,uint32_t dst_channels,const void * src,uint32_t src_channels,const int8_t * idxary,size_t sample_size,size_t count)466 void memcpy_by_index_array(void *dst, uint32_t dst_channels,
467 const void *src, uint32_t src_channels,
468 const int8_t *idxary, size_t sample_size, size_t count)
469 {
470 switch (sample_size) {
471 case 1: {
472 uint8_t *udst = (uint8_t*)dst;
473 const uint8_t *usrc = (const uint8_t*)src;
474
475 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
476 } break;
477 case 2: {
478 uint16_t *udst = (uint16_t*)dst;
479 const uint16_t *usrc = (const uint16_t*)src;
480
481 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
482 } break;
483 case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
484 uint8x3_t *udst = (uint8x3_t*)dst;
485 const uint8x3_t *usrc = (const uint8x3_t*)src;
486 static const uint8x3_t zero;
487
488 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, zero);
489 } break;
490 case 4: {
491 uint32_t *udst = (uint32_t*)dst;
492 const uint32_t *usrc = (const uint32_t*)src;
493
494 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
495 } break;
496 default:
497 abort(); /* illegal value */
498 break;
499 }
500 }
501
/*
 * Build an index array that maps destination channels to source channels by
 * matching bits of the two masks, visiting bits from lowest to highest.
 *
 * For each destination channel the entry is the position of the matching
 * source channel within the source frame, or -1 when the channel is absent
 * from src_mask.  At most idxcount entries are written.
 *
 * Returns the number of entries written plus the number of destination
 * channels that were not reached because idxary was full.
 */
size_t memcpy_by_index_array_initialization(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask)
{
    size_t filled = 0;
    int src_pos = 0;
    uint32_t pending = src_mask | dst_mask;

    /* 'pending &= pending - 1' drops the lowest set bit after each pass. */
    for (; pending != 0 && filled < idxcount; pending &= pending - 1) {
        const uint32_t lowest = pending & -pending;
        const int in_src = (src_mask & lowest) != 0;
        const int in_dst = (dst_mask & lowest) != 0;

        if (in_src && in_dst) {         /* matching channel */
            idxary[filled++] = src_pos++;
        } else if (in_src) {            /* source channel only */
            ++src_pos;
        } else {                        /* destination channel only */
            idxary[filled++] = -1;
        }
    }
    return filled + popcount(pending & dst_mask);
}
522
/*
 * Build an index array in which bit i of src_mask decides whether
 * destination channel i has a source: a set bit yields the next consecutive
 * source position, a clear bit yields -1.
 *
 * Returns popcount(dst_mask) when idxcount is 0; otherwise the number of
 * entries written, which is the smaller of popcount(dst_mask) and idxcount.
 */
size_t memcpy_by_index_array_initialization_src_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    const size_t wanted = popcount(dst_mask);
    if (idxcount == 0) {
        return wanted;
    }
    const size_t limit = wanted > idxcount ? idxcount : wanted;

    size_t next_src = 0;
    size_t slot;
    for (slot = 0; slot < limit; ++slot, src_mask >>= 1) {
        if ((src_mask & 1) != 0) {
            idxary[slot] = next_src++;
        } else {
            idxary[slot] = -1;
        }
    }
    return slot;
}
544
/*
 * Build an index array in which each set bit of dst_mask selects the source
 * channel whose position equals that bit's position; the entry is -1 when
 * the position is not below the number of source channels.
 *
 * Returns popcount(dst_mask) when idxcount is 0; otherwise the number of
 * entries written, which is the smaller of popcount(dst_mask) and idxcount.
 */
size_t memcpy_by_index_array_initialization_dst_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    const size_t src_count = __builtin_popcount(src_mask);
    const size_t wanted = __builtin_popcount(dst_mask);
    if (idxcount == 0) {
        return wanted;
    }
    const size_t limit = wanted > idxcount ? idxcount : wanted;

    size_t filled = 0;
    for (size_t pos = 0; filled < limit; ++pos, dst_mask >>= 1) {
        if ((dst_mask & 1) == 0) {
            continue;   /* no destination channel at this bit position */
        }
        idxary[filled++] = pos < src_count ? (signed)pos : -1;
    }
    return filled;
}
564
/*
 * Add 'count' 16-bit samples from src into dst.  Each sum is formed in
 * 32 bits and passed through clamp16() before being stored back.
 */
void accumulate_i16(int16_t *dst, const int16_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp16((int32_t)dst[i] + src[i]);
    }
}
571
/*
 * Add 'count' unsigned 8-bit samples from src into dst.  Samples are
 * centered on 0x80, so one offset is removed from the sum, and the result
 * is limited to [0, 0xff].
 */
void accumulate_u8(uint8_t *dst, const uint8_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        // Sum of two offset samples minus one offset: range [-0x80, 0x17e].
        const int32_t sum = dst[i] + src[i] - 0x80;
        if (sum < 0) {
            dst[i] = 0;
        } else if (sum > 0xff) {
            dst[i] = 0xff;
        } else {
            dst[i] = sum;
        }
    }
}
581
/*
 * Add 'count' packed 3-byte samples from src into dst.  Each pair is
 * unpacked with memcpy_to_q8_23_from_p24(), summed, and written back with
 * memcpy_to_p24_from_q8_23().
 */
void accumulate_p24(uint8_t *dst, const uint8_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i, dst += 3, src += 3) {
        int32_t acc = 0;
        int32_t addend = 0;

        // Unpack both operands.
        memcpy_to_q8_23_from_p24(&acc, dst, 1);
        memcpy_to_q8_23_from_p24(&addend, src, 1);

        // Sum and store back over the destination sample.
        acc += addend;
        memcpy_to_p24_from_q8_23(dst, &acc, 1);
    }
}
599
/*
 * Add 'count' 32-bit samples from src into dst and pass each sum through
 * clamp24_from_q8_23() before storing it back.
 */
void accumulate_q8_23(int32_t *dst, const int32_t *src, size_t count) {
    for (; count > 0; --count) {
        /*
         * Add in 64 bits: adding two arbitrary int32_t values in int32_t can
         * overflow, which is undefined behavior.  The sum is narrowed with
         * clamp32(), the same way accumulate_i32() narrows its 64-bit sum.
         * NOTE(review): assumes clamp32() returns in-range values unchanged,
         * so sums that already fit int32_t behave exactly as before.
         */
        const int64_t sum = (int64_t)*dst + *src++;
        *dst = clamp24_from_q8_23(clamp32(sum));
        ++dst;
    }
}
606
/*
 * Add 'count' signed 32-bit samples from src into dst.  Each sum is formed
 * in 64 bits and passed through clamp32() before being stored back.
 */
void accumulate_i32(int32_t *dst, const int32_t *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = clamp32((int64_t)dst[i] + src[i]);
    }
}
613
/* Add 'count' float samples from src into dst; no clamping is applied. */
void accumulate_float(float *dst, const float *src, size_t count) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] += src[i];
    }
}
619