/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "platform/audio/VectorMath.h"
#include "wtf/Assertions.h"
#include "wtf/CPU.h"
#include <stdint.h>

#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <algorithm>

namespace blink {

namespace VectorMath {

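// The functions below follow the vDSP naming and argument conventions (explicit
// strides, results returned through pointer arguments), which lets the macOS
// build forward directly to Accelerate.framework while other platforms use the
// SIMD and scalar implementations further down in this file.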
#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced function names, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if CPU(X86)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

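// Portable implementations. The SIMD paths below (SSE2 on x86/x86_64, NEON
// where available) require unit strides and process frames in groups of four;
// strided access and leftover frames always go through the scalar loops.

// vsma: destP[k * destStride] += *scale * sourceP[k * sourceStride].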
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        // Multiply-add loop, parameterized on the (un)aligned load/store used for destP.
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

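// vsmul: destP[k * destStride] = *scale * sourceP[k * sourceStride].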
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now sourceP is aligned; apply SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

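// vadd: destP[k * destStride] = source1P[k * sourceStride1] + source2P[k * sourceStride2].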
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now source1P is aligned; apply SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // Both source2 and dest are aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 is aligned but dest is not.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 is not aligned but dest is.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest is aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

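// vmul: destP[k * destStride] = source1P[k * sourceStride1] * source2P[k * sourceStride2].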
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now source1P is aligned; apply SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        // Multiply loop, parameterized on the (un)aligned load/store used for source2P and destP.
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

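// zvmul: complex multiplication of split-complex (separate real/imaginary) arrays:
//   realDestP[k] = real1P[k] * real2P[k] - imag1P[k] * imag2P[k]
//   imagDestP[k] = real1P[k] * imag2P[k] + imag1P[k] * real2P[k]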
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#if CPU(X86) || CPU(X86_64)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute the results before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

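// vsvesq: writes the sum of the squared elements of sourceP to *sumP.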
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Sum the four partial results from the SSE lanes.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

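// vmaxmgv: writes the maximum magnitude (absolute value) found in sourceP to *maxP.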
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now sourceP is aligned; use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Take the absolute value by ANDing the source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the four SSE lanes.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

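// vclip: clamps each element to the range [*lowThresholdP, *highThresholdP].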
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(MACOSX)

} // namespace VectorMath

} // namespace blink

#endif // ENABLE(WEB_AUDIO)