1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14 * its contributors may be used to endorse or promote products derived
15 * from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #if ENABLE(WEB_AUDIO)
32
33 #include "platform/audio/SincResampler.h"
34
35 #include "platform/audio/AudioBus.h"
36 #include "wtf/CPU.h"
37 #include "wtf/MathExtras.h"
38
39 #if CPU(X86) || CPU(X86_64)
40 #include <emmintrin.h>
41 #endif
42
43 using namespace std;
44
// Input buffer layout, dividing the total buffer into regions (r0 - r5):
//
// |----------------|----------------------------------------------------------------|----------------|
//
//                                              blockSize + kernelSize / 2
//                  <--------------------------------------------------------------------------------->
//                                                          r0
//
//  kernelSize / 2    kernelSize / 2                                 kernelSize / 2    kernelSize / 2
// <---------------> <--------------->                              <---------------> <--------------->
//        r1                r2                                             r3                r4
//
//                                                                blockSize
//                                    <--------------------------------------------------------------->
//                                                                   r5
60
61 // The Algorithm:
62 //
63 // 1) Consume input frames into r0 (r1 is zero-initialized).
// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4,
//    or we've finished generating all the output frames.
66 // 3) Copy r3 to r1 and r4 to r2.
67 // 4) Consume input frames into r5 (zero-pad if we run out of input).
68 // 5) Goto (2) until all of input is consumed.
69 //
70 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
71
72 namespace WebCore {
73
// Constructs a resampler and precomputes its table of windowed-sinc kernels.
//
// scaleFactor: amount by which m_virtualSourceIndex advances for every output frame
//     generated by process(). initializeKernel() also lowers the filter cutoff when it
//     is greater than 1.0.
// kernelSize: number of taps in each kernel. process() refuses to run unless it is even
//     and smaller than the block size.
// numberOfKernelOffsets: number of sub-sample steps between offsets 0.0 and 1.0 for
//     which a kernel is generated.
SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
    : m_scaleFactor(scaleFactor)
    , m_kernelSize(kernelSize)
    , m_numberOfKernelOffsets(numberOfKernelOffsets)
    // One kernel for each offset index in [0, numberOfKernelOffsets], stored back to back.
    , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
    , m_virtualSourceIndex(0)
    // Number of input frames pulled from the source on each refill of the input buffer.
    , m_blockSize(512)
    , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
    , m_source(0)
    , m_sourceFramesAvailable(0)
    // Set on every successful call to process(AudioSourceProvider*, ...).
    , m_sourceProvider(0)
    // Becomes true once process() has filled the input buffer for the first time.
    , m_isBufferPrimed(false)
{
    initializeKernel();
}
89
initializeKernel()90 void SincResampler::initializeKernel()
91 {
92 // Blackman window parameters.
93 double alpha = 0.16;
94 double a0 = 0.5 * (1.0 - alpha);
95 double a1 = 0.5;
96 double a2 = 0.5 * alpha;
97
98 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
99 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
100
101 // The sinc function is an idealized brick-wall filter, but since we're windowing it the
102 // transition from pass to stop does not happen right away. So we should adjust the
103 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
104 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
105 sincScaleFactor *= 0.9;
106
107 int n = m_kernelSize;
108 int halfSize = n / 2;
109
110 // Generates a set of windowed sinc() kernels.
111 // We generate a range of sub-sample offsets from 0.0 to 1.0.
112 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
113 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
114
115 for (int i = 0; i < n; ++i) {
116 // Compute the sinc() with offset.
117 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
118 double sinc = !s ? 1.0 : sin(s) / s;
119 sinc *= sincScaleFactor;
120
121 // Compute Blackman window, matching the offset of the sinc().
122 double x = (i - subsampleOffset) / n;
123 double window = a0 - a1 * cos(twoPiDouble * x) + a2 * cos(twoPiDouble * 2.0 * x);
124
125 // Window the sinc() function and store at the correct offset.
126 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
127 }
128 }
129 }
130
consumeSource(float * buffer,unsigned numberOfSourceFrames)131 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
132 {
133 ASSERT(m_sourceProvider);
134 if (!m_sourceProvider)
135 return;
136
137 // Wrap the provided buffer by an AudioBus for use by the source provider.
138 RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
139
140 // FIXME: Find a way to make the following const-correct:
141 bus->setChannelMemory(0, buffer, numberOfSourceFrames);
142
143 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
144 }
145
146 namespace {
147
148 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
149
150 class BufferSourceProvider FINAL : public AudioSourceProvider {
151 public:
BufferSourceProvider(const float * source,size_t numberOfSourceFrames)152 BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
153 : m_source(source)
154 , m_sourceFramesAvailable(numberOfSourceFrames)
155 {
156 }
157
158 // Consumes samples from the in-memory buffer.
provideInput(AudioBus * bus,size_t framesToProcess)159 virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
160 {
161 ASSERT(m_source && bus);
162 if (!m_source || !bus)
163 return;
164
165 float* buffer = bus->channel(0)->mutableData();
166
167 // Clamp to number of frames available and zero-pad.
168 size_t framesToCopy = min(m_sourceFramesAvailable, framesToProcess);
169 memcpy(buffer, m_source, sizeof(float) * framesToCopy);
170
171 // Zero-pad if necessary.
172 if (framesToCopy < framesToProcess)
173 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
174
175 m_sourceFramesAvailable -= framesToCopy;
176 m_source += framesToCopy;
177 }
178
179 private:
180 const float* m_source;
181 size_t m_sourceFramesAvailable;
182 };
183
184 } // namespace
185
process(const float * source,float * destination,unsigned numberOfSourceFrames)186 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
187 {
188 // Resample an in-memory buffer using an AudioSourceProvider.
189 BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
190
191 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
192 unsigned remaining = numberOfDestinationFrames;
193
194 while (remaining) {
195 unsigned framesThisTime = min(remaining, m_blockSize);
196 process(&sourceProvider, destination, framesThisTime);
197
198 destination += framesThisTime;
199 remaining -= framesThisTime;
200 }
201 }
202
process(AudioSourceProvider * sourceProvider,float * destination,size_t framesToProcess)203 void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
204 {
205 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
206 ASSERT(isGood);
207 if (!isGood)
208 return;
209
210 m_sourceProvider = sourceProvider;
211
212 unsigned numberOfDestinationFrames = framesToProcess;
213
214 // Setup various region pointers in the buffer (see diagram above).
215 float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
216 float* r1 = m_inputBuffer.data();
217 float* r2 = r0;
218 float* r3 = r0 + m_blockSize - m_kernelSize / 2;
219 float* r4 = r0 + m_blockSize;
220 float* r5 = r0 + m_kernelSize / 2;
221
222 // Step (1)
223 // Prime the input buffer at the start of the input stream.
224 if (!m_isBufferPrimed) {
225 consumeSource(r0, m_blockSize + m_kernelSize / 2);
226 m_isBufferPrimed = true;
227 }
228
229 // Step (2)
230
231 while (numberOfDestinationFrames) {
232 while (m_virtualSourceIndex < m_blockSize) {
233 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
234 int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
235 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
236
237 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
238 int offsetIndex = static_cast<int>(virtualOffsetIndex);
239
240 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
241 float* k2 = k1 + m_kernelSize;
242
243 // Initialize input pointer based on quantized m_virtualSourceIndex.
244 float* inputP = r1 + sourceIndexI;
245
246 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
247 float sum1 = 0;
248 float sum2 = 0;
249
250 // Figure out how much to weight each kernel's "convolution".
251 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
252
253 // Generate a single output sample.
254 int n = m_kernelSize;
255
256 #define CONVOLVE_ONE_SAMPLE \
257 input = *inputP++; \
258 sum1 += input * *k1; \
259 sum2 += input * *k2; \
260 ++k1; \
261 ++k2;
262
263 {
264 float input;
265
266 #if CPU(X86) || CPU(X86_64)
267 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately.
268 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
269 CONVOLVE_ONE_SAMPLE
270 n--;
271 }
272
273 // Now the inputP is aligned and start to apply SSE.
274 float* endP = inputP + n - n % 4;
275 __m128 mInput;
276 __m128 mK1;
277 __m128 mK2;
278 __m128 mul1;
279 __m128 mul2;
280
281 __m128 sums1 = _mm_setzero_ps();
282 __m128 sums2 = _mm_setzero_ps();
283 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
284 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
285
286 #define LOAD_DATA(l1, l2) \
287 mInput = _mm_load_ps(inputP); \
288 mK1 = _mm_##l1##_ps(k1); \
289 mK2 = _mm_##l2##_ps(k2);
290
291 #define CONVOLVE_4_SAMPLES \
292 mul1 = _mm_mul_ps(mInput, mK1); \
293 mul2 = _mm_mul_ps(mInput, mK2); \
294 sums1 = _mm_add_ps(sums1, mul1); \
295 sums2 = _mm_add_ps(sums2, mul2); \
296 inputP += 4; \
297 k1 += 4; \
298 k2 += 4;
299
300 if (k1Aligned && k2Aligned) { // both aligned
301 while (inputP < endP) {
302 LOAD_DATA(load, load)
303 CONVOLVE_4_SAMPLES
304 }
305 } else if (!k1Aligned && k2Aligned) { // only k2 aligned
306 while (inputP < endP) {
307 LOAD_DATA(loadu, load)
308 CONVOLVE_4_SAMPLES
309 }
310 } else if (k1Aligned && !k2Aligned) { // only k1 aligned
311 while (inputP < endP) {
312 LOAD_DATA(load, loadu)
313 CONVOLVE_4_SAMPLES
314 }
315 } else { // both non-aligned
316 while (inputP < endP) {
317 LOAD_DATA(loadu, loadu)
318 CONVOLVE_4_SAMPLES
319 }
320 }
321
322 // Summarize the SSE results to sum1 and sum2.
323 float* groupSumP = reinterpret_cast<float*>(&sums1);
324 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
325 groupSumP = reinterpret_cast<float*>(&sums2);
326 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
327
328 n %= 4;
329 while (n) {
330 CONVOLVE_ONE_SAMPLE
331 n--;
332 }
333 #else
334 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
335
336 // Optimize size 32 and size 64 kernels by unrolling the while loop.
337 // A 20 - 30% speed improvement was measured in some cases by using this approach.
338
339 if (n == 32) {
340 CONVOLVE_ONE_SAMPLE // 1
341 CONVOLVE_ONE_SAMPLE // 2
342 CONVOLVE_ONE_SAMPLE // 3
343 CONVOLVE_ONE_SAMPLE // 4
344 CONVOLVE_ONE_SAMPLE // 5
345 CONVOLVE_ONE_SAMPLE // 6
346 CONVOLVE_ONE_SAMPLE // 7
347 CONVOLVE_ONE_SAMPLE // 8
348 CONVOLVE_ONE_SAMPLE // 9
349 CONVOLVE_ONE_SAMPLE // 10
350 CONVOLVE_ONE_SAMPLE // 11
351 CONVOLVE_ONE_SAMPLE // 12
352 CONVOLVE_ONE_SAMPLE // 13
353 CONVOLVE_ONE_SAMPLE // 14
354 CONVOLVE_ONE_SAMPLE // 15
355 CONVOLVE_ONE_SAMPLE // 16
356 CONVOLVE_ONE_SAMPLE // 17
357 CONVOLVE_ONE_SAMPLE // 18
358 CONVOLVE_ONE_SAMPLE // 19
359 CONVOLVE_ONE_SAMPLE // 20
360 CONVOLVE_ONE_SAMPLE // 21
361 CONVOLVE_ONE_SAMPLE // 22
362 CONVOLVE_ONE_SAMPLE // 23
363 CONVOLVE_ONE_SAMPLE // 24
364 CONVOLVE_ONE_SAMPLE // 25
365 CONVOLVE_ONE_SAMPLE // 26
366 CONVOLVE_ONE_SAMPLE // 27
367 CONVOLVE_ONE_SAMPLE // 28
368 CONVOLVE_ONE_SAMPLE // 29
369 CONVOLVE_ONE_SAMPLE // 30
370 CONVOLVE_ONE_SAMPLE // 31
371 CONVOLVE_ONE_SAMPLE // 32
372 } else if (n == 64) {
373 CONVOLVE_ONE_SAMPLE // 1
374 CONVOLVE_ONE_SAMPLE // 2
375 CONVOLVE_ONE_SAMPLE // 3
376 CONVOLVE_ONE_SAMPLE // 4
377 CONVOLVE_ONE_SAMPLE // 5
378 CONVOLVE_ONE_SAMPLE // 6
379 CONVOLVE_ONE_SAMPLE // 7
380 CONVOLVE_ONE_SAMPLE // 8
381 CONVOLVE_ONE_SAMPLE // 9
382 CONVOLVE_ONE_SAMPLE // 10
383 CONVOLVE_ONE_SAMPLE // 11
384 CONVOLVE_ONE_SAMPLE // 12
385 CONVOLVE_ONE_SAMPLE // 13
386 CONVOLVE_ONE_SAMPLE // 14
387 CONVOLVE_ONE_SAMPLE // 15
388 CONVOLVE_ONE_SAMPLE // 16
389 CONVOLVE_ONE_SAMPLE // 17
390 CONVOLVE_ONE_SAMPLE // 18
391 CONVOLVE_ONE_SAMPLE // 19
392 CONVOLVE_ONE_SAMPLE // 20
393 CONVOLVE_ONE_SAMPLE // 21
394 CONVOLVE_ONE_SAMPLE // 22
395 CONVOLVE_ONE_SAMPLE // 23
396 CONVOLVE_ONE_SAMPLE // 24
397 CONVOLVE_ONE_SAMPLE // 25
398 CONVOLVE_ONE_SAMPLE // 26
399 CONVOLVE_ONE_SAMPLE // 27
400 CONVOLVE_ONE_SAMPLE // 28
401 CONVOLVE_ONE_SAMPLE // 29
402 CONVOLVE_ONE_SAMPLE // 30
403 CONVOLVE_ONE_SAMPLE // 31
404 CONVOLVE_ONE_SAMPLE // 32
405 CONVOLVE_ONE_SAMPLE // 33
406 CONVOLVE_ONE_SAMPLE // 34
407 CONVOLVE_ONE_SAMPLE // 35
408 CONVOLVE_ONE_SAMPLE // 36
409 CONVOLVE_ONE_SAMPLE // 37
410 CONVOLVE_ONE_SAMPLE // 38
411 CONVOLVE_ONE_SAMPLE // 39
412 CONVOLVE_ONE_SAMPLE // 40
413 CONVOLVE_ONE_SAMPLE // 41
414 CONVOLVE_ONE_SAMPLE // 42
415 CONVOLVE_ONE_SAMPLE // 43
416 CONVOLVE_ONE_SAMPLE // 44
417 CONVOLVE_ONE_SAMPLE // 45
418 CONVOLVE_ONE_SAMPLE // 46
419 CONVOLVE_ONE_SAMPLE // 47
420 CONVOLVE_ONE_SAMPLE // 48
421 CONVOLVE_ONE_SAMPLE // 49
422 CONVOLVE_ONE_SAMPLE // 50
423 CONVOLVE_ONE_SAMPLE // 51
424 CONVOLVE_ONE_SAMPLE // 52
425 CONVOLVE_ONE_SAMPLE // 53
426 CONVOLVE_ONE_SAMPLE // 54
427 CONVOLVE_ONE_SAMPLE // 55
428 CONVOLVE_ONE_SAMPLE // 56
429 CONVOLVE_ONE_SAMPLE // 57
430 CONVOLVE_ONE_SAMPLE // 58
431 CONVOLVE_ONE_SAMPLE // 59
432 CONVOLVE_ONE_SAMPLE // 60
433 CONVOLVE_ONE_SAMPLE // 61
434 CONVOLVE_ONE_SAMPLE // 62
435 CONVOLVE_ONE_SAMPLE // 63
436 CONVOLVE_ONE_SAMPLE // 64
437 } else {
438 while (n--) {
439 // Non-optimized using actual while loop.
440 CONVOLVE_ONE_SAMPLE
441 }
442 }
443 #endif
444 }
445
446 // Linearly interpolate the two "convolutions".
447 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
448
449 *destination++ = result;
450
451 // Advance the virtual index.
452 m_virtualSourceIndex += m_scaleFactor;
453
454 --numberOfDestinationFrames;
455 if (!numberOfDestinationFrames)
456 return;
457 }
458
459 // Wrap back around to the start.
460 m_virtualSourceIndex -= m_blockSize;
461
462 // Step (3) Copy r3 to r1 and r4 to r2.
463 // This wraps the last input frames back to the start of the buffer.
464 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
465 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
466
467 // Step (4)
468 // Refresh the buffer with more input.
469 consumeSource(r5, m_blockSize);
470 }
471 }
472
473 } // namespace WebCore
474
475 #endif // ENABLE(WEB_AUDIO)
476