1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14 * its contributors may be used to endorse or promote products derived
15 * from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "config.h"
30
31 #if ENABLE(WEB_AUDIO)
32
33 #include "platform/audio/SincResampler.h"
34
35 #include "platform/audio/AudioBus.h"
36 #include "wtf/CPU.h"
37 #include "wtf/MathExtras.h"
38
39 #if CPU(X86) || CPU(X86_64)
40 #include <emmintrin.h>
41 #endif
42
43 // Input buffer layout, dividing the total buffer into regions (r0 - r5):
44 //
45 // |----------------|----------------------------------------------------------------|----------------|
46 //
47 // blockSize + kernelSize / 2
48 // <-------------------------------------------------------------------------------->
49 // r0
50 //
51 // kernelSize / 2 kernelSize / 2 kernelSize / 2 kernelSize / 2
52 // <---------------> <---------------> <---------------> <--------------->
53 // r1 r2 r3 r4
54 //
55 // blockSize
56 // <-------------------------------------------------------------->
57 // r5
58
59 // The Algorithm:
60 //
61 // 1) Consume input frames into r0 (r1 is zero-initialized).
62 // 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4.
63 // or we've finished generating all the output frames.
64 // 3) Copy r3 to r1 and r4 to r2.
65 // 4) Consume input frames into r5 (zero-pad if we run out of input).
66 // 5) Goto (2) until all of input is consumed.
67 //
68 // note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
69
70 namespace blink {
71
SincResampler(double scaleFactor,unsigned kernelSize,unsigned numberOfKernelOffsets)72 SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
73 : m_scaleFactor(scaleFactor)
74 , m_kernelSize(kernelSize)
75 , m_numberOfKernelOffsets(numberOfKernelOffsets)
76 , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
77 , m_virtualSourceIndex(0)
78 , m_blockSize(512)
79 , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
80 , m_source(0)
81 , m_sourceFramesAvailable(0)
82 , m_sourceProvider(0)
83 , m_isBufferPrimed(false)
84 {
85 initializeKernel();
86 }
87
initializeKernel()88 void SincResampler::initializeKernel()
89 {
90 // Blackman window parameters.
91 double alpha = 0.16;
92 double a0 = 0.5 * (1.0 - alpha);
93 double a1 = 0.5;
94 double a2 = 0.5 * alpha;
95
96 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
97 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
98
99 // The sinc function is an idealized brick-wall filter, but since we're windowing it the
100 // transition from pass to stop does not happen right away. So we should adjust the
101 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
102 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
103 sincScaleFactor *= 0.9;
104
105 int n = m_kernelSize;
106 int halfSize = n / 2;
107
108 // Generates a set of windowed sinc() kernels.
109 // We generate a range of sub-sample offsets from 0.0 to 1.0.
110 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
111 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
112
113 for (int i = 0; i < n; ++i) {
114 // Compute the sinc() with offset.
115 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
116 double sinc = !s ? 1.0 : std::sin(s) / s;
117 sinc *= sincScaleFactor;
118
119 // Compute Blackman window, matching the offset of the sinc().
120 double x = (i - subsampleOffset) / n;
121 double window = a0 - a1 * std::cos(twoPiDouble * x) + a2 * std::cos(twoPiDouble * 2.0 * x);
122
123 // Window the sinc() function and store at the correct offset.
124 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
125 }
126 }
127 }
128
consumeSource(float * buffer,unsigned numberOfSourceFrames)129 void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
130 {
131 ASSERT(m_sourceProvider);
132 if (!m_sourceProvider)
133 return;
134
135 // Wrap the provided buffer by an AudioBus for use by the source provider.
136 RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
137
138 // FIXME: Find a way to make the following const-correct:
139 bus->setChannelMemory(0, buffer, numberOfSourceFrames);
140
141 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
142 }
143
144 namespace {
145
146 // BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
147
148 class BufferSourceProvider FINAL : public AudioSourceProvider {
149 public:
BufferSourceProvider(const float * source,size_t numberOfSourceFrames)150 BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
151 : m_source(source)
152 , m_sourceFramesAvailable(numberOfSourceFrames)
153 {
154 }
155
156 // Consumes samples from the in-memory buffer.
provideInput(AudioBus * bus,size_t framesToProcess)157 virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
158 {
159 ASSERT(m_source && bus);
160 if (!m_source || !bus)
161 return;
162
163 float* buffer = bus->channel(0)->mutableData();
164
165 // Clamp to number of frames available and zero-pad.
166 size_t framesToCopy = std::min(m_sourceFramesAvailable, framesToProcess);
167 memcpy(buffer, m_source, sizeof(float) * framesToCopy);
168
169 // Zero-pad if necessary.
170 if (framesToCopy < framesToProcess)
171 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
172
173 m_sourceFramesAvailable -= framesToCopy;
174 m_source += framesToCopy;
175 }
176
177 private:
178 const float* m_source;
179 size_t m_sourceFramesAvailable;
180 };
181
182 } // namespace
183
process(const float * source,float * destination,unsigned numberOfSourceFrames)184 void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
185 {
186 // Resample an in-memory buffer using an AudioSourceProvider.
187 BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
188
189 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
190 unsigned remaining = numberOfDestinationFrames;
191
192 while (remaining) {
193 unsigned framesThisTime = std::min(remaining, m_blockSize);
194 process(&sourceProvider, destination, framesThisTime);
195
196 destination += framesThisTime;
197 remaining -= framesThisTime;
198 }
199 }
200
process(AudioSourceProvider * sourceProvider,float * destination,size_t framesToProcess)201 void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
202 {
203 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
204 ASSERT(isGood);
205 if (!isGood)
206 return;
207
208 m_sourceProvider = sourceProvider;
209
210 unsigned numberOfDestinationFrames = framesToProcess;
211
212 // Setup various region pointers in the buffer (see diagram above).
213 float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
214 float* r1 = m_inputBuffer.data();
215 float* r2 = r0;
216 float* r3 = r0 + m_blockSize - m_kernelSize / 2;
217 float* r4 = r0 + m_blockSize;
218 float* r5 = r0 + m_kernelSize / 2;
219
220 // Step (1)
221 // Prime the input buffer at the start of the input stream.
222 if (!m_isBufferPrimed) {
223 consumeSource(r0, m_blockSize + m_kernelSize / 2);
224 m_isBufferPrimed = true;
225 }
226
227 // Step (2)
228
229 while (numberOfDestinationFrames) {
230 while (m_virtualSourceIndex < m_blockSize) {
231 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
232 int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
233 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
234
235 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
236 int offsetIndex = static_cast<int>(virtualOffsetIndex);
237
238 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
239 float* k2 = k1 + m_kernelSize;
240
241 // Initialize input pointer based on quantized m_virtualSourceIndex.
242 float* inputP = r1 + sourceIndexI;
243
244 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
245 float sum1 = 0;
246 float sum2 = 0;
247
248 // Figure out how much to weight each kernel's "convolution".
249 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
250
251 // Generate a single output sample.
252 int n = m_kernelSize;
253
254 #define CONVOLVE_ONE_SAMPLE \
255 input = *inputP++; \
256 sum1 += input * *k1; \
257 sum2 += input * *k2; \
258 ++k1; \
259 ++k2;
260
261 {
262 float input;
263
264 #if CPU(X86) || CPU(X86_64)
265 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately.
266 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
267 CONVOLVE_ONE_SAMPLE
268 n--;
269 }
270
271 // Now the inputP is aligned and start to apply SSE.
272 float* endP = inputP + n - n % 4;
273 __m128 mInput;
274 __m128 mK1;
275 __m128 mK2;
276 __m128 mul1;
277 __m128 mul2;
278
279 __m128 sums1 = _mm_setzero_ps();
280 __m128 sums2 = _mm_setzero_ps();
281 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
282 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
283
284 #define LOAD_DATA(l1, l2) \
285 mInput = _mm_load_ps(inputP); \
286 mK1 = _mm_##l1##_ps(k1); \
287 mK2 = _mm_##l2##_ps(k2);
288
289 #define CONVOLVE_4_SAMPLES \
290 mul1 = _mm_mul_ps(mInput, mK1); \
291 mul2 = _mm_mul_ps(mInput, mK2); \
292 sums1 = _mm_add_ps(sums1, mul1); \
293 sums2 = _mm_add_ps(sums2, mul2); \
294 inputP += 4; \
295 k1 += 4; \
296 k2 += 4;
297
298 if (k1Aligned && k2Aligned) { // both aligned
299 while (inputP < endP) {
300 LOAD_DATA(load, load)
301 CONVOLVE_4_SAMPLES
302 }
303 } else if (!k1Aligned && k2Aligned) { // only k2 aligned
304 while (inputP < endP) {
305 LOAD_DATA(loadu, load)
306 CONVOLVE_4_SAMPLES
307 }
308 } else if (k1Aligned && !k2Aligned) { // only k1 aligned
309 while (inputP < endP) {
310 LOAD_DATA(load, loadu)
311 CONVOLVE_4_SAMPLES
312 }
313 } else { // both non-aligned
314 while (inputP < endP) {
315 LOAD_DATA(loadu, loadu)
316 CONVOLVE_4_SAMPLES
317 }
318 }
319
320 // Summarize the SSE results to sum1 and sum2.
321 float* groupSumP = reinterpret_cast<float*>(&sums1);
322 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
323 groupSumP = reinterpret_cast<float*>(&sums2);
324 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
325
326 n %= 4;
327 while (n) {
328 CONVOLVE_ONE_SAMPLE
329 n--;
330 }
331 #else
332 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
333
334 // Optimize size 32 and size 64 kernels by unrolling the while loop.
335 // A 20 - 30% speed improvement was measured in some cases by using this approach.
336
337 if (n == 32) {
338 CONVOLVE_ONE_SAMPLE // 1
339 CONVOLVE_ONE_SAMPLE // 2
340 CONVOLVE_ONE_SAMPLE // 3
341 CONVOLVE_ONE_SAMPLE // 4
342 CONVOLVE_ONE_SAMPLE // 5
343 CONVOLVE_ONE_SAMPLE // 6
344 CONVOLVE_ONE_SAMPLE // 7
345 CONVOLVE_ONE_SAMPLE // 8
346 CONVOLVE_ONE_SAMPLE // 9
347 CONVOLVE_ONE_SAMPLE // 10
348 CONVOLVE_ONE_SAMPLE // 11
349 CONVOLVE_ONE_SAMPLE // 12
350 CONVOLVE_ONE_SAMPLE // 13
351 CONVOLVE_ONE_SAMPLE // 14
352 CONVOLVE_ONE_SAMPLE // 15
353 CONVOLVE_ONE_SAMPLE // 16
354 CONVOLVE_ONE_SAMPLE // 17
355 CONVOLVE_ONE_SAMPLE // 18
356 CONVOLVE_ONE_SAMPLE // 19
357 CONVOLVE_ONE_SAMPLE // 20
358 CONVOLVE_ONE_SAMPLE // 21
359 CONVOLVE_ONE_SAMPLE // 22
360 CONVOLVE_ONE_SAMPLE // 23
361 CONVOLVE_ONE_SAMPLE // 24
362 CONVOLVE_ONE_SAMPLE // 25
363 CONVOLVE_ONE_SAMPLE // 26
364 CONVOLVE_ONE_SAMPLE // 27
365 CONVOLVE_ONE_SAMPLE // 28
366 CONVOLVE_ONE_SAMPLE // 29
367 CONVOLVE_ONE_SAMPLE // 30
368 CONVOLVE_ONE_SAMPLE // 31
369 CONVOLVE_ONE_SAMPLE // 32
370 } else if (n == 64) {
371 CONVOLVE_ONE_SAMPLE // 1
372 CONVOLVE_ONE_SAMPLE // 2
373 CONVOLVE_ONE_SAMPLE // 3
374 CONVOLVE_ONE_SAMPLE // 4
375 CONVOLVE_ONE_SAMPLE // 5
376 CONVOLVE_ONE_SAMPLE // 6
377 CONVOLVE_ONE_SAMPLE // 7
378 CONVOLVE_ONE_SAMPLE // 8
379 CONVOLVE_ONE_SAMPLE // 9
380 CONVOLVE_ONE_SAMPLE // 10
381 CONVOLVE_ONE_SAMPLE // 11
382 CONVOLVE_ONE_SAMPLE // 12
383 CONVOLVE_ONE_SAMPLE // 13
384 CONVOLVE_ONE_SAMPLE // 14
385 CONVOLVE_ONE_SAMPLE // 15
386 CONVOLVE_ONE_SAMPLE // 16
387 CONVOLVE_ONE_SAMPLE // 17
388 CONVOLVE_ONE_SAMPLE // 18
389 CONVOLVE_ONE_SAMPLE // 19
390 CONVOLVE_ONE_SAMPLE // 20
391 CONVOLVE_ONE_SAMPLE // 21
392 CONVOLVE_ONE_SAMPLE // 22
393 CONVOLVE_ONE_SAMPLE // 23
394 CONVOLVE_ONE_SAMPLE // 24
395 CONVOLVE_ONE_SAMPLE // 25
396 CONVOLVE_ONE_SAMPLE // 26
397 CONVOLVE_ONE_SAMPLE // 27
398 CONVOLVE_ONE_SAMPLE // 28
399 CONVOLVE_ONE_SAMPLE // 29
400 CONVOLVE_ONE_SAMPLE // 30
401 CONVOLVE_ONE_SAMPLE // 31
402 CONVOLVE_ONE_SAMPLE // 32
403 CONVOLVE_ONE_SAMPLE // 33
404 CONVOLVE_ONE_SAMPLE // 34
405 CONVOLVE_ONE_SAMPLE // 35
406 CONVOLVE_ONE_SAMPLE // 36
407 CONVOLVE_ONE_SAMPLE // 37
408 CONVOLVE_ONE_SAMPLE // 38
409 CONVOLVE_ONE_SAMPLE // 39
410 CONVOLVE_ONE_SAMPLE // 40
411 CONVOLVE_ONE_SAMPLE // 41
412 CONVOLVE_ONE_SAMPLE // 42
413 CONVOLVE_ONE_SAMPLE // 43
414 CONVOLVE_ONE_SAMPLE // 44
415 CONVOLVE_ONE_SAMPLE // 45
416 CONVOLVE_ONE_SAMPLE // 46
417 CONVOLVE_ONE_SAMPLE // 47
418 CONVOLVE_ONE_SAMPLE // 48
419 CONVOLVE_ONE_SAMPLE // 49
420 CONVOLVE_ONE_SAMPLE // 50
421 CONVOLVE_ONE_SAMPLE // 51
422 CONVOLVE_ONE_SAMPLE // 52
423 CONVOLVE_ONE_SAMPLE // 53
424 CONVOLVE_ONE_SAMPLE // 54
425 CONVOLVE_ONE_SAMPLE // 55
426 CONVOLVE_ONE_SAMPLE // 56
427 CONVOLVE_ONE_SAMPLE // 57
428 CONVOLVE_ONE_SAMPLE // 58
429 CONVOLVE_ONE_SAMPLE // 59
430 CONVOLVE_ONE_SAMPLE // 60
431 CONVOLVE_ONE_SAMPLE // 61
432 CONVOLVE_ONE_SAMPLE // 62
433 CONVOLVE_ONE_SAMPLE // 63
434 CONVOLVE_ONE_SAMPLE // 64
435 } else {
436 while (n--) {
437 // Non-optimized using actual while loop.
438 CONVOLVE_ONE_SAMPLE
439 }
440 }
441 #endif
442 }
443
444 // Linearly interpolate the two "convolutions".
445 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
446
447 *destination++ = result;
448
449 // Advance the virtual index.
450 m_virtualSourceIndex += m_scaleFactor;
451
452 --numberOfDestinationFrames;
453 if (!numberOfDestinationFrames)
454 return;
455 }
456
457 // Wrap back around to the start.
458 m_virtualSourceIndex -= m_blockSize;
459
460 // Step (3) Copy r3 to r1 and r4 to r2.
461 // This wraps the last input frames back to the start of the buffer.
462 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
463 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
464
465 // Step (4)
466 // Refresh the buffer with more input.
467 consumeSource(r5, m_blockSize);
468 }
469 }
470
471 } // namespace blink
472
473 #endif // ENABLE(WEB_AUDIO)
474