I'm using the Web Audio API to create a simple spectrum analyzer using the computer microphone as the input signal. The basic functionality of my current implementation works fine, using the default sampling rate (usually 48KHz, but could be 44.1KHz depending on the browser).
For some applications, I would like to use a lower sampling rate (~8KHz) for the FFT.
It looks like the Web Audio API is adding support to customize the sample rate, currently only available on FireFox (https://developer.mozilla.org/en-US/docs/Web/API/AudioContextOptions/sampleRate).
Adding sample rate to the context constructor:
// Create the AudioContext object named 'audioCtx', requesting an 8 kHz sample rate.
// The constructors are looked up on `window`: a bare `AudioContext` identifier throws a
// ReferenceError when only the prefixed `webkitAudioContext` exists, so the `||`
// fallback would never be reached.
var audioCtx = new (window.AudioContext || window.webkitAudioContext)({sampleRate: 8000});
// Log the rate the context actually runs at.
console.log(audioCtx.sampleRate);
The console outputs '8000' (in FireFox), so it appears to be working up to this point.
The microphone is turned on by the user using a pull-down menu. This is the function servicing that pull-down:
// MediaStreamAudioSourceNode for the microphone; undefined until the user enables it.
var microphone;

/**
 * Handler for the "micOffOn" pull-down.
 *
 * "on": requests microphone access, wraps the stream in a source node on `audioCtx`
 * and connects it to `analyserNode`. Any other value: disconnects the current source.
 */
function getMicInputState()
{
  let selectedValue = document.getElementById("micOffOn").value;
  if (selectedValue === "on") {
    navigator.mediaDevices.getUserMedia({audio: true})
      .then(stream => {
        // Drop a previous source so repeated "on" selections do not stack connections.
        if (microphone) {
          microphone.disconnect();
        }
        microphone = audioCtx.createMediaStreamSource(stream);
        microphone.connect(analyserNode);
      })
      .catch(err => {
        // This also catches failures of createMediaStreamSource()/connect(), so keep
        // the real reason visible in the console instead of discarding it.
        console.error(err);
        alert("Microphone is required.");
      });
  } else if (microphone) {
    // Selecting "off" before the microphone was ever enabled is a no-op.
    microphone.disconnect();
  }
}
In FireFox, using the pulldown to activate the microphone displays a popup requesting access to the microphone (as normally expected). After clicking to allow the microphone, the console displays:
"Connecting AudioNodes from AudioContexts with different sample-rate is currently not supported".
The display of the spectrum analyzer remains blank.
Any ideas how to overcome this error? If we can get past this, any guidance on how to specify sampleRate when the user's soundcard sampling rate is unknown?
One approach to overcome this is passing audio packets captured from microphone to analyzer node via a script processor node that re-samples the audio packets passing through it.
Brief overview of script processor node
Every script processor node has an input buffer and an output buffer.
When audio enters the input buffer, the script processor node fires
onaudioprocess event.
Whatever is placed in the output buffer of script processor node becomes its output.
For detailed specs, refer : Script processor node
Here is the pseudo-code:
Create live media source, script processor node and analyzer node
Connect live media source to analyzer node via script processor
node
Whenever an audio packet enters the script processor
node, onaudioprocess event is fired
When onaudioprocess event is fired :
4.1) Extract audio data from input buffer
4.2) Re-sample audio data
4.3) Place re-sampled data in output buffer
The following code snippet implements the above pseudocode:
// Microphone source node; assigned once getUserMedia() resolves in getMicInputState().
var microphone;
// *** 1) create a script processor node on the shared `audioCtx`
// Arguments: buffer size of 4096 frames, 1 input channel, 1 output channel.
var scriptProcessorNode = audioCtx.createScriptProcessor(4096, 1, 1);
/**
 * Handler for the "micOffOn" pull-down.
 *
 * "on": requests microphone access and routes it
 * microphone -> scriptProcessorNode -> analyserNode.
 * Any other value: disconnects the current microphone source, if there is one.
 */
function getMicInputState()
{
  let selectedValue = document.getElementById("micOffOn").value;
  if (selectedValue === "on") {
    navigator.mediaDevices.getUserMedia({audio: true})
      .then(stream => {
        // Drop a previous source so repeated "on" selections do not stack connections.
        if (microphone) {
          microphone.disconnect();
        }
        microphone = audioCtx.createMediaStreamSource(stream);
        // *** 2) connect live media source to analyserNode via script processor node
        microphone.connect(scriptProcessorNode);
        scriptProcessorNode.connect(analyserNode);
      })
      .catch(err => {
        // Keep the underlying reason visible; the alert alone hides graph errors.
        console.error(err);
        alert("Microphone is required.");
      });
  } else if (microphone) {
    // Selecting "off" before the microphone was ever enabled is a no-op.
    microphone.disconnect();
  }
}
// *** 3) Whenever an audio packet passes through script processor node, resample it
// *** 3) Whenever an audio packet passes through script processor node, resample it
scriptProcessorNode.onaudioprocess = function(event){
  var inputBuffer = event.inputBuffer;
  var outputBuffer = event.outputBuffer;
  // Loop-invariant: the rates are the same for every channel.
  var fromSampleRate = audioCtx.sampleRate;
  var toSampleRate = 8000;
  for (var channel = 0; channel < outputBuffer.numberOfChannels; channel++) {
    var inputData = inputBuffer.getChannelData(channel);
    var outputData = outputBuffer.getChannelData(channel);
    // *** 3.1) Resample inputData
    var resampledAudio = downsample(inputData, fromSampleRate, toSampleRate);
    // *** 3.2) make output equal to the resampled audio
    // The resampled block is shorter than the output block. Copy what exists and fill
    // the remainder with silence; indexing past the end of resampledAudio would store
    // NaN in the output.
    var copyLength = Math.min(resampledAudio.length, outputData.length);
    outputData.set(resampledAudio.subarray(0, copyLength));
    outputData.fill(0, copyLength);
  }
};
/**
 * Downsample a mono PCM block by averaging each group of source samples that maps
 * onto one output sample.
 *
 * @param {Float32Array} buffer - source samples
 * @param {number} fromSampleRate - sample rate of `buffer`, in Hz
 * @param {number} toSampleRate - target sample rate, in Hz (must not exceed fromSampleRate)
 * @returns {Float32Array} new array of about buffer.length * toSampleRate / fromSampleRate samples
 * @throws {RangeError} if a rate is not positive or the target rate is higher than the source rate
 */
function downsample(buffer, fromSampleRate, toSampleRate) {
  if (!(fromSampleRate > 0) || !(toSampleRate > 0)) {
    throw new RangeError("sample rates must be positive numbers");
  }
  if (toSampleRate > fromSampleRate) {
    throw new RangeError("downsample() cannot increase the sample rate");
  }
  // Keep the ratio fractional. Rounding it (44100 / 8000 -> 6 instead of 5.5125)
  // produces too few samples, i.e. a different effective output rate.
  var sampleRateRatio = fromSampleRate / toSampleRate;
  var newLength = Math.round(buffer.length / sampleRateRatio);
  var result = new Float32Array(newLength);
  var offsetResult = 0;
  var offsetBuffer = 0;
  while (offsetResult < result.length) {
    // Window boundaries are rounded per output sample so the error does not accumulate.
    var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
    var accum = 0, count = 0;
    for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
      accum += buffer[i];
      count++;
    }
    // An empty window would otherwise yield 0 / 0 = NaN.
    result[offsetResult] = count > 0 ? accum / count : 0;
    offsetResult++;
    offsetBuffer = nextOffsetBuffer;
  }
  return result;
}
Update - 03 Nov, 2020
Script Processor Node is being deprecated and replaced with AudioWorklets.
The approach to changing the sample rate remains the same.
Downsampling from the constructor and connecting an AnalyserNode is now possible in Chrome and Safari.
So the following code, taken from the corresponding MDN documentation, would work:
// Create the context at 8 kHz (falls back to the prefixed constructor).
const audioContext = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: 8000
});
// Ask for an audio-only stream; the top-level `await` requires a module/async scope.
const mediaStream = await navigator.mediaDevices.getUserMedia({
audio: true,
video: false
});
const mediaStreamSource = audioContext.createMediaStreamSource(mediaStream);
const analyser = audioContext.createAnalyser();
analyser.fftSize = 256;
// frequencyBinCount is read after fftSize is set and sizes the byte array below.
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
// NOTE(review): this read happens before the source is connected; draw() reads again every frame.
analyser.getByteFrequencyData(dataArray);
mediaStreamSource.connect(analyser);
// Build the page elements: a caption and a wrapper holding the canvas.
const title = document.createElement("div");
title.innerText = `Sampling frequency 8kHz:`;
const wrapper = document.createElement("div");
const canvas = document.createElement("canvas");
wrapper.appendChild(canvas);
document.body.appendChild(title);
document.body.appendChild(wrapper);
// 2D drawing context used by draw().
const canvasCtx = canvas.getContext("2d");
/**
 * Paint one frame of the bar spectrum and schedule the next one.
 * Reads the current byte frequency data from `analyser` into `dataArray`, clears the
 * canvas to black, then draws one red-tinted bar per frequency bin.
 */
function draw() {
  requestAnimationFrame(draw);
  analyser.getByteFrequencyData(dataArray);

  // Black background.
  canvasCtx.fillStyle = "rgb(0, 0, 0)";
  canvasCtx.fillRect(0, 0, canvas.width, canvas.height);

  const sliceWidth = canvas.width / bufferLength;
  let left = 0;
  for (let bin = 0; bin < bufferLength; bin += 1) {
    const height = dataArray[bin] / 2;
    // Louder bins get a brighter red component.
    canvasCtx.fillStyle = `rgb(${2 * height + 100},50,50)`;
    canvasCtx.fillRect(left, canvas.height - height / 2, sliceWidth, height);
    // One pixel gap between bars.
    left += sliceWidth + 1;
  }
}
draw();
See here for a demo where both 48kHz and 8kHz sampled signal frequencies are displayed: https://codesandbox.io/s/vibrant-moser-cfex33
Related
Recently I've been working on a voice platform allowing for users to talk to each other under certain conditions. However, it only seems to be smooth when your ping is really low.
Here's the structure currently:
Raw PCM Audio From User -> Web Socket -> Sent to all clients that meet a certain condition
There's nothing particularly special about my WebSocket. It's made in Java and just sends the data received directly from clients to other clients (just temporary).
Issue:
For users that have a decent amount of ping (>100ms) their audio cuts out (choppy) at certain parts and doesn't seem to load fast enough even with the code I have in place. If a video of this will help, let me know!
This is the code I have for recording and playback currently (using AudioWorkletProcessors)
Recording:
buffers = [];
buffersLength = 0;
process(inputs, _, parameters) {
const [input] = inputs;
if (parameters.muted[0] == 1) {
return true;
}
// Create one buffer from this input
const dataLen = input[0].byteLength;
const channels = input.length;
const bufferForSocket = new Uint8Array(dataLen * channels + 9);
bufferForSocket.set(new Uint8Array([channels]), 0);
bufferForSocket.set(numberToArrayBuffer(sampleRate), 1);
bufferForSocket.set(numberToArrayBuffer(input[0].byteLength), 5);
for (let i = 0; i < channels; i++) {
bufferForSocket.set(new Uint8Array(input[i].buffer), 9 + dataLen * i);
}
// Add buffers to a list
this.buffers.push(bufferForSocket);
this.buffersLength += bufferForSocket.byteLength;
// If we have 25 buffers, send them off to websocket
if (this.buffers.length >= 25) {
const combinedBuffer = new Uint8Array(
this.buffersLength + 4 + 4 * this.buffers.length
);
combinedBuffer.set(numberToArrayBuffer(this.buffers.length), 0);
let offset = 4;
for (let i = 0; i < this.buffers.length; i++) {
const buffer = this.buffers[i];
combinedBuffer.set(numberToArrayBuffer(buffer.byteLength), offset);
combinedBuffer.set(buffer, offset + 4);
offset += buffer.byteLength + 4;
}
this.buffers.length = 0;
this.buffersLength = 0;
// This is what sends to WebSocket
this.port.postMessage(combinedBuffer.buffer);
}
return true;
}
Playback:
class extends AudioWorkletProcessor {
buffers = new LinkedList();
timeTillNextBuffer = 0;
process(_, outputs, __) {
const [output] = outputs;
const linkedBuffers = this.buffers.last();
// If we aren't currently playing a buffer and we cannot play a buffer right now, return
if (
linkedBuffers == null ||
(linkedBuffers.buffers.length === 0 && this.timeTillNextBuffer > Date.now())
)
return true;
const buffer = linkedBuffers.buffers.removeLast();
// Current buffer is finished, remove it
if (linkedBuffers.index === linkedBuffers.buffers.length) {
this.buffers.removeLast();
}
if (buffer === null) {
return true;
}
const inputData = buffer.channels;
// put audio to output
for (let channel = 0; channel < outputs.length; channel++) {
const channelData = inputData[channel];
const outputData = output[channel];
for (let i = 0; i < channelData.length; i++) {
outputData[i] = channelData[i];
}
}
return true;
}
static get parameterDescriptors() {
return [];
}
onBuffer(buffer) {
// buffer is ArrayBuffer
const buffersGiven = new DataView(buffer, 0, 4).getUint32(0, true);
let offset = 4;
const buffers = new LinkedList();
const linkedBuffers = { index: 0, buffers };
// Read buffers from WebSocket (created in the snippet above)
for (let i = 0; i < buffersGiven; i++) {
const bufferLength = new DataView(buffer, offset, 4).getUint32(0, true);
const numberOfChannels = new DataView(buffer, offset + 4, 1).getUint8(0, true);
const sampleRate = new DataView(buffer, offset + 5, 4).getUint32(0, true);
const channelLength = new DataView(buffer, offset + 9, 4).getUint32(0, true);
const channels = [];
for (let i = 0; i < numberOfChannels; i++) {
const start = offset + 13 + i * channelLength;
channels[i] = new Float32Array(buffer.slice(start), 0, channelLength / 4);
}
buffers.push({ channelLength, numberOfChannels, sampleRate, channels });
offset += bufferLength + 4;
}
this.buffers.push(linkedBuffers);
// Jitter buffer
this.timeTillNextBuffer = 50 - this.buffers.length * 25 + Date.now();
}
constructor() {
super();
this.port.onmessage = (e) => this.onBuffer(e.data); // Data directly from WebSocket
}
}
I heard the use of Atomics can help because of how the AudioContext plays blank audio when returning. Any tips would be greatly appreciated! Also, if anything is unclear, please let me know!
My code has somewhat of a Jitter buffer system and it doesn't seem to work at all. The audio that a user receives from me (low ping) is clear. However, they (high ping) send me choppy audio. Furthermore, this choppy audio seems to build up and it gets more delayed the more packets I receive.
(question rewritten integrating bits of information from answers, plus making it more concise.)
I use analyser=audioContext.createAnalyser() in order to process audio data, and I'm trying to understand the details better.
I choose an fftSize, say 2048, then I create an array buffer of 2048 floats with Float32Array, and then, in an animation loop
(called 60 times per second on most machines, via window.requestAnimationFrame), I do
analyser.getFloatTimeDomainData(buffer);
which will fill my buffer with 2048 floating point sample data points.
When the handler is called the next time, 1/60 second has passed. To calculate how much that is in units of samples,
we have to divide it by the duration of 1 sample, and get (1/60)/(1/44100) = 735.
So the next handler call takes place (on average) 735 samples later.
So there is overlap between subsequent buffers, like this:
We know from the spec (search for 'render quantum') that everything happens in "chunk sizes" which are multiples of 128.
So (in terms of audio processing), one would expect that the next handler call will usually be either 5*128 = 640 samples later,
or else 6*128 = 768 samples later - those being the multiples of 128 closest to 735 samples = (1/60) second.
Calling this amount "Δ-samples", how do I find out what it is (during each handler call), 640 or 768 or something else?
Reliably, like this:
Consider the 'old buffer' (from previous handler call). If you delete "Δ-samples" many samples at the beginning, copy the remainder, and then append "Δ-samples" many new samples, that should be the current buffer. And indeed, I tried that,
and that is the case. It turns out "Δ-samples" often is 384, 512, 896. It is trivial but time consuming to determine
"Δ-samples" in a loop.
I would like to compute "Δ-samples" without performing that loop.
One would think the following would work:
(audioContext.currentTime - (value of audioContext.currentTime the last time the handler ran))/(duration of 1 sample)
I tried that (see code below where I also "stitch together" the various buffers, trying to reconstruct the original buffer),
and - surprise - it works about 99.9% of the time in Chrome, and about 95% of the time in Firefox.
I also tried audioContext.getOutputTimestamp().contextTime, which does not work in Chrome, and works 9?% in Firefox.
Is there any way to find "Δ-samples" (without looking at the buffers), which works reliably?
Second question, the "reconstructed" buffer (all the buffers from callbacks stitched together), and the original sound buffer
are not exactly the same; there is some (small, but noticeable, more than the usual "rounding error") difference, and it is bigger in Firefox.
Where does that come from? - You know, as I understand the spec, those should be the same.
// URL of the audio clip fetched by the load handler.
var soundFile = 'https://mathheadinclouds.github.io/audio/sounds/la.mp3';
// Web Audio objects; created by initAudioContext(), togglePlayback() and createAnalyser().
var audioContext = null;
var isPlaying = false;
var sourceNode = null;
var analyser = null;
// Decoded AudioBuffer of the clip, set in initAudioContext().
var theBuffer = null;
// Signal stitched together from the analyser snapshots; allocated in myAudioCallback().
var reconstructedBuffer = null;
// XMLHttpRequest that downloads soundFile.
var soundRequest = null;
// Index of the current animation-frame callback; -1 means none has run yet.
var loopCounter = -1;
var FFT_SIZE = 2048;
// Handle of the pending requestAnimationFrame call.
var rafID = null;
// One Float32Array snapshot per callback.
var buffers = [];
// audioContext.currentTime of each callback, converted to samples.
var timesSamples = [];
// Differences between consecutive entries of timesSamples.
var timeSampleDiffs = [];
// Sample position of the first callback; used as an offset by checkResult().
var leadingWaste = 0;
// On page load, download the sound file. The 'go' button is created only after the
// download has finished, so a click can never reach decodeAudioData() with an empty
// response.
window.addEventListener('load', function() {
  soundRequest = new XMLHttpRequest();
  soundRequest.open("GET", soundFile, true);
  soundRequest.responseType = "arraybuffer";
  soundRequest.onload = function() {
    if (!soundRequest.response) {
      console.error('sound file could not be loaded', soundRequest.status);
      return;
    }
    var btn = document.createElement('button');
    btn.textContent = 'go';
    btn.addEventListener('click', function(evt) {
      goButtonClick(this, evt)
    });
    document.body.appendChild(btn);
  };
  soundRequest.onerror = function() {
    console.error('sound file could not be loaded');
  };
  soundRequest.send();
});
/**
 * Click handler of the 'go' button: starts audio initialisation (which begins
 * playback once decoding is done) and then removes the button from the page.
 */
function goButtonClick(elt, evt) {
  initAudioContext(togglePlayback);
  const { parentElement: host } = elt;
  host.removeChild(elt);
}
/**
 * Creates the global AudioContext and decodes the downloaded sound file.
 *
 * @param {Function} callback - invoked with no arguments after `theBuffer` has been
 *   set; not invoked when decoding fails.
 */
function initAudioContext(callback) {
  audioContext = new AudioContext();
  audioContext.decodeAudioData(
    soundRequest.response,
    function(buffer) {
      theBuffer = buffer;
      callback();
    },
    function(err) {
      // Without an error callback a decode failure goes unnoticed and nothing plays.
      console.error('decodeAudioData failed', err);
    }
  );
}
/** Creates the global analyser node on `audioContext` and sets its FFT size to FFT_SIZE. */
function createAnalyser() {
  const node = audioContext.createAnalyser();
  analyser = node;
  node.fftSize = FFT_SIZE;
}
/**
 * Wires source -> analyser -> destination, starts playback and kicks off the
 * animation-frame sampling loop. When playback ends, the loop is cancelled, the
 * global state is reset and statistics about the recorded frame gaps are logged.
 */
function startWithSourceNode() {
  sourceNode.connect(analyser);
  analyser.connect(audioContext.destination);
  sourceNode.start(0);
  isPlaying = true;

  sourceNode.addEventListener('ended', function onEnded(evt) {
    // Reset playback state and stop sampling.
    sourceNode = null;
    analyser = null;
    isPlaying = false;
    loopCounter = -1;
    window.cancelAnimationFrame(rafID);

    console.log('buffer length', theBuffer.length);
    console.log('reconstructedBuffer length', reconstructedBuffer.length);
    console.log('audio callback called counter', buffers.length);
    console.log('root mean square error', Math.sqrt(checkResult() / theBuffer.length));
    console.log('lengths of time between requestAnimationFrame callbacks, measured in audio samples:');
    console.log(timeSampleDiffs);

    // Histogram of the gap sizes seen most often, plus everything above/below them.
    const countWhere = (predicate) => timeSampleDiffs.filter(predicate).length;
    const stepCounts = [384, 512, 640, 768, 896].map(
      (step) => countWhere((val) => val === step)
    );
    console.log(
      ...stepCounts,
      '*',
      countWhere((val) => val > 896),
      countWhere((val) => val < 384)
    );
    // Total number of gaps that hit one of the five common sizes.
    console.log(stepCounts.reduce((total, count) => total + count, 0));
  });

  myAudioCallback();
}
/**
 * Builds a fresh buffer source for the decoded clip, creates the analyser and
 * starts playback together with the sampling loop.
 */
function togglePlayback() {
  const node = audioContext.createBufferSource();
  sourceNode = node;
  node.buffer = theBuffer;
  createAnalyser();
  startWithSourceNode();
}
/**
 * requestAnimationFrame callback: takes one time-domain snapshot from the analyser
 * and merges it into `reconstructedBuffer`, positioned by audioContext.currentTime.
 *
 * On the first call the buffer is allocated and the whole snapshot is copied. On
 * later calls the part expected to overlap the previous snapshot is compared against
 * what is already stored, and only the new tail is written.
 *
 * @param {number} [time] - timestamp supplied by requestAnimationFrame (unused)
 */
function myAudioCallback(time) {
  ++loopCounter;
  if (!buffers[loopCounter]) {
    buffers[loopCounter] = new Float32Array(FFT_SIZE);
  }
  var buf = buffers[loopCounter];
  analyser.getFloatTimeDomainData(buf);
  var now = audioContext.currentTime;
  var nowSamp = Math.round(audioContext.sampleRate * now);
  timesSamples[loopCounter] = nowSamp;
  var j, sampDiff;
  if (loopCounter === 0) {
    console.log('start sample: ', nowSamp);
    reconstructedBuffer = new Float32Array(theBuffer.length + FFT_SIZE + nowSamp);
    leadingWaste = nowSamp;
    for (j = 0; j < FFT_SIZE; j++) {
      reconstructedBuffer[nowSamp + j] = buf[j];
    }
  } else {
    sampDiff = nowSamp - timesSamples[loopCounter - 1];
    timeSampleDiffs.push(sampDiff);
    // Clamp to [0, FFT_SIZE]. After a long stall sampDiff exceeds FFT_SIZE, and a
    // negative overlap would make the copy loop read buf[-n] (undefined) and store
    // NaN in the gap between the two snapshots.
    var expectedEqual = Math.min(FFT_SIZE, Math.max(0, FFT_SIZE - sampDiff));
    for (j = 0; j < expectedEqual; j++) {
      if (reconstructedBuffer[nowSamp + j] !== buf[j]) {
        console.error('unexpected error', loopCounter, j);
      }
    }
    for (j = expectedEqual; j < FFT_SIZE; j++) {
      reconstructedBuffer[nowSamp + j] = buf[j];
    }
  }
  rafID = window.requestAnimationFrame(myAudioCallback);
}
/**
 * Compares the decoded clip against the signal rebuilt from the analyser snapshots.
 *
 * All channels of `theBuffer` are averaged into one mono signal, so mono files work
 * too instead of failing on getChannelData(1).
 * NOTE(review): the average matches the original stereo formula; for more than two
 * channels the analyser's own down-mix may differ — confirm before relying on it.
 *
 * @returns {number} sum of squared differences over theBuffer.length samples
 */
function checkResult() {
  var channelCount = theBuffer.numberOfChannels;
  var channels = [];
  var c;
  for (c = 0; c < channelCount; c++) {
    channels.push(theBuffer.getChannelData(c));
  }
  var sum = 0;
  // Offset of the clip inside reconstructedBuffer.
  var idxDelta = leadingWaste + FFT_SIZE;
  for (var i = 0; i < theBuffer.length; i++) {
    var samp = 0;
    for (c = 0; c < channelCount; c++) {
      samp += channels[c][i];
    }
    samp /= channelCount;
    var diff = samp - reconstructedBuffer[i + idxDelta];
    sum += diff * diff;
  }
  return sum;
}
In the above snippet, I do the following. I load a 1-second mp3 audio file from my github.io page with XMLHttpRequest (I sing 'la' for 1 second). After it has loaded, a button is shown, saying 'go', and after pressing that, the audio is played back by putting it into a bufferSource node and then calling .start on that. The bufferSource is then fed to our analyser, et cetera.
related question
I also have the snippet code on my github.io page - makes reading the console easier.
I think the AnalyserNode is not what you want in this situation. You want to grab the data and keep it synchronized with raf. Use a ScriptProcessorNode or AudioWorkletNode to grab the data. Then you'll get all the data as it comes. No problems with overlap, or missing data or anything.
Note also that the clocks for raf and audio may be different and hence things may drift over time. You'll have to compensate for that yourself if you need to.
Unfortunately there is no way to find out the exact point in time at which the data returned by an AnalyserNode was captured. But you might be on the right track with your current approach.
All the values returned by the AnalyserNode are based on the "current-time-domain-data". This is basically the internal buffer of the AnalyserNode at a certain point in time. Since the Web Audio API has a fixed render quantum of 128 samples I would expect this buffer to evolve in steps of 128 samples as well. But currentTime usually evolves in steps of 128 samples already.
Furthermore the AnalyserNode has a smoothingTimeConstant property. It is responsible for "blurring" the returned values. The default value is 0.8. For your use case you probably want to set this to 0.
EDIT: As Raymond Toy pointed out in the comments the smoothingtimeconstant only has an effect on the frequency data. Since the question is about getFloatTimeDomainData() it will have no effect on the returned values.
I hope this helps but I think it would be easier to get all the samples of your audio signal by using an AudioWorklet. It would definitely be more reliable.
I'm not really following your math, so I can't tell exactly what you had wrong, but you seem to look at this in a too complicated manner.
The fftSize doesn't really matter here, what you want to calculate is how many samples have been passed since the last frame.
To calculate this, you just need to
Measure the time elapsed since the last animation frame.
Divide this time by the duration of a single sample-frame.
The duration of a single sample-frame is simply 1 / context.sampleRate.
So really all you need is (currentTime - previousTime) / (1 / sampleRate), i.e. (currentTime - previousTime) * sampleRate, and you'll find the index in the last frame where the data starts being repeated in the new one.
And only then, if you want the index in the new frame you'd subtract this index from the fftSize.
Now for why you sometimes have gaps, it's because AudioContext.prototype.currentTime returns the timestamp of the beginning of the next block to be passed to the graph.
The one we want here is AudioContext.prototype.getOutputTimestamp().contextTime, which represents the timestamp of now, on the same time base as currentTime (i.e. the creation of the context).
// Runs an otherwise empty requestAnimationFrame loop.
// NOTE(review): presumably this keeps frames ticking so the measurements below see a steady cadence — confirm.
(function loop(){requestAnimationFrame(loop);})();

// Demo: on click, take two analyser snapshots in consecutive animation frames and
// compare the index predicted from getOutputTimestamp() with the index found by
// searching the data.
(async()=>{
  const ctx = new AudioContext();
  const buf = await fetch("https://upload.wikimedia.org/wikipedia/en/d/d3/Beach_Boys_-_Good_Vibrations.ogg").then(r=>r.arrayBuffer());
  const aud_buf = await ctx.decodeAudioData(buf);
  const source = ctx.createBufferSource();
  source.buffer = aud_buf;
  source.loop = true;

  const analyser = ctx.createAnalyser();
  const fftSize = analyser.fftSize = 2048;
  source.connect( analyser );
  source.start(0);

  // for debugging we use two different buffers
  const arr1 = new Float32Array( fftSize );
  const arr2 = new Float32Array( fftSize );

  const single_sample_dur = (1 / ctx.sampleRate);
  console.log( 'single sample duration (ms)', single_sample_dur * 1000);

  onclick = e => {
    if( ctx.state === "suspended" ) {
      ctx.resume();
      return console.log( 'starting context, please try again' );
    }
    console.log( '-------------' );
    requestAnimationFrame( () => {
      // first frame
      const time1 = ctx.getOutputTimestamp().contextTime;
      analyser.getFloatTimeDomainData( arr1 );
      requestAnimationFrame( () => {
        // second frame
        const time2 = ctx.getOutputTimestamp().contextTime;
        analyser.getFloatTimeDomainData( arr2 );

        const elapsed_time = time2 - time1;
        console.log( 'elapsed time between two frame (ms)', elapsed_time * 1000 );

        const calculated_index = fftSize - Math.round( elapsed_time / single_sample_dur );
        console.log( 'calculated index of new data', calculated_index );

        // for debugging we can just search for the first index where the data repeats
        // indexOf() returns -1 when the first new sample is not in the old snapshot
        // (no overlap); that case is reported as index 0, as before.
        const found_at = arr1.indexOf( arr2[ 0 ] );
        const real_index = found_at < 0 ? 0 : fftSize - found_at;
        console.log( 'real index', real_index );

        // Compare against the already-normalised value. The former inline
        // `a !== b > c ? 0 : d` parsed as `(a !== (b > c)) ? 0 : d`, which is always
        // 0, so the mismatch was never reported.
        if( calculated_index !== real_index ) {
          console.error( 'different' );
        }
      });
    });
  };
  document.body.classList.add('ready');
})().catch( console.error );
body:not(.ready) pre { display: none; }
<pre>click to record two new frames</pre>
Trying to understand the Web Audio API better. We're using it to create an AudioContext and then sending audio to be transcribed. I want to be able to determine when there is a natural pause in speech or when the user stopped speaking.
Is there some data in onaudioprocess callback that can be accessed to determine pauses/breaks in speech?
// Build the graph: media stream source -> ScriptProcessorNode -> destination.
const context = new AudioContext();
context.onstatechange = () => {};
this.setState({ context });

const processor = context.createScriptProcessor(4096, 1, 1);
const source = context.createMediaStreamSource(stream);
source.connect(processor);
processor.connect(context.destination);

// Called for every 4096-frame block that passes through the processor.
processor.onaudioprocess = (event) => {
  // Do some magic here
};
I tried a solution that is suggested on this post but did not achieve the results I need. Post: HTML Audio recording until silence?
When I parse for silence as the post suggests, I get the same result - either 0 or 128
let context = new AudioContext();
let source = context.createMediaStreamSource(stream);
let processor = context.createScriptProcessor(4096, 1, 1);
source.connect(processor);
processor.connect(context.destination);
/***
 * Create analyser
 *
 **/
let analyser = context.createAnalyser();
analyser.smoothingTimeConstant = 0;
analyser.fftSize = 2048;
let buffLength = analyser.frequencyBinCount;
let arrayFreqDomain = new Uint8Array(buffLength);
let arrayTimeDomain = new Uint8Array(buffLength);
// Feed the analyser from the microphone source, not from the processor. The
// processor's onaudioprocess never writes to event.outputBuffer, so its output is
// silence, which is why every frequency bin read 0 and every time-domain sample 128.
source.connect(analyser);
processor.onaudioprocess = (event) => {
  /**
   *
   * Parse live real-time buffer looking for silence
   *
   **/
  analyser.getByteFrequencyData(arrayFreqDomain);
  analyser.getByteTimeDomainData(arrayTimeDomain);
  // Largest deviation from the 128 mid-point in this block; stays near 0 while the
  // input is quiet.
  let peak = 0;
  for (let i = 0; i < buffLength; i++) {
    // arrayFreqDomain[i] is the magnitude (0-255) of frequency bin i.
    // arrayTimeDomain[i] is waveform sample i (0-255, with 128 meaning zero).
    peak = Math.max(peak, Math.abs(arrayTimeDomain[i] - 128));
  }
  // TODO: compare `peak` against a threshold over several blocks to detect a pause.
};
Looking at the documentation for the getByteFrequencyData method I can see how it is supposed to be giving a different value (in the documentation example it will give a different barHeight), but it isn't working for me. https://developer.mozilla.org/en-US/docs/Web/API/AnalyserNode/getByteFrequencyData#Example
Now I'm developing a software from Electron. It is a music player application.
What I would like to do is extract audio features from incoming song(s). This is not real-time extraction, because I would like to extract the features of an entire song.
I use a library called Meyda and the Web Audio API. I have noticed that my implementation consumes a large amount of RAM (about 2,000 MB).
Here is my implementation:
// Render the song through an OfflineAudioContext, then extract MFCCs window by window.
let offlineCtx = new OfflineAudioContext(
  2,
  // The context length is a whole number of sample-frames.
  Math.ceil(duration * sampleRate),
  sampleRate
)
let source = offlineCtx.createBufferSource()
let buffer = await audioCtx.decodeAudioData(songData.buffer)
source.buffer = buffer
source.connect(offlineCtx.destination)
source.start()

const SLICING_WINDOW_SIZE = 1024
let renderedBuffer = await offlineCtx.startRendering()
// getChannelData() is synchronous, so it needs no await.
let channelData = renderedBuffer.getChannelData(0)
let results = []
// `<=` keeps the final window when the length is an exact multiple of the window size.
for (let i = 0; i + SLICING_WINDOW_SIZE <= channelData.length; i += SLICING_WINDOW_SIZE) {
  // subarray() is a view onto the same memory, while slice() allocated a new
  // 1024-sample copy for every window.
  // NOTE(review): assumes Meyda.extract does not modify its input — confirm.
  const r = Meyda.extract(
    'mfcc',
    channelData.subarray(i, i + SLICING_WINDOW_SIZE)
  )
  results.push(r)
}
Is there a way to reduce RAM consuming? Thanks!
I need to layer looping .wav tracks that ultimately I will need to be able to turn on and off and keep in sync.
First I load the tracks and stopped BufferLoader from turning the loaded arraybuffer into an AudioBuffer (hence the false)
/**
 * Queues the URL of every track descriptor and starts loading them.
 * `load(false)` is the flag the surrounding text describes as keeping BufferLoader
 * from decoding the loaded ArrayBuffers.
 *
 * @param {Array<{url: string}>} data - track descriptors
 * @returns the promise held by `loaderDefered`
 */
function loadTracks(data) {
  Array.from(data).forEach((track) => {
    trackUrls.push(track.url);
  });
  bufferLoader = new BufferLoader(context, trackUrls, finishedLoading);
  bufferLoader.load(false);
  return loaderDefered.promise;
}
When you click a button on screen it calls startStop().
/**
 * Button handler: appends the encoded data of track `index` to `activeBuffer`,
 * decodes the result and plays it.
 *
 * @param {number} index - position of the track in bufferList
 * @param {string} name - unused here
 * @param {boolean} isPlaying - unused here
 */
function startStop(index, name, isPlaying) {
  if(!activeBuffer) {
    activeBuffer = bufferList[index];
  }else{
    activeBuffer = appendBuffer(activeBuffer, bufferList[index]);
  }
  context.decodeAudioData(activeBuffer, function(buffer){
    audioBuffer = buffer;
    play();
  }, function(err) {
    // Surface decode failures instead of silently playing nothing.
    console.error("decodeAudioData failed", err);
  });
}

/** Replaces the current source with a new looping source for `audioBuffer`. */
function play() {
  var scheduledTime = 0.015;
  if (audioSource) {
    try {
      audioSource.stop(scheduledTime);
    } catch (e) {
      // Deliberately ignored, as before: a failure to stop the previous source
      // must not prevent the new one from starting.
    }
  }
  audioSource = context.createBufferSource();
  audioSource.buffer = audioBuffer;
  audioSource.loop = true;
  audioSource.connect(context.destination);
  var currentTime = context.currentTime + 0.010 || 0;
  audioSource.start(scheduledTime - 0.005, currentTime, audioBuffer.duration - currentTime);
  audioSource.playbackRate.value = 1;
}
Most of the code I found on this guys github.
In the demo you can hear he is layering AudioBuffers.
I have tried the same on my hosting.
Disregarding the AngularJS stuff, the Web Audio stuff is happening in service.js at:
/js/angular/service.js
If you open the console and click the buttons you can see the activeBuffer.byteLength (type ArrayBuffer) is incrementing, however even after being decoded by the context.decodeAudioData method it still only plays the first sound you clicked instead of a merged AudioBuffer
I'm not sure I totally understand your scenario - don't you want these to be playing simultaneously? (i.e. bass gets layered on top of the drums).
Your current code is trying to concatenate an additional audio file whenever you hit the button for that file. You can't just concatenate audio files (in their ENCODED form) and then run it through decode - the decodeAudioData method is decoding the first complete sound in the arraybuffer, then stopping (because it's done decoding the sound).
What you should do is change the logic to concatenate the buffer data from the resulting AudioBuffers (see below). Even this logic isn't QUITE what you should do - this is still caching the encoded audio files, and decoding every time you hit the button. Instead, you should cache the decoded audio buffers, and just concatenate it.
/**
 * Button handler: decodes track `index` and appends the decoded audio to the
 * global `audioBuffer`, then restarts playback.
 *
 * @param {number} index - position of the encoded track in bufferList
 * @param {string} name - unused here
 * @param {boolean} isPlaying - unused here
 */
function startStop(index, name, isPlaying) {
  // Note we're decoding just the new sound
  context.decodeAudioData( bufferList[index], function(buffer){
    // We have a decoded buffer - now we need to concatenate it
    // `audioBuffer` must not be overwritten before this check; doing so made the
    // first-sound branch unreachable and concatenated each new sound with itself.
    if(!audioBuffer) {
      audioBuffer = buffer;
    }else{
      var merged = concatenateAudioBuffers(audioBuffer, buffer);
      if (merged) {
        audioBuffer = merged;
      } else {
        // Incompatible buffers: keep what was already playing rather than losing it.
        console.log("could not append the new sound; keeping the current buffer");
      }
    }
    play();
  }, function(err) {
    console.error("decodeAudioData failed", err);
  });
}
/**
 * Returns a new AudioBuffer holding buffer1 followed by buffer2.
 *
 * @param {AudioBuffer} buffer1 - first part
 * @param {AudioBuffer} buffer2 - second part
 * @returns {AudioBuffer|null} the joined buffer, or null (after logging the reason)
 *   when a buffer is missing or the channel counts / sample rates differ
 */
function concatenateAudioBuffers(buffer1, buffer2) {
  const bothPresent = Boolean(buffer1) && Boolean(buffer2);
  if (!bothPresent) {
    console.log("no buffers!");
    return null;
  }

  const channelsDiffer = buffer1.numberOfChannels != buffer2.numberOfChannels;
  if (channelsDiffer) {
    console.log("number of channels is not the same!");
    return null;
  }

  const ratesDiffer = buffer1.sampleRate != buffer2.sampleRate;
  if (ratesDiffer) {
    console.log("sample rates don't match!");
    return null;
  }

  const joined = context.createBuffer(
    buffer1.numberOfChannels,
    buffer1.length + buffer2.length,
    buffer1.sampleRate
  );
  for (let channel = 0; channel < joined.numberOfChannels; channel += 1) {
    const target = joined.getChannelData(channel);
    // First part at the start, second part directly behind it.
    target.set(buffer1.getChannelData(channel));
    target.set(buffer2.getChannelData(channel), buffer1.length);
  }
  return joined;
}
SOLVED:
To get multiple loops of the same duration playing at the same time and keep in sync even when you start and stop them randomly.
First, create all your buffer sources where bufferList is an array of AudioBuffers and the first sound is a sound you are going to read from and overwrite with your other sounds.
/**
 * Creates one looping AudioBufferSourceNode per entry of `bufferList` and appends
 * them, in order, to `bufferSources`.
 */
function createAllBufferSources() {
  for (const decoded of bufferList) {
    const node = context.createBufferSource();
    node.buffer = decoded;
    node.loop = true;
    bufferSources.push(node);
  }
  console.log(bufferSources)
}
Then:
/**
 * Starts the first buffer source and, on every audio-process callback, overwrites
 * that source's buffer with the average of all buffers in `bufferList`.
 *
 * The buffer being overwritten is read as channels 0 and 1, as is every entry of
 * `bufferList`. Each sample is zeroed before the tracks are summed, so if
 * bufferList[0] is that same buffer it contributes 0 to the sum while still counting
 * in the divisor.
 */
function start() {
  var rewrite = bufferSources[0];
  rewrite.connect(context.destination);
  var processNode = context.createScriptProcessor(2048, 2, 2);
  rewrite.connect(processNode);
  processNode.onaudioprocess = function(e) {
    //getting the left and right of the sound we want to overwrite
    var left = rewrite.buffer.getChannelData(0);
    var right = rewrite.buffer.getChannelData(1);
    var overL = [],
      overR = [],
      i, a;
    var trackCount = bufferList.length;
    //storing all the loops channel data
    for (i = 0; i < trackCount; i++) {
      overL[i] = bufferList[i].getChannelData(0);
      overR[i] = bufferList[i].getChannelData(1);
    }
    //looping through the channel data of the sound we are going to overwrite
    var frameCount = left.length;
    for (i = 0; i < frameCount; i++) {
      //making sure its a blank before we start to write
      left[i] = 0;
      right[i] = 0;
      //looping through all the sounds we want to add and assigning the bytes to the old sound, both at the same position
      for (a = 0; a < trackCount; a++) {
        // `|| 0`: a track shorter than the first one contributes silence, not NaN.
        left[i] += overL[a][i] || 0;
        right[i] += overR[a][i] || 0;
      }
      left[i] /= trackCount;
      right[i] /= trackCount;
    }
  };
  processNode.connect(context.destination);
  rewrite.start(0);
}
If you remove a AudioBuffer from bufferList and add it again at any point, it will always be in sync.
EDIT:
Keep in mind that:
-processor node gets garbage collected weirdly.
-This is very taxing, might want to think about using WebWorkers somehow