I have developed a webpage that listens to the microphone, calculates the root means square (the average loudness) and changes the colour of the text areas on the webpage.
I found the audio-rms package, but the example uses an oscillator and I am not sure how to replace it with a microphone stream.
I then found an article on HTML5 Rocks about capturing audio, and I used some of the code to capture the audio for use in realtime.
I already have some code that is supposed to calculate the rms from the stream, but the problem is that the microphone never sends any audio. By using console logs, I have worked out that the code works up line 8, but it does not work by line 11, which happens to be when calling navigator.mediaDevices.getUserMedia
The code I am using is below and you can view the files on GitHub:
+function () {
var errorCallback = function (e) {
console.log('Permission Rejected!', e);
};
var ctx = new AudioContext()
if (navigator.mediaDevices.getUserMedia) {
//works here
navigator.mediaDevices.getUserMedia({audio: true}, function (stream)
{
//Doesn't work here.
// 2048 sample buffer, 1 channel in, 1 channel out
var processor = ctx.createScriptProcessor(2048, 1, 1)
var source
console.log("processor")
source = ctx.createMediaElementSource(stream)
console.log("media element")
source.connect(processor)
source.connect(ctx.destination)
processor.connect(ctx.destination)
stream.play()
console.log("stream play")
// loop through PCM data and calculate average
// volume for a given 2048 sample buffer
processor.onaudioprocess = function (evt) {
var input = evt.inputBuffer.getChannelData(0)
, len = input.length
, total = i = 0
, rms
while (i < len) total += Math.abs(input[i++])
rms = Math.sqrt(total / len)
console.log(rmsLevel)
if (rmsLevel > 65) { document.getElementById("TL").style.backgroundColor = "rgb(255, 0, 0)"; }
else if (rmsLevel > 60 && rmsLevel <= 65) { document.getElementById("L").style.backgroundColor = "rgb(255, 140, 0)"; }
...
}
}, errorCallback);
} else {
alert("Error. :(")
}
}()
function resetColours() {
document.getElementById("TL").style.backgroundColor = "rgb(110, 110, 110)";
document.getElementById("L").style.backgroundColor = "rgb(110, 110, 110)";
...
}
You've got incorrect usage of navigator.MediaDevices.getUserMedia - you wrote it in the old style of navigator.getUserMedia callbacks, not in the Promise-based way of navigator.MediaDevices.gUM. Take a look at https://developer.mozilla.org/en-US/docs/Web/API/MediaDevices/getUserMedia.
Instead of
navigator.mediaDevices.getUserMedia({audio: true}, function (stream) {
...
}, errorcallback );
You should say
navigator.mediaDevices.getUserMedia({audio: true}).then( function (stream) {
...
}).catch(function(err) {
/* handle the error */
});
Related
I've implemented the Google Cloud Speech to Text API using realtime streamed audio for the past few weeks. While initially everything looked really well, I've been testing the product on some more devices lately and have found some real weird irregularities when it comes to some iDevices.
First of all, here are the relevant code pieces:
Frontend (React Component)
constructor(props) {
super(props);
this.audio = props.audio;
this.socket = new SocketClient();
this.bufferSize = 2048;
}
/**
* Initializes the users microphone and the audio stream.
*
* #return {void}
*/
startAudioStream = async () => {
const AudioContext = window.AudioContext || window.webkitAudioContext;
this.audioCtx = new AudioContext();
this.processor = this.audioCtx.createScriptProcessor(this.bufferSize, 1, 1);
this.processor.connect(this.audioCtx.destination);
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
/* Debug through instant playback:
this.audio.srcObject = stream;
this.audio.play();
return; */
this.globalStream = stream;
this.audioCtx.resume();
this.input = this.audioCtx.createMediaStreamSource(stream);
this.input.connect(this.processor);
this.processor.onaudioprocess = (e) => {
this.microphoneProcess(e);
};
this.setState({ streaming: true });
}
/**
* Processes microphone input and passes it to the server via the open socket connection.
*
* #param {AudioProcessingEvent} e
* #return {void}
*/
microphoneProcess = (e) => {
const { speaking, askingForConfirmation, askingForErrorConfirmation } = this.state;
const left = e.inputBuffer.getChannelData(0);
const left16 = Helpers.downsampleBuffer(left, 44100, 16000);
if (speaking === false) {
this.socket.emit('stream', {
audio: left16,
context: askingForConfirmation || askingForErrorConfirmation ? 'zip_code_yes_no' : 'zip_code',
speechContext: askingForConfirmation || askingForErrorConfirmation ? ['ja', 'nein', 'ne', 'nö', 'falsch', 'neu', 'korrektur', 'korrigieren', 'stopp', 'halt', 'neu'] : ['$OPERAND'],
});
}
}
Helpers (DownsampleBuffer)
/**
* Downsamples a given audio buffer from sampleRate to outSampleRate.
* #param {Array} buffer The audio buffer to downsample.
* #param {number} sampleRate The original sample rate.
* #param {number} outSampleRate The new sample rate.
* #return {Array} The downsampled audio buffer.
*/
static downsampleBuffer(buffer, sampleRate, outSampleRate) {
if (outSampleRate === sampleRate) {
return buffer;
}
if (outSampleRate > sampleRate) {
throw new Error('Downsampling rate show be smaller than original sample rate');
}
const sampleRateRatio = sampleRate / outSampleRate;
const newLength = Math.round(buffer.length / sampleRateRatio);
const result = new Int16Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = Math.min(1, accum / count) * 0x7FFF;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result.buffer;
}
Backend (Socket Server)
io.on('connection', (socket) => {
logger.debug('New client connected');
const speechClient = new SpeechService(socket);
socket.on('stream', (data) => {
const audioData = data.audio;
const context = data.context;
const speechContext = data.speechContext;
speechClient.transcribe(audioData, context, speechContext);
});
});
Backend (Speech Client / Transcribe Function where data is sent to GCloud)
async transcribe(data, context, speechContext, isFile = false) {
if (!this.recognizeStream) {
logger.debug('Initiating new Google Cloud Speech client...');
let waitingForMoreData = false;
// Create new stream to the Google Speech client
this.recognizeStream = this.speechClient
.streamingRecognize({
config: {
encoding: 'LINEAR16',
sampleRateHertz: 16000,
languageCode: 'de-DE',
speechContexts: speechContext ? [{ phrases: speechContext }] : undefined,
},
interimResults: false,
singleUtterance: true,
})
.on('error', (error) => {
if (error.code === 11) {
this.recognizeStream.destroy();
this.recognizeStream = null;
return;
}
this.socket.emit('error');
this.recognizeStream.destroy();
this.recognizeStream = null;
logger.error(`Received error from Google Cloud Speech client: ${error.message}`);
})
.on('data', async (gdata) => {
if ((!gdata.results || !gdata.results[0]) && gdata.speechEventType === 'END_OF_SINGLE_UTTERANCE') {
logger.debug('Received END_OF_SINGLE_UTTERANCE - waiting 300ms for more data before restarting stream');
waitingForMoreData = true;
setTimeout(() => {
if (waitingForMoreData === true) {
// User was silent for too long - restart stream
this.recognizeStream.destroy();
this.recognizeStream = null;
}
}, 300);
return;
}
waitingForMoreData = false;
const transcription = gdata.results[0].alternatives[0].transcript;
logger.debug(`Transcription: ${transcription}`);
// Emit transcription and MP3 file of answer
this.socket.emit('transcription', transcription);
const filename = await ttsClient.getAnswerFromTranscription(transcription, 'fairy', context); // TODO-Final: Dynamic character
if (filename !== null) this.socket.emit('speech', `${config.publicScheme}://${config.publicHost}:${config.publicPort}/${filename}`);
// Restart stream
if (this.recognizeStream) this.recognizeStream.destroy();
this.recognizeStream = null;
});
}
// eslint-disable-next-line security/detect-non-literal-fs-filename
if (isFile === true) fs.createReadStream(data).pipe(this.recognizeStream);
else this.recognizeStream.write(data);
}
Now, the behavior varies heavily throughout my tested devices. I've originally developed on an iMac 2017 using Google Chrome as a browser. Works like a charm. Then, tested on an iPhone 11 Pro and iPad Air 4, both on Safari and as a full-screen web app. Again, works like a charm.
Afterwards I've tried with an iPad Pro 12.9" 2017. Suddenly, Google Cloud sometimes doesn't return a transcription at all, some other times it returns stuff which, only using very much fantasy, sounds like the actually spoken text. Same behavior on an iPad 5 and an iPhone 6 Plus.
I don't really know where to go from here. What I've read up on so far at least is that with the iPhone 6s (no idea about iPads unfortunately) the hardware sample rate was changed from 44.1khz to 48khz. So I thought, this might be it, played around with the sample rates everywhere in the code, no success. Also, I've noticed that my iMac with Google Chrome also runs on 44.1khz like the "old" iPads where transcription doesn't work. Likewise, the new iPads run on 48khz - and here everything works fine. So this can't be it.
What I've noticed as well: When I connect some AirPods to the "broken" devices and use them as audio input, everything works again. So this must have something to do with processing of the internal microphone of those devices. I just don't know what exactly.
Could anyone lead me to the right direction? What has changed between these device generations in regards to audio and the microphone?
Update 1: I've now implemented a quick function which writes the streamed PCM data from the frontend to a file in the backend using node-wav. I think, I'm getting closer now - on the devices, where the speech recognition goes nuts, I sound like a chipmunk (extremely high-pitched). I've also noticed that the binary audio data is flowing in way slower than on the devices where everything is working fine. So this probably has to do with sample/bit rate, encoding or something. Unfortunately I'm not an audio expert, so not sure what to do next.
Update 2: After a lot of trial end error, I've found that if I set the sample rate to about 9500 to 10000 in the Google Cloud RecognizeConfig, everything works. When I set this as the sample rate for the node-wav file output, it sounds okay as well. If I reset the "outgoing" sample rate to GCloud to 16000 again and downsample the audio input to about 25000 instead of 16000 in the frontend from 44100 (see "Frontend (React Component)" in the "microphoneProcess" function), it works as well. So there seems to be some kind of ~0.6 factor in sample rate differences. However, I still don't know where this behavior is coming from: Both Chrome on the working iMac and Safari on the "broken" iPads have a audioContext.sampleRate of 44100. Therefore, when I downsample them to 16000 in the code, I'd suppose both should work, whereas only the iMac works. It seems like the iPad is working with a different sample rate internally?
After a ton of trial and error, I've found the problem (and the solution).
It seems like "older" iDevice models - like the 2017 iPad Pro - have some weird peculiarity of automatically adjusting the microphone sample rate to the rate of played audio. Even though the hardware sample rate of those devices is set to 44.1khz, as soon as some audio is played, the rate changes. This can be observed through something like this:
const audioCtx = new webkitAudioContext();
console.log(`Current sample rate: ${audioCtx.sampleRate}`); // 44100
const audio = new Audio();
audio.src = 'some_audio.mp3';
await audio.play();
console.log(`Current sample rate: ${audioCtx.sampleRate}`); // Sample rate of the played audio
In my case I've played some synthesized speech from Google Text-to-Speech before opening the speech transcription socket. Those sound files have a sample rate of 24khz - exactly the sample rate Google Cloud received my audio input in.
The solution therefore was - something I should have done anyways - to downsample everything to 16khz (see my helper function in the question), but not from hard-coded 44.1khz, rather from the current sample rate of the audio context. So I've changed my microphoneProcess() function like this:
const left = e.inputBuffer.getChannelData(0);
const left16 = Helpers.downsampleBuffer(left, this.audioCtx.sampleRate, 16000);
Conclusion: Do not trust Safari with the sample rate on page load. It might change.
UPDATE
I found an example of high pass filter using webAudio here. Implemented it like this in my code.
function modifyGain(audioTrack, gainValue){
var ctx = new AudioContext();
var src = ctx.createMediaStreamTrackSource(audioTrack);
var dst = ctx.createMediaStreamDestination();
// create a filter node
var filterNode = ctx.createBiquadFilter();
filterNode.type = 'highpass';
filterNode.frequency.value = 0;
// cutoff frequency: for highpass, audio is attenuated below this frequency
var gainNode = ctx.createGain();
gainNode.gain.value = 1;
src.connect(filterNode);
//filterNode.connect(dst);
filterNode.connect(gainNode);
gainNode.connect(dst);
//alert(ctx.dst)
return dst.stream.getTracks()[0];
}
try {
webcamStream = await navigator.mediaDevices.getUserMedia(mediaConstraints);
document.getElementById("local_video").srcObject = webcamStream;
} catch(err) {
handleGetUserMediaError(err);
return;
}
// Add the tracks from the stream to the RTCPeerConnection
try {
webcamStream.getTracks().forEach(
function (track){
if(track.kind === 'audio'){
track = modifyGain(track, 0.5) //process only audio tracks
}
myPeerConnection.addTrack(track, webcamStream);
});
showLocalVideoContainer();
} catch(err) {
handleGetUserMediaError(err);
}
Before I can actually test if low sounds are silenced by highpass filter I am facing an issue. Using modifyGain mutes audio completely after few seconds. I tried 0, 1500 etc. It goes silence after few seconds.
Original POST
I am using the below constraints to try to suppress noise.
var mediaConstraints = {
audio: {advanced: [
{
echoCancellation: {exact: true}
},
{
autoGainControl: {exact: true}
},
{
noiseSuppression: {exact: true}
},
{
highpassFilter: {exact: true}
}
]
},
video: {
facingMode: "user",
width: { min: 160, ideal: 320, max: 640 },
height: { min: 120, ideal: 240, max: 480 },
}
};
But I want to silence some higher frequencies too. Even if I slowly move my phone on some surface the mic catches the noise and sends it to other peer. It catches even my breathing sound and send that to other side when I place it near my check(like phone call). I want some control over the frequencies which can allow me to select a threshold below which sounds will be not picked by mic.
I have tried searching but I am not sure what exactly will work for my case and how should I do it. I think following are my choices, but I maybe wrong.
Change SDP( codec params?).
Use webAudio and process the mic input before passing it on to other peer.
Use webAudio and process the received audio from other peer and direct it to output device(speaker).
Any help will be appreciated.
Thanks
You can process the audio from your microphone by piping it through web audio & using the BiquadFilter:
const stream = await navigator.mediaDevices.getUserMedia({audio: true});
const ctx = new AudioContext();
const src = ctx.createMediaStreamSource(stream);
const dst = ctx.createMediaStreamDestination();
const biquad = ctx.createBiquadFilter();
[src, biquad, dst].reduce((a, b) => a && a.connect(b));
audio.srcObject = dst.stream;
biquad.type = "highpass";
biquad.gain = 25;
biquad.frequency.value = 1000;
rangeButton.oninput = () => biquad.frequency.value = rangeButton.value;
Here's a working fiddle.
It goes silent after few seconds.
Could be a garbage collection issue. If you're experiencing this, try assigning your stream to a global JS var.
I'm trying to write a small Audio library for a specific web application, to solve the problem of Web Audio Buffer Sources requiring long load times, I'm trying to switch a HTML5 Audio Source(via MediaElementSourceNode) with a Buffer Source once the Buffer Source is ready to play. with a 20 minute track, it takes Web Audio's Buffer Source roughly 5 seconds to decode and start playing.
using MediaElementSourceNode is required for using the PanNode in Web Audio
First, I thought it was a JS main thread Latency issue that was throwing off the "start Time". I thought I could solve by making sure the code that disables MediaElementSource and enables BufferSourceNode are as close to execution as possible.
Then, I though it must be the HTML having a small delay when it starts, causing the recorded startTime to be off, to get around this, I used a Event Handler listening for 'play'.
I searched around and discovered Gapless 5 apparently did this without issue, looking at its source code, i could not discovered how it is switching sources seamlessly
play(offset) {
this.createNodes();
this.connectNodes();
//if webAudio's buffer source is not ready, starting playing with HTML5
if (!this.audioClip.isWebAudioReady() &&
this.audioClip.playType > 0) {
this.playHTML5();
}
//returns true if buffer != null
if (!this.audioClip.isWebAudioReady()) {
this.audioClip.addDecodeListener(this.play.bind(this));
}
if (this.audioClip.isWebAudioReady()) {
this.playBufferSource();
}
playHTML5() {
var context = AudioManager.context();
if (this.audioClip.isHTML5Ready()) {
this.createHTMLSourceNode();
console.log("playing HTML5");
this.mediaElementSourceNode.connect(this.gainNode);
this.mediaElementSourceNode.source.play();
this.startTime = context.currentTime;
}
else {
console.log('not ready yet');
this.audioClip.addLoadListener(this.playHTML5.bind(this));
}
}
playBufferSource() {
var context = AudioManager.context();
var offset = context.currentTime - this.startTime;
if (!this.bufferSourceNode) {
this.createBufferSourceNode();
}
this.bufferSourceNode.connect(this.gainNode);
//hoplessly attempt to make up for Thread latencey
offset = context.currentTime - this.startTime;
if (this.audioClip.playType > 0) {
this.mediaElementSourceNode.disconnect();
this.mediaElementSourceNode = null;
}
if (this.audioClip.playType == 0) {
offset = 0;
this.bufferSourceNode.start(0, offset);
}
else {
offset = context.currentTime - this.startTime;
this.bufferSourceNode.start(0, offset);
}
// console.log("starting web audio at" + offset);
}
createBufferSourceNode() {
var context = AudioManager.context();
if (!this.audioClip.webAudioReady) {
console.log('Web Audio not ready!, Sometihng went wrong!');
return;
}
var buffer = this.audioClip.buffer;
this.bufferSourceNode = context.createBufferSource();
//When using anything other than Buffer,
//we want to disable pitching.
if (this.audioClip.playType == NS.PlayTypes.Buffer) {
this.bufferSourceNode.playbackRate.setValueAtTime(this._pitch,
context.currentTime);
}
this.bufferSourceNode.buffer = buffer;
}
createHTMLSourceNode() {
var context = AudioManager.context();
var HTMLAudio = this.audioClip.mediaElement.cloneNode(false);
//HTMLAudio.addEventListener('ended', onHTML5Ended.bind(this), false);
HTMLAudio.addEventListener('play', this.onHTML5Play.bind(this), false);
var sourceNode = context.createMediaElementSource(HTMLAudio);
sourceNode.source = HTMLAudio;
this.mediaElementSourceNode = sourceNode;
}
/**
*
*/
onHTML5Play() {
this.startTime = AudioManager.context().currentTime;
console.log("HTML5 started playing");
}
Since I'm starting the second source as close as possible in time with the first, I should technically not hear any clicks if the waveform line up close enough, but the resulting clicks are very audible, sometimes 2 clicks are audible.
I'm exploring the Web Audio api in an attempt to try and adapt some aspects of the api into a non-web framework I'm working on (which'll get compiled for the web via Emscripten).
Take the following code:
var audioCtx = new AudioContext();
// imagine I've called getUserMedia and have the stream from a mic.
var source = audioCtx.createMediaStreamSource(stream);
// make a filter to alter the input somehow
var biquadFilter = audioCtx.createBiquadFilter();
// imagine we've set some settings
source.connect(biquadFilter);
Say I wanted to get the raw data of the input stream after it's been altered by the BiQuadFilter (or any other filter). Is there any way to do that? As far as I can tell it looks like the AnalyserNode might be what I'm looking for but ideally it'd be great to just pull a buffer off the end of the graph if possible.
Any hints or suggestions are appreciated.
There are two ways...
ScriptProcessorNode
You can use a ScriptProcessorNode, which is normally used to process data in your own code, to simply record the raw 32-bit float PCM audio data.
Whether this node outputs anything or not is up to you. I usually copy the input data to the output out of convenience, but there is a slight overhead to that.
MediaRecorder
The MediaRecorder can be used to record MediaStreams, both audio and/or video. First you'll need a MediaStreamAudioDestinationNode. Once you have that, you can use the MediaRecorder with the resulting stream to record it.
It's important to note that typically with the MediaRecorder, you're recording compressed audio with a lossy codec. This is essentially the purpose of the MediaRecorder. However, support for PCM in WebM has recently been added by at least Chrome. Just use {type: 'audio/webm;codecs=pcm'} when instantiating your MediaRecorder.
(I haven't tested this yet, but I suspect you're going to end up with 16-bit PCM, not 32-bit float which is used internally in the Web Audio API.)
here is a web page just save it mycode.html then give its file location to your browser ... it will prompt to gain access to your microphone ... take notice of createMediaStreamSource as well as where raw audio buffer is accessed then printed to browser console log ... essentially you define callback functions which make available the raw audio upon each Web Audio API event loop iteration - enjoy
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>capture microphone then show time & frequency domain output</title>
<script type="text/javascript">
var webaudio_tooling_obj = function () {
var audioContext = new AudioContext();
console.log("audio is starting up ...");
var BUFF_SIZE_RENDERER = 16384;
var SIZE_SHOW = 3; // number of array elements to show in console output
var audioInput = null,
microphone_stream = null,
gain_node = null,
script_processor_node = null,
script_processor_analysis_node = null,
analyser_node = null;
if (!navigator.getUserMedia)
navigator.getUserMedia = navigator.getUserMedia || navigator.webkitGetUserMedia ||
navigator.mozGetUserMedia || navigator.msGetUserMedia;
if (navigator.getUserMedia){
navigator.getUserMedia({audio:true},
function(stream) {
start_microphone(stream);
},
function(e) {
alert('Error capturing audio.');
}
);
} else { alert('getUserMedia not supported in this browser.'); }
// ---
function show_some_data(given_typed_array, num_row_to_display, label) {
var size_buffer = given_typed_array.length;
var index = 0;
console.log("__________ " + label);
if (label === "time") {
for (; index < num_row_to_display && index < size_buffer; index += 1) {
var curr_value_time = (given_typed_array[index] / 128) - 1.0;
console.log(curr_value_time);
}
} else if (label === "frequency") {
for (; index < num_row_to_display && index < size_buffer; index += 1) {
console.log(given_typed_array[index]);
}
} else {
throw new Error("ERROR - must pass time or frequency");
}
}
function process_microphone_buffer(event) {
var i, N, inp, microphone_output_buffer;
// not needed for basic feature set
// microphone_output_buffer = event.inputBuffer.getChannelData(0); // just mono - 1 channel for now
}
function start_microphone(stream){
gain_node = audioContext.createGain();
gain_node.connect( audioContext.destination );
microphone_stream = audioContext.createMediaStreamSource(stream);
microphone_stream.connect(gain_node);
script_processor_node = audioContext.createScriptProcessor(BUFF_SIZE_RENDERER, 1, 1);
script_processor_node.onaudioprocess = process_microphone_buffer;
microphone_stream.connect(script_processor_node);
// --- enable volume control for output speakers
document.getElementById('volume').addEventListener('change', function() {
var curr_volume = this.value;
gain_node.gain.value = curr_volume;
console.log("curr_volume ", curr_volume);
});
// --- setup FFT
script_processor_analysis_node = audioContext.createScriptProcessor(2048, 1, 1);
script_processor_analysis_node.connect(gain_node);
analyser_node = audioContext.createAnalyser();
analyser_node.smoothingTimeConstant = 0;
analyser_node.fftSize = 2048;
microphone_stream.connect(analyser_node);
analyser_node.connect(script_processor_analysis_node);
var buffer_length = analyser_node.frequencyBinCount;
var array_freq_domain = new Uint8Array(buffer_length);
var array_time_domain = new Uint8Array(buffer_length);
console.log("buffer_length " + buffer_length);
script_processor_analysis_node.onaudioprocess = function() {
// get the average for the first channel
analyser_node.getByteFrequencyData(array_freq_domain);
analyser_node.getByteTimeDomainData(array_time_domain);
// draw the spectrogram
if (microphone_stream.playbackState == microphone_stream.PLAYING_STATE) {
show_some_data(array_freq_domain, SIZE_SHOW, "frequency");
show_some_data(array_time_domain, SIZE_SHOW, "time"); // store this to record to aggregate buffer/file
}
};
}
}(); // webaudio_tooling_obj = function()
</script>
</head>
<body>
<p>Volume</p>
<input id="volume" type="range" min="0" max="1" step="0.1" value="0.0"/>
</body>
</html>
a variation of above approach will allow you to replace microphone with your own logic to synthesize the audio curve again with ability to access the audio buffer
On other alternative is to create your graph with an OfflineAudioContext. You do have to know before hand how much data you want to capture, but if you do, you'll get your result faster than real-time, usually.
You'll get the raw PCM data so you can save it or analyze it, further modify it, or whatever.
I have a live, constant source of waveform data that gives me a second of single-channel audio with constant sample rate every second. Currently I play them this way:
// data : Float32Array, context: AudioContext
function audioChunkReceived (context, data, sample_rate) {
var audioBuffer = context.createBuffer(2, data.length, sample_rate);
audioBuffer.getChannelData(0).set(data);
var source = context.createBufferSource(); // creates a sound source
source.buffer = audioBuffer;
source.connect(context.destination);
source.start(0);
}
Audio plays fine but with noticeable pauses between consecutive chunks being played (as expected). I'd like to get rid of them and I understand I'll have to introduce some kind of buffering.
Questions:
Is there a JS library that can do this for me? (I'm in the process of searching through them)
If there is no library that can do this, how should I do it myself?
Detecting when playback finished in one source and have another one ready to play it immediately afterwards? (using AudioBufferSourceNode.onended event handler)
Create one large buffer and copy my audio chunks one after another and control the flow using AudioBufferSourceNode.start AudioBufferSourceNode.stop functions?
Something different?
I've written a small class in TypeScript that serves as buffer for now. It has bufferSize defined for controlling how many chunks it can hold. It's short and self-descriptive so I'll paste it here. There is much to improve so any ideas are welcome.
( you can quickly convert it to JS using: https://www.typescriptlang.org/play/ )
class SoundBuffer {
private chunks : Array<AudioBufferSourceNode> = [];
private isPlaying: boolean = false;
private startTime: number = 0;
private lastChunkOffset: number = 0;
constructor(public ctx:AudioContext, public sampleRate:number,public bufferSize:number = 6, private debug = true) { }
private createChunk(chunk:Float32Array) {
var audioBuffer = this.ctx.createBuffer(2, chunk.length, this.sampleRate);
audioBuffer.getChannelData(0).set(chunk);
var source = this.ctx.createBufferSource();
source.buffer = audioBuffer;
source.connect(this.ctx.destination);
source.onended = (e:Event) => {
this.chunks.splice(this.chunks.indexOf(source),1);
if (this.chunks.length == 0) {
this.isPlaying = false;
this.startTime = 0;
this.lastChunkOffset = 0;
}
};
return source;
}
private log(data:string) {
if (this.debug) {
console.log(new Date().toUTCString() + " : " + data);
}
}
public addChunk(data: Float32Array) {
if (this.isPlaying && (this.chunks.length > this.bufferSize)) {
this.log("chunk discarded");
return; // throw away
} else if (this.isPlaying && (this.chunks.length <= this.bufferSize)) { // schedule & add right now
this.log("chunk accepted");
let chunk = this.createChunk(data);
chunk.start(this.startTime + this.lastChunkOffset);
this.lastChunkOffset += chunk.buffer.duration;
this.chunks.push(chunk);
} else if ((this.chunks.length < (this.bufferSize / 2)) && !this.isPlaying) { // add & don't schedule
this.log("chunk queued");
let chunk = this.createChunk(data);
this.chunks.push(chunk);
} else { // add & schedule entire buffer
this.log("queued chunks scheduled");
this.isPlaying = true;
let chunk = this.createChunk(data);
this.chunks.push(chunk);
this.startTime = this.ctx.currentTime;
this.lastChunkOffset = 0;
for (let i = 0;i<this.chunks.length;i++) {
let chunk = this.chunks[i];
chunk.start(this.startTime + this.lastChunkOffset);
this.lastChunkOffset += chunk.buffer.duration;
}
}
}
}
You don't show how audioChunkReceived, but to get seamless playback, you have to make sure you have the data before you want to play it and before the previous one stops playing.
Once you have this, you can schedule the newest chunk to start playing when the previous one ends by calling start(t), where t is the end time of the previous chunk.
However, if the buffer sample rate is different from the context.sampleRate, it's probably not going to play smoothly because of the resampling that is needed to convert the buffer to the context rate.
I think it is because you allocate your buffer for 2 channel.
change that to one.
context.createBuffer(2, data.length, sample_rate);
to
context.createBuffer(1, data.length, sample_rate);