We are developing a video stream from a mobile device to a computer using WebRTC. The mobile device might lose its connection completely and the computer should be able to detect that. Right now, the video just freezes. But neither of the EventHandlers of RTCPeerConnection are called in such a situation.
So how can such a connection failure be detected on the other peer?
How can a peer detect connection problems on connection establishment in the first place?
As a workaround in Firefox, you could use getStats to detect if packets stop coming in:
var findStat = (m, type) => [...m.values()].find(s => s.type == type && !s.isRemote);
var hasConnected = new Promise(resolve => pc.oniceconnectionstatechange =
e => pc.iceConnectionState == "connected" && resolve());
var hasDropped = hasConnected.then(() => new Promise(resolve => {
var lastPackets = countdown = 0, timeout = 3; // seconds
var iv = setInterval(() => pc.getStats().then(stats => {
var packets = findStat(stats, "inbound-rtp").packetsReceived;
countdown = (packets - lastPackets)? timeout : countdown - 1;
if (!countdown) resolve(clearInterval(iv));
lastPackets = packets;
}), 1000);
}));
Here's a demo: https://jsfiddle.net/4rzhe7n8/
the iceconnectionstatechange handler should fire after ~5-10 seconds of not receiving data from the peer anymore (in Chrome; Firefox is working on that currently). See https://webrtc.github.io/samples/src/content/peerconnection/states/ for an example.
Related
I am facing difficulty in running the remote stream.
I am getting stream Object Ontrack Event but the video not getting played. (auto run is true)
this is happening with a few peers, not with all the peers.
peer.iceConnectionState state changed to 'disconnected' and will never come back to the connected state.
what could be the solution for this situation.
peer.ontrack = e => {
console.log('video is receiveing')
e.track.onunmute = () => {
console.log('track unmuted');
addVideoElement(userId, e.streams[0])
}
}
function addVideoElement(userId, stream) {
const existingVideoElement = document.getElementById(userId)
console.log('existingVideoElement ',existingVideoElement)
if(existingVideoElement) {
existingVideoElement.srcObject = stream
return
}
console.log('new element for user', userId)
const videoCointainer = document.getElementById('videoListContainer')
const videoElement = document.createElement('video')
videoElement.setAttribute('id',userId)
videoElement.srcObject = stream
videoElement.muted = 0
videoElement.autoplay = true
if(iOS) {
videoElement.playsInline = true
}
if (userId === socket.id) {
videoElement.volume = 0
}
videoCointainer.appendChild(videoElement)
shouldMute(false)
}
Just because you get a stream object this does not mean the peerconnection got established which is a prerequisite for receiving data to display. If the first ice connection state you hit is "disconnected" that means the connection failed. Adding a TURN server increases the chance of success.
I've implemented the Google Cloud Speech to Text API using realtime streamed audio for the past few weeks. While initially everything looked really well, I've been testing the product on some more devices lately and have found some real weird irregularities when it comes to some iDevices.
First of all, here are the relevant code pieces:
Frontend (React Component)
constructor(props) {
super(props);
this.audio = props.audio;
this.socket = new SocketClient();
this.bufferSize = 2048;
}
/**
* Initializes the users microphone and the audio stream.
*
* #return {void}
*/
startAudioStream = async () => {
const AudioContext = window.AudioContext || window.webkitAudioContext;
this.audioCtx = new AudioContext();
this.processor = this.audioCtx.createScriptProcessor(this.bufferSize, 1, 1);
this.processor.connect(this.audioCtx.destination);
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
/* Debug through instant playback:
this.audio.srcObject = stream;
this.audio.play();
return; */
this.globalStream = stream;
this.audioCtx.resume();
this.input = this.audioCtx.createMediaStreamSource(stream);
this.input.connect(this.processor);
this.processor.onaudioprocess = (e) => {
this.microphoneProcess(e);
};
this.setState({ streaming: true });
}
/**
* Processes microphone input and passes it to the server via the open socket connection.
*
* #param {AudioProcessingEvent} e
* #return {void}
*/
microphoneProcess = (e) => {
const { speaking, askingForConfirmation, askingForErrorConfirmation } = this.state;
const left = e.inputBuffer.getChannelData(0);
const left16 = Helpers.downsampleBuffer(left, 44100, 16000);
if (speaking === false) {
this.socket.emit('stream', {
audio: left16,
context: askingForConfirmation || askingForErrorConfirmation ? 'zip_code_yes_no' : 'zip_code',
speechContext: askingForConfirmation || askingForErrorConfirmation ? ['ja', 'nein', 'ne', 'nö', 'falsch', 'neu', 'korrektur', 'korrigieren', 'stopp', 'halt', 'neu'] : ['$OPERAND'],
});
}
}
Helpers (DownsampleBuffer)
/**
* Downsamples a given audio buffer from sampleRate to outSampleRate.
* #param {Array} buffer The audio buffer to downsample.
* #param {number} sampleRate The original sample rate.
* #param {number} outSampleRate The new sample rate.
* #return {Array} The downsampled audio buffer.
*/
static downsampleBuffer(buffer, sampleRate, outSampleRate) {
if (outSampleRate === sampleRate) {
return buffer;
}
if (outSampleRate > sampleRate) {
throw new Error('Downsampling rate show be smaller than original sample rate');
}
const sampleRateRatio = sampleRate / outSampleRate;
const newLength = Math.round(buffer.length / sampleRateRatio);
const result = new Int16Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
let accum = 0;
let count = 0;
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = Math.min(1, accum / count) * 0x7FFF;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result.buffer;
}
Backend (Socket Server)
io.on('connection', (socket) => {
logger.debug('New client connected');
const speechClient = new SpeechService(socket);
socket.on('stream', (data) => {
const audioData = data.audio;
const context = data.context;
const speechContext = data.speechContext;
speechClient.transcribe(audioData, context, speechContext);
});
});
Backend (Speech Client / Transcribe Function where data is sent to GCloud)
async transcribe(data, context, speechContext, isFile = false) {
if (!this.recognizeStream) {
logger.debug('Initiating new Google Cloud Speech client...');
let waitingForMoreData = false;
// Create new stream to the Google Speech client
this.recognizeStream = this.speechClient
.streamingRecognize({
config: {
encoding: 'LINEAR16',
sampleRateHertz: 16000,
languageCode: 'de-DE',
speechContexts: speechContext ? [{ phrases: speechContext }] : undefined,
},
interimResults: false,
singleUtterance: true,
})
.on('error', (error) => {
if (error.code === 11) {
this.recognizeStream.destroy();
this.recognizeStream = null;
return;
}
this.socket.emit('error');
this.recognizeStream.destroy();
this.recognizeStream = null;
logger.error(`Received error from Google Cloud Speech client: ${error.message}`);
})
.on('data', async (gdata) => {
if ((!gdata.results || !gdata.results[0]) && gdata.speechEventType === 'END_OF_SINGLE_UTTERANCE') {
logger.debug('Received END_OF_SINGLE_UTTERANCE - waiting 300ms for more data before restarting stream');
waitingForMoreData = true;
setTimeout(() => {
if (waitingForMoreData === true) {
// User was silent for too long - restart stream
this.recognizeStream.destroy();
this.recognizeStream = null;
}
}, 300);
return;
}
waitingForMoreData = false;
const transcription = gdata.results[0].alternatives[0].transcript;
logger.debug(`Transcription: ${transcription}`);
// Emit transcription and MP3 file of answer
this.socket.emit('transcription', transcription);
const filename = await ttsClient.getAnswerFromTranscription(transcription, 'fairy', context); // TODO-Final: Dynamic character
if (filename !== null) this.socket.emit('speech', `${config.publicScheme}://${config.publicHost}:${config.publicPort}/${filename}`);
// Restart stream
if (this.recognizeStream) this.recognizeStream.destroy();
this.recognizeStream = null;
});
}
// eslint-disable-next-line security/detect-non-literal-fs-filename
if (isFile === true) fs.createReadStream(data).pipe(this.recognizeStream);
else this.recognizeStream.write(data);
}
Now, the behavior varies heavily throughout my tested devices. I've originally developed on an iMac 2017 using Google Chrome as a browser. Works like a charm. Then, tested on an iPhone 11 Pro and iPad Air 4, both on Safari and as a full-screen web app. Again, works like a charm.
Afterwards I've tried with an iPad Pro 12.9" 2017. Suddenly, Google Cloud sometimes doesn't return a transcription at all, some other times it returns stuff which, only using very much fantasy, sounds like the actually spoken text. Same behavior on an iPad 5 and an iPhone 6 Plus.
I don't really know where to go from here. What I've read up on so far at least is that with the iPhone 6s (no idea about iPads unfortunately) the hardware sample rate was changed from 44.1khz to 48khz. So I thought, this might be it, played around with the sample rates everywhere in the code, no success. Also, I've noticed that my iMac with Google Chrome also runs on 44.1khz like the "old" iPads where transcription doesn't work. Likewise, the new iPads run on 48khz - and here everything works fine. So this can't be it.
What I've noticed as well: When I connect some AirPods to the "broken" devices and use them as audio input, everything works again. So this must have something to do with processing of the internal microphone of those devices. I just don't know what exactly.
Could anyone lead me to the right direction? What has changed between these device generations in regards to audio and the microphone?
Update 1: I've now implemented a quick function which writes the streamed PCM data from the frontend to a file in the backend using node-wav. I think, I'm getting closer now - on the devices, where the speech recognition goes nuts, I sound like a chipmunk (extremely high-pitched). I've also noticed that the binary audio data is flowing in way slower than on the devices where everything is working fine. So this probably has to do with sample/bit rate, encoding or something. Unfortunately I'm not an audio expert, so not sure what to do next.
Update 2: After a lot of trial end error, I've found that if I set the sample rate to about 9500 to 10000 in the Google Cloud RecognizeConfig, everything works. When I set this as the sample rate for the node-wav file output, it sounds okay as well. If I reset the "outgoing" sample rate to GCloud to 16000 again and downsample the audio input to about 25000 instead of 16000 in the frontend from 44100 (see "Frontend (React Component)" in the "microphoneProcess" function), it works as well. So there seems to be some kind of ~0.6 factor in sample rate differences. However, I still don't know where this behavior is coming from: Both Chrome on the working iMac and Safari on the "broken" iPads have a audioContext.sampleRate of 44100. Therefore, when I downsample them to 16000 in the code, I'd suppose both should work, whereas only the iMac works. It seems like the iPad is working with a different sample rate internally?
After a ton of trial and error, I've found the problem (and the solution).
It seems like "older" iDevice models - like the 2017 iPad Pro - have some weird peculiarity of automatically adjusting the microphone sample rate to the rate of played audio. Even though the hardware sample rate of those devices is set to 44.1khz, as soon as some audio is played, the rate changes. This can be observed through something like this:
const audioCtx = new webkitAudioContext();
console.log(`Current sample rate: ${audioCtx.sampleRate}`); // 44100
const audio = new Audio();
audio.src = 'some_audio.mp3';
await audio.play();
console.log(`Current sample rate: ${audioCtx.sampleRate}`); // Sample rate of the played audio
In my case I've played some synthesized speech from Google Text-to-Speech before opening the speech transcription socket. Those sound files have a sample rate of 24khz - exactly the sample rate Google Cloud received my audio input in.
The solution therefore was - something I should have done anyways - to downsample everything to 16khz (see my helper function in the question), but not from hard-coded 44.1khz, rather from the current sample rate of the audio context. So I've changed my microphoneProcess() function like this:
const left = e.inputBuffer.getChannelData(0);
const left16 = Helpers.downsampleBuffer(left, this.audioCtx.sampleRate, 16000);
Conclusion: Do not trust Safari with the sample rate on page load. It might change.
I have an issue where RTCPeerConnection is not being garbage collected after being created via user interaction. The instances remain in chrome://webrtc-internals/ indefinitely and eventually users see an error about not being able to create new instances. I've reduced this down to a simple test page and this what I have:
<script>
let pc
function createPeerConnection() {
console.log("CREATE CONNECTION")
pc = new RTCPeerConnection()
}
function destroyPeerConnection() {
console.log("DISPOSE CONNECTION")
if (pc) {
pc.close()
pc = null
}
}
setTimeout(() => {
createPeerConnection()
setTimeout(destroyPeerConnection, 2000)
}, 1000)
</script>
<div>
<button onClick="javascript:createPeerConnection()">CREATE</button>
<button onClick="javascript:destroyPeerConnection()">DESTROY</button>
</div>
Interesting points to note:
On page load the timeout fires and creates, then destroys an instance, and this seems to work (instance disappears from webrtc internals after 5-10 seconds)
When clicking CREATE and DESTROY buttons, instances are not removed (they appear in webrtc internals indefinitely)
What am I doing wrong - feel like it must be something very simple. Any pointers much appreciated!
EDIT
In webrtc internals for the connection that is not being disposed I can see a media stream is still attached...
However, I am running the following code when closing the call:
console.log("DISPOSE CONNECTION")
pc.onsignalingstatechange = null
pc.onconnectionstatechange = null
pc.onnegotiationneeded = null
pc.onicecandidate = null
pc.ontrack = null
pc.getSenders().forEach(sender => {
console.log("STOP SENDER", sender)
pc.removeTrack(sender)
sender.setStreams()
sender.track?.stop()
})
pc.getReceivers().forEach(receiver => {
receiver.track?.stop()
})
pc.getTransceivers().forEach(transceiver => {
pc.removeTrack(transceiver.sender)
transceiver.sender.setStreams()
transceiver.sender.track?.stop()
transceiver.stop()
})
pc.close()
I am doing a POC and my requirement is that I want to implement the feature like OK google or Hey Siri on browser.
I am using the Chrome Browser's Web speech api. The things I noticed that I can't continuous the recognition as it terminates automatically after a certain period of time and I know its relevant because of security concern. I just does another hack like when the SpeechReognition terminates then on its end event I further start the SpeechRecogntion but it is not the best way to implement such a solution because suppose if I am using the 2 instances of same application on the different browser tab then It doesn't work or may be I am using another application in my browser that uses the speech recognition then both the application doesn't behave the same as expected. I am looking for a best approach to solve this problem.
Thanks in advance.
Since your problem is that you can't run the SpeechRecognition continuously for long periods of time, one way would be to start the SpeechRecognition only when you get some input in the mic.
This way only when there is some input, you will start the SR, looking for your magic_word.
If the magic_word is found, then you will be able to use the SR normally for your other tasks.
This can be detected by the WebAudioAPI, which is not tied by this time restriction SR suffers from. You can feed it by an LocalMediaStream from MediaDevices.getUserMedia.
For more info, on below script, you can see this answer.
Here is how you could attach it to a SpeechRecognition:
const magic_word = ##YOUR_MAGIC_WORD##;
// initialize our SpeechRecognition object
let recognition = new webkitSpeechRecognition();
recognition.lang = 'en-US';
recognition.interimResults = false;
recognition.maxAlternatives = 1;
recognition.continuous = true;
// detect the magic word
recognition.onresult = e => {
// extract all the transcripts
var transcripts = [].concat.apply([], [...e.results]
.map(res => [...res]
.map(alt => alt.transcript)
)
);
if(transcripts.some(t => t.indexOf(magic_word) > -1)){
//do something awesome, like starting your own command listeners
}
else{
// didn't understood...
}
}
// called when we detect silence
function stopSpeech(){
recognition.stop();
}
// called when we detect sound
function startSpeech(){
try{ // calling it twice will throw...
recognition.start();
}
catch(e){}
}
// request a LocalMediaStream
navigator.mediaDevices.getUserMedia({audio:true})
// add our listeners
.then(stream => detectSilence(stream, stopSpeech, startSpeech))
.catch(e => log(e.message));
function detectSilence(
stream,
onSoundEnd = _=>{},
onSoundStart = _=>{},
silence_delay = 500,
min_decibels = -80
) {
const ctx = new AudioContext();
const analyser = ctx.createAnalyser();
const streamNode = ctx.createMediaStreamSource(stream);
streamNode.connect(analyser);
analyser.minDecibels = min_decibels;
const data = new Uint8Array(analyser.frequencyBinCount); // will hold our data
let silence_start = performance.now();
let triggered = false; // trigger only once per silence event
function loop(time) {
requestAnimationFrame(loop); // we'll loop every 60th of a second to check
analyser.getByteFrequencyData(data); // get current data
if (data.some(v => v)) { // if there is data above the given db limit
if(triggered){
triggered = false;
onSoundStart();
}
silence_start = time; // set it to now
}
if (!triggered && time - silence_start > silence_delay) {
onSoundEnd();
triggered = true;
}
}
loop();
}
As a plunker, since neither StackSnippets nor jsfiddle's iframes will allow gUM in two versions...
Scenario: You would like to know if TURN server is being used for a particular call and which one from the array of TURN servers you provided during PeerConnection creation, is being used. Right now there are two options:
Wireshark: But when you are behind a corporate proxy and TURN server is outside that, wireshark would show the Proxy IP as the destination.( also not the mention the inconvenience of running it in the background)
Going through the stats page and finding out, chrome --> chrome://webrtc-internals and Firefox --> about:webrtc
I would like to use a alternative to the above two, programmatically determine this so I do not have to leave my application page.
Update: I've updated the example to follow the latest spec, with maplike getStats.
The following approach follows the specification and currently only works in Firefox, because Chrome implements getStats() incorrectly at the moment. Hopefully, a version of the adapter.js polyfill should be available soon that will make this work in Chrome as well.
When you run this fiddle in Firefox, you'll see:
checking
connected
Does not use TURN
This is because the example provides both a STUN and a TURN server. But when I modify the config to use TURN only with iceTransportPolicy: "relay", I see:
checking
connected
Uses TURN server: 10.252.73.50
Note that the turn server I use is behind a VPN, so it won't work for you, but feel free to modify the fiddle with your own server (just don't save it unless you want the info to become public!)
While I haven't tested with more than one turn server, as you can see the IP address shown matches the turn server configured, so it should be possible to tell which server is used using this approach.
// Turn server is on Mozilla's VPN.
var cfg = { iceTransportPolicy: "all", // set to "relay" to force TURN.
iceServers: [{ urls: "stun:stun.l.google.com:19302" },
{ urls: "turn:10.252.73.50",
username:"webrtc", credential:"firefox" }] };
var pc1 = new RTCPeerConnection(cfg), pc2 = new RTCPeerConnection(cfg);
pc1.onicecandidate = e => pc2.addIceCandidate(e.candidate);
pc2.onicecandidate = e => pc1.addIceCandidate(e.candidate);
pc2.oniceconnectionstatechange = () => log(pc2.iceConnectionState);
pc2.onaddstream = e => v2.srcObject = e.stream;
var findSelected = stats =>
[...stats.values()].find(s => s.type == "candidate-pair" && s.selected);
var start = () => navigator.mediaDevices.getUserMedia({ video: true })
.then(stream => pc1.addStream(v1.srcObject = stream))
.then(() => pc1.createOffer()).then(d => pc1.setLocalDescription(d))
.then(() => pc2.setRemoteDescription(pc1.localDescription))
.then(() => pc2.createAnswer()).then(d => pc2.setLocalDescription(d))
.then(() => pc1.setRemoteDescription(pc2.localDescription))
.then(() => waitUntil(() => pc1.getStats().then(s => findSelected(s))))
.then(() => pc1.getStats())
.then(stats => {
var candidate = stats.get(findSelected(stats).localCandidateId);
if (candidate.candidateType == "relayed") {
log("Uses TURN server: " + candidate.ipAddress);
} else {
log("Does not use TURN (uses " + candidate.candidateType + ").");
}
})
.catch(log);
var waitUntil = f => Promise.resolve(f())
.then(done => done || wait(200).then(() => waitUntil(f)));
var wait = ms => new Promise(resolve => setTimeout(resolve, ms));
var log = msg => div.innerHTML += msg +"<br>";
var failed = e => log(e +", line "+ e.lineNumber);
<video id="v1" width="108" height="81" autoplay></video>
<video id="v2" width="108" height="81" autoplay></video><br>
<button onclick="start()">Start!</button><br><div id="div"></div>
<script src="https://webrtc.github.io/adapter/adapter-latest.js"></script>
I wrote and tested the below piece of code, works in latest versions of both firefox and chrome, getConnectionDetails returns a promise which resolves to connection details:
function getConnectionDetails(peerConnection){
var connectionDetails = {}; // the final result object.
if(window.chrome){ // checking if chrome
var reqFields = [ 'googLocalAddress',
'googLocalCandidateType',
'googRemoteAddress',
'googRemoteCandidateType'
];
return new Promise(function(resolve, reject){
peerConnection.getStats(function(stats){
var filtered = stats.result().filter(function(e){return e.id.indexOf('Conn-audio')==0 && e.stat('googActiveConnection')=='true'})[0];
if(!filtered) return reject('Something is wrong...');
reqFields.forEach(function(e){connectionDetails[e.replace('goog', '')] = filtered.stat(e)});
resolve(connectionDetails);
});
});
}else{ // assuming it is firefox
return peerConnection.getStats(null).then(function(stats){
var selectedCandidatePair = stats[Object.keys(stats).filter(function(key){return stats[key].selected})[0]]
, localICE = stats[selectedCandidatePair.localCandidateId]
, remoteICE = stats[selectedCandidatePair.remoteCandidateId];
connectionDetails.LocalAddress = [localICE.ipAddress, localICE.portNumber].join(':');
connectionDetails.RemoteAddress = [remoteICE.ipAddress, remoteICE.portNumber].join(':');
connectionDetails.LocalCandidateType = localICE.candidateType;
connectionDetails.RemoteCandidateType = remoteICE.candidateType;
return connectionDetails;
});
}
}
I would like point out one thing, all these three methods fail in one scenario: two turn servers running from same machine on different ports, only reliable way I found was looking at the turn server logs.