Im trying to forward audio from openAI's gpt-4o-audio-preview model from my NodeJS application to a client through webRTC (using flutter_webrtc). I created a class that transforms the audio delta's that gpt-4o-audio-preview returns and upsamples to 48khz and breaks up the buffer into a more "realtime" stream. This still isnt producing perfect audio every time and Im not entirely sure why. Ill occasionally get an entire response from gpt-4o-audio-preview to play back great but other times I hear the first couple words and it very quickly trails off into a robotic tone then stops.
handleLLMAudio() is what is called right from the output from the openAI library. Using chat completions im passing the value from event.choices[0]?.delta?.audio?.data
.
My AudioStreamer class:
const wrtc = require("@roamhq/wrtc");
class AudioStreamer {
constructor() {
this.pc = null;
this.inputSampleRate = 24000; // gpt-4o-audio-preview output sample rate
this.outputSampleRate = 48000; // webrtc sample rate
this.samplesPerFrame = 480; // For PCM16, 960 bytes = 480 samples
this.audioBuffer = new Float32Array(0);
this.track = null;
this.mediaStream = null;
this.isPlaying = false;
this.lastPlayTime = 0;
// Calculate frame duration in milliseconds
this.frameDuration = (this.samplesPerFrame / this.outputSampleRate) * 1000; // ~10ms
}
/**
* Called just after a new RTCPeerConnection and RTCAudioSource is created
* this.setPeerConnection is also called before this method
* @param {RTCAudioSource} audioSource
*/
async initialize(audioSource) {
const { MediaStream } = wrtc;
this.audioSource = audioSource;
this.track = this.audioSource.createTrack();
this.pc.addTrack(this.track);
this.mediaStream = new MediaStream([this.audioSource]);
}
/**
* Takes in base64 audio strings and begins the transformation to realtime audio
* @param {String} base64Audio PCM16 audio delta's right from gpt-4o-audio-preview
*/
handleLLMAudio(base64Audio) {
const buffer = Buffer.from(base64Audio, 'base64');
const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.length);
const pcm16 = new Float32Array(buffer.length / 2);
for (let i = 0; i < pcm16.length; i++) {
const int16Value = view.getInt16(i * 2, true);
pcm16[i] = int16Value / 32768.0;
}
const resampled = this.resampleBuffer(pcm16);
// Add to buffer
const newBuffer = new Float32Array(this.audioBuffer.length + resampled.length);
newBuffer.set(this.audioBuffer);
newBuffer.set(resampled, this.audioBuffer.length);
this.audioBuffer = newBuffer;
// Start playback if not already playing
if (!this.isPlaying) {
this.isPlaying = true;
this.lastPlayTime = Date.now();
this.processAudioBuffer();
}
}
resampleBuffer(inputBuffer) {
const outputLength = Math.ceil(inputBuffer.length * (this.outputSampleRate / this.inputSampleRate));
const output = new Float32Array(outputLength);
for (let i = 0; i < outputLength; i++) {
const inputIndex = (i * this.inputSampleRate / this.outputSampleRate);
const index = Math.floor(inputIndex);
const fraction = inputIndex - index;
const a = inputBuffer[index] || 0;
const b = inputBuffer[index + 1] || 0;
output[i] = a + fraction * (b - a);
}
return output;
}
float32ToPCM16(float32Array) {
const pcm16 = new Int16Array(float32Array.length);
for (let i = 0; i < float32Array.length; i++) {
const sample = Math.max(-1, Math.min(1, float32Array[i]));
pcm16[i] = Math.round(sample * 32767);
}
return pcm16;
}
sendAudioFrame(samples) {
const pcm16Samples = this.float32ToPCM16(samples);
this.audioSource.onData({
samples: pcm16Samples,
sampleRate: this.outputSampleRate,
channelCount: 1,
bitsPerSample: 16
});
}
async processAudioBuffer() {
if (!this.isPlaying || this.audioBuffer.length < this.samplesPerFrame) {
return;
}
const now = Date.now();
const timeSinceLastFrame = now - this.lastPlayTime;
if (timeSinceLastFrame >= this.frameDuration) {
const frame = this.audioBuffer.slice(0, this.samplesPerFrame);
this.sendAudioFrame(frame);
this.audioBuffer = this.audioBuffer.slice(this.samplesPerFrame);
this.lastPlayTime = now;
}
// Schedule next frame
setTimeout(() => this.processAudioBuffer(), Math.max(0, this.frameDuration - timeSinceLastFrame));
}
setPeerConnection(pc) {
this.pc = pc;
}
reset() {
console.log("AudioStreamer reset()");
this.lastPlayTime = 0;
this.audioBuffer = new Float32Array(0);
}
cleanup() {
this.isPlaying = false;
if (this.track) {
this.track.stop();
}
}
}
module.exports = AudioStreamer;