node.js - Streaming audio from an LLM through webRTC

Im trying to forward audio from openAI's gpt-4o-audio-preview model from my NodeJS application to a client through webRTC (using flutter_webrtc). I created a class that transforms the audio delta's that gpt-4o-audio-preview returns and upsamples to 48khz and breaks up the buffer into a more "realtime" stream. This still isnt producing perfect audio every time and Im not entirely sure why. Ill occasionally get an entire response from gpt-4o-audio-preview to play back great but other times I hear the first couple words and it very quickly trails off into a robotic tone then stops.

handleLLMAudio() is what is called right from the output from the openAI library. Using chat completions im passing the value from event.choices[0]?.delta?.audio?.data.

My AudioStreamer class:

const wrtc = require("@roamhq/wrtc");
    
class AudioStreamer {
   constructor() {
      this.pc = null;
      this.inputSampleRate = 24000; // gpt-4o-audio-preview output sample rate
      this.outputSampleRate = 48000; // webrtc sample rate
      this.samplesPerFrame = 480;  // For PCM16, 960 bytes = 480 samples
      this.audioBuffer = new Float32Array(0);
      this.track = null;
      this.mediaStream = null;
      this.isPlaying = false;
      this.lastPlayTime = 0;
      
      // Calculate frame duration in milliseconds
      this.frameDuration = (this.samplesPerFrame / this.outputSampleRate) * 1000; // ~10ms
   }

   /**
    * Called just after a new RTCPeerConnection and RTCAudioSource is created 
    * this.setPeerConnection is also called before this method
    * @param {RTCAudioSource} audioSource 
    */
   async initialize(audioSource) {
      const { MediaStream } = wrtc;
      this.audioSource = audioSource;
      this.track = this.audioSource.createTrack();
      this.pc.addTrack(this.track);
      this.mediaStream = new MediaStream([this.audioSource]);
   }

   /**
    * Takes in base64 audio strings and begins the transformation to realtime audio
    * @param {String} base64Audio PCM16 audio delta's right from gpt-4o-audio-preview 
    */
   handleLLMAudio(base64Audio) {
      const buffer = Buffer.from(base64Audio, 'base64');
      const view = new DataView(buffer.buffer, buffer.byteOffset, buffer.length);
      const pcm16 = new Float32Array(buffer.length / 2);
      
      for (let i = 0; i < pcm16.length; i++) {
         const int16Value = view.getInt16(i * 2, true);
         pcm16[i] = int16Value / 32768.0;
      }

      const resampled = this.resampleBuffer(pcm16);

      // Add to buffer
      const newBuffer = new Float32Array(this.audioBuffer.length + resampled.length);
      newBuffer.set(this.audioBuffer);
      newBuffer.set(resampled, this.audioBuffer.length);
      this.audioBuffer = newBuffer;

      // Start playback if not already playing
      if (!this.isPlaying) {
         this.isPlaying = true;
         this.lastPlayTime = Date.now();
         this.processAudioBuffer();
      }
   }

   resampleBuffer(inputBuffer) {
      const outputLength = Math.ceil(inputBuffer.length * (this.outputSampleRate / this.inputSampleRate));
      const output = new Float32Array(outputLength);
      
      for (let i = 0; i < outputLength; i++) {
         const inputIndex = (i * this.inputSampleRate / this.outputSampleRate);
         const index = Math.floor(inputIndex);
         const fraction = inputIndex - index;
         
         const a = inputBuffer[index] || 0;
         const b = inputBuffer[index + 1] || 0;
         output[i] = a + fraction * (b - a);
      }
      
      return output;
   }

   float32ToPCM16(float32Array) {
      const pcm16 = new Int16Array(float32Array.length);
      for (let i = 0; i < float32Array.length; i++) {
         const sample = Math.max(-1, Math.min(1, float32Array[i]));
         pcm16[i] = Math.round(sample * 32767);
      }
      return pcm16;
   }

   sendAudioFrame(samples) {
      const pcm16Samples = this.float32ToPCM16(samples);
      
      this.audioSource.onData({
         samples: pcm16Samples,
         sampleRate: this.outputSampleRate,
         channelCount: 1,
         bitsPerSample: 16
      });
   }

   async processAudioBuffer() {
      if (!this.isPlaying || this.audioBuffer.length < this.samplesPerFrame) {
         return;
      }

      const now = Date.now();
      const timeSinceLastFrame = now - this.lastPlayTime;

      if (timeSinceLastFrame >= this.frameDuration) {
         const frame = this.audioBuffer.slice(0, this.samplesPerFrame);
         this.sendAudioFrame(frame);
         this.audioBuffer = this.audioBuffer.slice(this.samplesPerFrame);
         this.lastPlayTime = now;
      }

      // Schedule next frame
      setTimeout(() => this.processAudioBuffer(), Math.max(0, this.frameDuration - timeSinceLastFrame));
   }

   setPeerConnection(pc) {
      this.pc = pc;
   }

   reset() {
      console.log("AudioStreamer reset()");
      this.lastPlayTime = 0;
      this.audioBuffer = new Float32Array(0);
   }

   cleanup() {
      this.isPlaying = false;
      if (this.track) {
         this.track.stop();
      }
   }
}

module.exports = AudioStreamer;

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

node.js - Streaming audio from an LLM through webRTC - Stack Overflow

与本文相关的文章

评论列表(0)