
javascript - React Native Text to Speech with Online Transcription - Stack Overflow


Is there a way you'd recommend to get live (streaming) transcribed audio in React Native, using a model on par with OpenAI Whisper or AssemblyAI?

I made a Node.js server that forwards messages and transcriptions between React Native and Deepgram. It fails about 90% of the time but works sometimes. I know the issue occurs before the response is received from Deepgram. Are the WebSocket packets getting too big for longer transcriptions? Am I sending "Finalize" at the wrong time?

speech.ts

import { Audio } from "expo-av";
import { useEffect, useRef, useState } from "react";

export default function useSpeech() {
    const [recording, setRecording] = useState<Audio.Recording | null>(null);
    const [messages, setMessages] = useState<string[]>([]);
    const [inProgressMessage, setInProgressMessage] = useState<string | null>(null);
    const wsRef = useRef<WebSocket | null>(null);
    const [isAudioSetup, setIsAudioSetup] = useState(false);
    const [isWSReady, setIsWSReady] = useState(false);

    // Create WebSocket connection to Deepgram
    const setupWS = () => {
        const newWs = new WebSocket(
            `wss://time-defend.fly.dev`,
        );
        wsRef.current = newWs;
        // Handle WebSocket events
        newWs.onopen = () => {
            console.log('WebSocket connected');
            if (newWs === wsRef.current)
                setIsWSReady(true);
            console.log(newWs, wsRef.current, "newWs === ws", newWs === wsRef.current)
        };
        newWs.onmessage = (event) => {
            console.log('Received message:', event.data);
            const data = JSON.parse(event.data);
            // handle transcription
        };
        newWs.onerror = (error) => {
            console.error('WebSocket error:', error);
        };
        newWs.onclose = (event) => {
            console.log('WebSocket closed:', event.reason);
            if (newWs === wsRef.current)
                setIsWSReady(false);
        };
        setIsWSReady(false);
    };

    useEffect(() => {
        setupWS();
        // Request permissions and set up audio
        const setupAudio = async () => {
            try {
                await Audio.requestPermissionsAsync();
                await Audio.setAudioModeAsync({
                    allowsRecordingIOS: true,
                    playsInSilentModeIOS: true,
                });
                setIsAudioSetup(true);
            } catch (err) {
                console.error('Failed to setup audio:', err);
            }
        };
        setupAudio();

        // Cleanup WebSocket when unmounting
        return () => {
            if (wsRef.current) {
                wsRef.current.close();
            }
        };
    }, []);

    // Start Recording + Open WebSocket
    const startRecording = async () => {
        try {
            // Create a new recording
            const { recording: newRecording } = await Audio.Recording.createAsync(
                Audio.RecordingOptionsPresets.HIGH_QUALITY,
                (status) => {
                    // This callback receives recording status updates
                    console.log('Recording status:', status);
                },
                100 // Update interval in milliseconds
            );
            setRecording(newRecording);

            if (!wsRef.current) {
                throw new Error('WebSocket not initialized');
            }
        } catch (err) {
            console.error('Failed to start recording:', err);
        }
    };

    // Stop Recording + send "Finalize"
    const stopRecording = async () => {
        try {
            setIsWSReady(false);
            if (!recording) return;

            await recording.stopAndUnloadAsync();
            recording._options?.web.bitsPerSecond
            const uri = recording.getURI();
            setRecording(null);

            // wss://api.deepgram/v1/listen?punctuate=true&channels=1&sample_rate=16000&encoding=linear16
            // get the recording and encode it the right way as a string
            // Get the audio file as a blob

            // Web: Use fetch and Blob
            const response = await fetch(uri!);
            const blob = await response.blob();
            const reader = new FileReader();

            reader.onloadend = () => {
                let base64String = reader.result!;

                // console.log(base64String)

                // wsRef.current!.send(Buffer.from(base64String.split(",")[1], "base64"));
                wsRef.current!.send(base64String as ArrayBuffer)
                wsRef.current!.send("Finalize");

                // Once done, let Deepgram know we're finished sending audio
                // if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
                //   wsRef.current.send('Finalize');
                // }
            };

            reader.readAsArrayBuffer(blob);

            console.log('Recording stopped, file saved at:', uri);
        } catch (err) {
            console.error('Failed to stop recording:', err);
        }
    };
    const isReady = isAudioSetup && wsRef.current?.readyState === WebSocket.OPEN && isWSReady;
    console.log("isReady", isReady, isAudioSetup, wsRef.current?.readyState === WebSocket.OPEN, isWSReady)
    const isRecording = recording !== null;
    return { isReady, isRecording, startRecording, stopRecording, inProgressMessage, messages };
}

asked Mar 17 at 17:05 by Michael Barr, edited Mar 17 at 17:34

1 Answer


If you want to get an AI response that turns audio into text, you can easily use Google AI Studio with a generative AI model like Gemini-flash-2.0.

I am sharing the demo code below, and you can also check the documentation. I am also sharing a Deepgram example that I used.

Google Gemini AI

Gemini audio to text documentation

// Make sure to include these imports:
// import { GoogleAIFileManager, FileState } from "@google/generative-ai/server";
// import { GoogleGenerativeAI } from "@google/generative-ai";
const fileManager = new GoogleAIFileManager(process.env.API_KEY);

const uploadResult = await fileManager.uploadFile(
  `${mediaPath}/samplesmall.mp3`,
  {
    mimeType: "audio/mp3",
    displayName: "Audio sample",
  },
);

let file = await fileManager.getFile(uploadResult.file.name);
while (file.state === FileState.PROCESSING) {
  process.stdout.write(".");
  // Sleep for 10 seconds
  await new Promise((resolve) => setTimeout(resolve, 10_000));
  // Fetch the file from the API again
  file = await fileManager.getFile(uploadResult.file.name);
}

if (file.state === FileState.FAILED) {
  throw new Error("Audio processing failed.");
}

// View the response.
console.log(
  `Uploaded file ${uploadResult.file.displayName} as: ${uploadResult.file.uri}`,
);

const genAI = new GoogleGenerativeAI(process.env.API_KEY);
const model = genAI.getGenerativeModel({ model: "gemini-1.5-flash" });
const result = await model.generateContent([
  "Tell me about this audio clip.",
  {
    fileData: {
      fileUri: uploadResult.file.uri,
      mimeType: uploadResult.file.mimeType,
    },
  },
]);
console.log(result.response.text());
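
For shorter clips, the Files API round trip above can reportedly be skipped by passing the audio inline as base64 (inline request data is limited to roughly 20 MB). Here is a minimal sketch under that assumption, using the same @google/generative-ai SDK; audioBase64 is a hypothetical variable holding your base64-encoded recording:

// Minimal sketch (assumption): send a short clip inline instead of using the Files API.
// audioBase64 is a hypothetical base64 string of the recording, without any "data:" prefix.
const { GoogleGenerativeAI } = require("@google/generative-ai");

const genAI = new GoogleGenerativeAI(process.env.API_KEY);
const model = genAI.getGenerativeModel({ model: "gemini-1.5-flash" });

const result = await model.generateContent([
  "Transcribe this audio clip.",
  {
    inlineData: {
      mimeType: "audio/mp3",
      data: audioBase64,
    },
  },
]);
console.log(result.response.text());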

Deepgram pre-recorded audio to text; it gives better output.

const fs = require('fs');
const { createClient } = require('@deepgram/sdk');

const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

const transcribeFile = async (filePath: string) => {
  const { result, error } = await deepgram.listen.prerecorded.transcribeFile(
    // Read the audio file from disk
    fs.readFileSync(filePath),
    {
      model: 'nova-3',
      smart_format: true,
    },
  );

  if (error) throw error;
  // Return the transcript from the first channel's top alternative
  return result?.results?.channels[0]?.alternatives[0]?.transcript;
};
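
Since the original question asks about live (streaming) transcription, here is a minimal sketch of the Node.js relay side using the Deepgram SDK's live interface instead of pre-recorded transcription. It assumes @deepgram/sdk v3; forwardToClient is a hypothetical callback that would push transcripts back to the React Native app over your own WebSocket:

// Minimal sketch (assumption): live streaming via the Deepgram SDK on the Node.js relay.
// forwardToClient is a hypothetical callback that sends text back to the React Native client.
const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk");

const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

const connection = deepgram.listen.live({
  model: "nova-3",
  smart_format: true,
  interim_results: true,
  encoding: "linear16",   // must match the audio format you actually send
  sample_rate: 16000,
});

connection.on(LiveTranscriptionEvents.Open, () => {
  connection.on(LiveTranscriptionEvents.Transcript, (data) => {
    const transcript = data.channel?.alternatives?.[0]?.transcript;
    if (transcript) forwardToClient(transcript);
  });
  connection.on(LiveTranscriptionEvents.Error, (err) => console.error(err));
});

// For each binary audio frame received from the React Native client:
//   connection.send(audioChunk);
// When the client is done speaking:
//   connection.finish(); // v3; newer SDK versions also expose requestClose()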

Hope this helps. Good luck
