Is there a way you'd recommend to get live (streaming) audio transcription in React Native, using a model on par with OpenAI Whisper or AssemblyAI?
I made a Node.js server that forwards messages and transcriptions between React Native and Deepgram. It fails 90% of the time but works sometimes. I know the issue occurs before the response is received from Deepgram. Are the WebSocket packets getting too big for longer transcriptions? Am I sending Finalize at the wrong time?
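(For context, the relay server itself isn't shown here. Below is a minimal sketch of what such a proxy could look like, assuming the `ws` package and Deepgram's raw streaming endpoint; the port, query parameters, and the JSON Finalize control message are assumptions based on Deepgram's docs, and the real server may differ.)

// proxy.ts (illustrative sketch only, not the actual server)
import { WebSocket, WebSocketServer } from "ws";

const DEEPGRAM_URL =
  "wss://api.deepgram.com/v1/listen?punctuate=true&channels=1&sample_rate=16000&encoding=linear16";

const server = new WebSocketServer({ port: 8080 });

server.on("connection", (client) => {
  // Open one upstream Deepgram connection per app client.
  const upstream = new WebSocket(DEEPGRAM_URL, {
    headers: { Authorization: `Token ${process.env.DEEPGRAM_API_KEY}` },
  });

  // Forward audio (binary frames) and control messages (text frames) to Deepgram.
  client.on("message", (data, isBinary) => {
    if (upstream.readyState !== WebSocket.OPEN) return;
    if (isBinary) {
      upstream.send(data);
    } else if (data.toString() === "Finalize") {
      // Assumption: Deepgram's streaming API expects control messages as JSON text frames.
      upstream.send(JSON.stringify({ type: "Finalize" }));
    }
  });

  // Relay Deepgram's transcription JSON back to the app.
  upstream.on("message", (data) => {
    if (client.readyState === WebSocket.OPEN) client.send(data.toString());
  });

  client.on("close", () => upstream.close());
  upstream.on("close", () => client.close());
});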
speech.ts
import { Audio } from "expo-av";
import { useEffect, useRef, useState } from "react";

export default function useSpeech() {
  const [recording, setRecording] = useState<Audio.Recording | null>(null);
  const [messages, setMessages] = useState<string[]>([]);
  const [inProgressMessage, setInProgressMessage] = useState<string | null>(null);
  const wsRef = useRef<WebSocket | null>(null);
  const [isAudioSetup, setIsAudioSetup] = useState(false);
  const [isWSReady, setIsWSReady] = useState(false);

  // Create WebSocket connection to Deepgram
  const setupWS = () => {
    const newWs = new WebSocket(
      `wss://time-defend.fly.dev`,
    );
    wsRef.current = newWs;

    // Handle WebSocket events
    newWs.onopen = () => {
      console.log('WebSocket connected');
      if (newWs === wsRef.current)
        setIsWSReady(true);
      console.log(newWs, wsRef.current, "newWs === ws", newWs === wsRef.current)
    };
    newWs.onmessage = (event) => {
      console.log('Received message:', event.data);
      const data = JSON.parse(event.data);
      // handle transcription
    };
    newWs.onerror = (error) => {
      console.error('WebSocket error:', error);
    };
    newWs.onclose = (event) => {
      console.log('WebSocket closed:', event.reason);
      if (newWs === wsRef.current)
        setIsWSReady(false);
    };
    setIsWSReady(false);
  };

  useEffect(() => {
    setupWS();
    // Request permissions and set up audio
    const setupAudio = async () => {
      try {
        await Audio.requestPermissionsAsync();
        await Audio.setAudioModeAsync({
          allowsRecordingIOS: true,
          playsInSilentModeIOS: true,
        });
        setIsAudioSetup(true);
      } catch (err) {
        console.error('Failed to setup audio:', err);
      }
    };
    setupAudio();
    // Cleanup WebSocket when unmounting
    return () => {
      if (wsRef.current) {
        wsRef.current.close();
      }
    };
  }, []);

  // Start Recording + Open WebSocket
  const startRecording = async () => {
    try {
      // Create a new recording
      const { recording: newRecording } = await Audio.Recording.createAsync(
        Audio.RecordingOptionsPresets.HIGH_QUALITY,
        (status) => {
          // This callback receives recording status updates
          console.log('Recording status:', status);
        },
        100 // Update interval in milliseconds
      );
      setRecording(newRecording);
      if (!wsRef.current) {
        throw new Error('WebSocket not initialized');
      }
    } catch (err) {
      console.error('Failed to start recording:', err);
    }
  };

  // Stop Recording + send "Finalize"
  const stopRecording = async () => {
    try {
      setIsWSReady(false);
      if (!recording) return;
      await recording.stopAndUnloadAsync();
      recording._options?.web.bitsPerSecond
      const uri = recording.getURI();
      setRecording(null);
      // wss://api.deepgram/v1/listen?punctuate=true&channels=1&sample_rate=16000&encoding=linear16
      // get the recording and encode it the right way as a string
      // Get the audio file as a blob
      // Web: Use fetch and Blob
      const response = await fetch(uri!);
      const blob = await response.blob();
      const reader = new FileReader();
      reader.onloadend = () => {
        let base64String = reader.result!;
        // console.log(base64String)
        // wsRef.current!.send(Buffer.from(base64String.split(",")[1], "base64"));
        wsRef.current!.send(base64String as ArrayBuffer)
        wsRef.current!.send("Finalize");
        // Once done, let Deepgram know we're finished sending audio
        // if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
        //   wsRef.current.send('Finalize');
        // }
      };
      reader.readAsArrayBuffer(blob);
      console.log('Recording stopped, file saved at:', uri);
    } catch (err) {
      console.error('Failed to stop recording:', err);
    }
  };

  const isReady = isAudioSetup && wsRef.current?.readyState === WebSocket.OPEN && isWSReady;
  console.log("isReady", isReady, isAudioSetup, wsRef.current?.readyState === WebSocket.OPEN, isWSReady)
  const isRecording = recording !== null;
  return { isReady, isRecording, startRecording, stopRecording, inProgressMessage, messages };
}
1 Answer
If you want to get an AI response from audio to text, you can easily use Google AI Studio and a generative AI model like Gemini 2.0 Flash.
I am sharing the demo code, and you can also visit the documentation. I am also sharing a Deepgram example that I used.
Google Gemini AI
Gemini audio to text documentation
// Make sure to include these imports:
// import { GoogleAIFileManager, FileState } from "@google/generative-ai/server";
// import { GoogleGenerativeAI } from "@google/generative-ai";
const fileManager = new GoogleAIFileManager(process.env.API_KEY);

const uploadResult = await fileManager.uploadFile(
  `${mediaPath}/samplesmall.mp3`,
  {
    mimeType: "audio/mp3",
    displayName: "Audio sample",
  },
);

let file = await fileManager.getFile(uploadResult.file.name);
while (file.state === FileState.PROCESSING) {
  process.stdout.write(".");
  // Sleep for 10 seconds
  await new Promise((resolve) => setTimeout(resolve, 10_000));
  // Fetch the file from the API again
  file = await fileManager.getFile(uploadResult.file.name);
}

if (file.state === FileState.FAILED) {
  throw new Error("Audio processing failed.");
}

// View the response.
console.log(
  `Uploaded file ${uploadResult.file.displayName} as: ${uploadResult.file.uri}`,
);

const genAI = new GoogleGenerativeAI(process.env.API_KEY);
const model = genAI.getGenerativeModel({ model: "gemini-1.5-flash" });

const result = await model.generateContent([
  "Tell me about this audio clip.",
  {
    fileData: {
      fileUri: uploadResult.file.uri,
      mimeType: uploadResult.file.mimeType,
    },
  },
]);
console.log(result.response.text());
Deepgram pre-recorded audio to text; it gives better output:
const fs = require('fs');
const { createClient } = require('@deepgram/sdk');

const deepgram = createClient(process.env.DEEPGRAM_API_KEY);

const transcribeFile = async (filePath: string) => {
  const { result, error } = await deepgram.listen.prerecorded.transcribeFile(
    // read the audio file from disk
    fs.readFileSync(filePath),
    {
      model: 'nova-3',
      smart_format: true,
    },
  );
  if (error) throw error;
  // Return the transcript from the first channel's top alternative
  return result?.results?.channels[0]?.alternatives[0]?.transcript;
};
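A quick usage sketch for the helper above; the file path is just a placeholder:

// Example call; replace the path with a real audio file.
transcribeFile('./recording.wav')
  .then((transcript) => console.log(transcript ?? 'No transcript returned'))
  .catch((err) => console.error('Transcription failed:', err));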
Hope this helps. Good luck.