I'm trying to use faster-whisper for speech-to-text transcription in a Jupyter Notebook. I followed a YouTube tutorial and refined my code with ChatGPT, but I'm not getting any transcription output. The recording seems to work, but the speech_recognition function doesn't display any text.
Here's my code:
# Standard library
from queue import Empty, Queue
from threading import Thread

# Third-party
import faster_whisper
import ipywidgets as wd
import numpy as np
import pyaudio
import sounddevice as sd
from IPython.display import display
# Load the Whisper model once at import time; "small" on CPU with int8
# quantization trades some accuracy for low memory use.
model = faster_whisper.WhisperModel("small", device="cpu", compute_type="int8")
# Hand-off queue: the recorder thread puts raw audio bytes, the transcriber
# thread gets them. queue.Queue is thread-safe, so no extra locking needed.
recordings = Queue()
# UI buttons plus an Output widget that captures display() calls made from
# the background threads.
record_button = wd.Button(description="Record", disabled=False, button_style="success", icon="microphone")
stop_button = wd.Button(description="Stop", disabled=False, button_style="warning", icon="stop")
output = wd.Output()
# PyAudio setup: resolve the default input device index up front.
p = pyaudio.PyAudio()
default_device_index = p.get_default_input_device_info().get("index", None)
CHANNELS = 1  # mono capture
FRAME_RATE = 16000  # 16 kHz sample rate
RECORD_SECONDS = 20  # NOTE(review): not referenced anywhere below — dead constant?
AUDIO_FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
SAMPLE_SIZE = 2  # bytes per int16 sample — NOTE(review): also unused below
CHUNK = 1024  # frames per stream.read() call
is_recording = False  # shared flag toggled by the start/stop button callbacks
def record_microphone():
    """Capture raw int16 audio from the default input device into the queue.

    Runs in a background thread until the module-level ``is_recording``
    flag is cleared by stop_recording().  Each queue item is one
    CHUNK-sized ``bytes`` buffer straight from PyAudio.
    """
    p = pyaudio.PyAudio()
    stream = p.open(format=AUDIO_FORMAT, channels=CHANNELS, rate=FRAME_RATE,
                    input=True, input_device_index=default_device_index,
                    frames_per_buffer=CHUNK)
    try:
        while is_recording:
            # exception_on_overflow=False: an input overflow would otherwise
            # raise IOError and silently kill this thread in a notebook —
            # recording appears to work but nothing ever reaches the queue.
            data = stream.read(CHUNK, exception_on_overflow=False)
            recordings.put(data)
    finally:
        # Always release the audio device, even if stream.read() raises.
        stream.stop_stream()
        stream.close()
        p.terminate()
def speech_recognition():
    """Drain the audio queue, transcribe ~1 s chunks, and display the text.

    Runs in a background thread alongside record_microphone().  Raw int16
    buffers are accumulated until at least one second of audio (FRAME_RATE
    samples) is available, converted to normalized float32, and fed to the
    Whisper model.  After recording stops, any leftover audio shorter than
    one second is flushed through the model so trailing speech is not lost.
    """
    def _transcribe(buffers):
        # Concatenate the int16 buffers and scale to [-1, 1) float32 —
        # the format faster-whisper expects for raw numpy input.
        audio_chunk = np.concatenate(buffers).astype(np.float32) / 32768
        segments, _ = model.transcribe(audio_chunk, language="en", beam_size=5)
        with output:
            for segment in segments:
                display(segment.text)

    audio_buffer = []
    while is_recording or not recordings.empty():
        try:
            # Block briefly instead of busy-spinning on an empty queue.
            data = recordings.get(timeout=0.1)
        except Empty:
            continue
        audio_buffer.append(np.frombuffer(data, dtype=np.int16))
        if len(audio_buffer) * CHUNK >= FRAME_RATE:
            _transcribe(audio_buffer)
            audio_buffer = []
    if audio_buffer:
        # Flush the final partial chunk left over when recording stopped.
        _transcribe(audio_buffer)
def start_recording(data):
    """Button callback: start the recorder and transcriber threads.

    Args:
        data: the ipywidgets Button instance passed by on_click (unused).
    """
    global is_recording
    if is_recording:
        # Ignore repeated clicks: a second click would spawn a duplicate
        # recorder/transcriber thread pair competing over the same queue.
        return
    is_recording = True
    with output:
        display("Listening...")
    # daemon=True so a forgotten recording session cannot keep the
    # notebook kernel from shutting down.
    Thread(target=record_microphone, daemon=True).start()
    Thread(target=speech_recognition, daemon=True).start()
def stop_recording(data):
    """Button callback: ask both worker threads to wind down.

    Clearing the shared ``is_recording`` flag makes record_microphone()
    leave its capture loop and lets speech_recognition() drain whatever
    remains in the queue before returning.

    Args:
        data: the ipywidgets Button instance passed by on_click (unused).
    """
    global is_recording
    with output:
        display("Stopped.")
    is_recording = False
# Wire the button callbacks and render the controls in the notebook cell.
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)
display(record_button, stop_button, output)
Any help is much appreciated