I’m working on a project where I need to use the speech_recognition module to process audio in real-time. However, from my research, it seems that speech_recognition (which works with pyaudio) only supports input sources like microphones and doesn’t seem to support capturing system output (the audio played through speakers). I need to find a way to capture the system’s output audio so that it can be processed by the recognizer.
I’ve tried using speech_recognition with pyaudio and sounddevice to record audio from the microphone, but neither captures system audio. I've looked into both loopback methods and using output devices directly as speech_recognition sources, but most of the options I found seem to be paid services. Any guidance or help would be appreciated.
Here's the code.
from queue import Queue
from time import sleep

import numpy as np
import speech_recognition as sr
import whisper
def select_microphone():
    """Prompt the user to choose an audio device and return it as a source.

    Prints every device name reported by
    ``sr.Microphone.list_microphone_names()`` next to its index, then keeps
    asking on stdin until an index within that list is entered.

    Returns:
        sr.Microphone: the chosen device, opened with ``sample_rate=16000``.

    Raises:
        SystemExit: with status 1 when no devices are reported.
    """
    # Query the device list once; a name's position in the list is used as
    # its device index.
    names = sr.Microphone.list_microphone_names()
    if not names:
        print("No available microphones.")
        # SystemExit is what exit() raises, without relying on the
        # site-injected ``exit`` helper being present.
        raise SystemExit(1)

    print("Available Microphones:")
    for index, name in enumerate(names):
        print(f"{index}: {name}")

    while True:
        raw_choice = input("Select microphone index: ")
        try:
            choice = int(raw_choice)
        except ValueError:
            choice = None  # non-numeric input is handled like an out-of-range index
        if choice is not None and 0 <= choice < len(names):
            return sr.Microphone(sample_rate=16000, device_index=choice)
        print("Invalid selection. Try again.")
# Load the Whisper model once, before any audio is captured.
model = whisper.load_model("small")
data_queue = Queue()  # raw audio bytes handed over by the listener thread
transcription = ['']
recorder = sr.Recognizer()

# Select microphone
source = select_microphone()


def record_callback(_, audio: sr.AudioData):
    """Queue the raw bytes of each phrase captured by the background listener."""
    data_queue.put(audio.get_raw_data())


# Calibrate against ambient noise, then leave the ``with`` block before
# starting the listener: listen_in_background() opens the source itself, so
# the source must not still be held open here.
with source:
    recorder.adjust_for_ambient_noise(source)
stop_listening = recorder.listen_in_background(source, record_callback)

try:
    while True:
        if data_queue.empty():
            sleep(0.25)
            continue

        # Drain through the queue's own (locked) API. Joining and clearing the
        # underlying deque directly can drop a chunk that the listener thread
        # adds between the two steps. This loop is the only consumer, so
        # get() cannot block once empty() has returned False.
        chunks = []
        while not data_queue.empty():
            chunks.append(data_queue.get())
        audio_data = b''.join(chunks)

        # Interpret the bytes as 16-bit signed PCM and scale to float32 in [-1, 1).
        audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

        # Run the model on the captured audio and keep any recognised text.
        result = model.transcribe(audio_np)
        text = result["text"].strip()
        if text:
            transcription.append(text)
except KeyboardInterrupt:
    pass
finally:
    # Ask the background listener thread to stop without blocking shutdown.
    stop_listening(wait_for_stop=False)

print("\nFinal Transcription:")
print("\n".join(transcription))