I am trying to set up audio transcription using VOLK and Wave in Python. The final issue i have is that when I try to read the total frames in my audio file, the number is much higher than the actual frames in the file. For example, one audio file that I transcribed only contained 3440000 frames, but my program read 9779874 total frames. The console output was therefore "Progress: 3.52% Transcribed, processed 344000 / 9779874 frames[DEBUG] End of file reached."
My file processing function is below. Any help would be appreciated.
def process_file(self, file_path):
print(f"[DEBUG] Started processing file: {file_path}")
extension = os.path.splitext(file_path)[1].lower()
print(f"[DEBUG] File extension: {extension}")
#If it's an MP3, convert to WAV
if extension == ".mp3":
file_path = convert_mp3_to_wav(file_path)
if not file_path:
print("[ERROR] Conversion failed, skipping file.")
return
elif extension != ".wav":
print(f"[ERROR] Unsupported file format: {extension}")
return
# Open the audio file
try:
wf = wave.open(file_path, "rb")
print(f"[DEBUG] Opened WAV file: {file_path} ({wf.getnchannels()} channels, {wf.getsampwidth()} width, {wf.getframerate()} Hz)")
except Exception as e:
print(f"[ERROR] Failed to open file {file_path}: {e}")
return
# Check if audio is in the right format
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
print(f"[ERROR] Audio format is incorrect. Expected 1 channel, 2 bytes width, 16000 Hz sample rate.")
return
recognizer = KaldiRecognizer(model, wf.getframerate())
total_frames = wf.getnframes()
processed_frames = 0
text_result = ""
start_time = time.time()
print(f"[DEBUG] Starting transcription...")
while True:
data = wf.readframes(4000)
if len(data) == 0:
print("[DEBUG] End of file reached.")
break
if recognizer.AcceptWaveform(data):
processed_frames += len(data)
percentage = (processed_frames / total_frames) * 100
result = json.loads(recognizer.Result())
text_result += result.get("text", "") + " "
print(f" [DEBUG] Transcribed chunk: {result.get('text', '')}")
sample_width = wf.getsampwidth()
print(f"Sample width (bytes per sample): {sample_width}")
total_frames = wf.getnframes()
print(f"\rProgress: {percentage:.2f}% Transcribed, processed {processed_frames} / {total_frames} frames", end="")
end_time = time.time()
transcription_duration = end_time - start_time
print(f"[DEBUG] Finished transcription in {transcription_duration:.2f} seconds.")
```