Revamp: embedded console, faster-whisper, simplified install

This commit is contained in:
Kristofer Söderström
2026-03-02 17:02:16 +01:00
parent 7d3fe1ba26
commit f8cf42733d
12 changed files with 449 additions and 139 deletions

View File

@@ -1,11 +1,56 @@
import os
import sys
import datetime
import site
from glob import glob
import whisper
from torch import backends, cuda, Generator
import colorama
from colorama import Back,Fore
colorama.init(autoreset=True)
# ---------------------------------------------------------------------------
# CUDA setup — must happen before importing faster_whisper / ctranslate2
# ---------------------------------------------------------------------------
def _setup_cuda_dlls():
"""Add NVIDIA pip-package DLL dirs to the DLL search path (Windows only).
pip-installed nvidia-cublas-cu12 / nvidia-cudnn-cu12 place their .dll
files inside the site-packages tree. Python 3.8+ on Windows does NOT
search PATH for DLLs loaded via ctypes/LoadLibrary, so we must
explicitly register every nvidia/*/bin and nvidia/*/lib directory using
os.add_dll_directory *and* prepend them to PATH (some native extensions
still rely on PATH).
"""
if sys.platform != "win32":
return
try:
for sp in site.getsitepackages():
nvidia_root = os.path.join(sp, "nvidia")
if not os.path.isdir(nvidia_root):
continue
for pkg in os.listdir(nvidia_root):
for sub in ("bin", "lib"):
d = os.path.join(nvidia_root, pkg, sub)
if os.path.isdir(d):
os.environ["PATH"] = d + os.pathsep + os.environ.get("PATH", "")
try:
os.add_dll_directory(d)
except (OSError, AttributeError):
pass
except Exception:
pass
_setup_cuda_dlls()
from faster_whisper import WhisperModel
def _detect_device():
"""Return (device, compute_type) for the best available backend."""
try:
import ctranslate2
cuda_types = ctranslate2.get_supported_compute_types("cuda")
if "float16" in cuda_types:
return "cuda", "float16"
except Exception:
pass
return "cpu", "int8"
# Get the path
@@ -16,12 +61,12 @@ def get_path(path):
# Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False):
    """
    Transcribes audio files in a specified folder using faster-whisper (CTranslate2).

    Args:
        path (str): Path to the folder containing the audio files.
        glob_file (list): List of audio file paths to transcribe.
        model (str, optional): Name of the Whisper model size to use for
            transcription. Defaults to None, which uses the default model.
        language (str, optional): Language code for transcription. Defaults to
            None, which enables automatic language detection.
        verbose (bool): When True, print every decoded segment; otherwise show
            a one-line rolling progress indicator per file.

    Returns:
        str: Human-readable summary — how many files were transcribed and
        where the .txt transcripts were saved, or a message that no files
        were eligible.

    Notes:
        - The function downloads the specified model if not available locally.
        - The transcribed text files will be saved in a "transcriptions" folder
          within the specified path.
        - Uses CTranslate2 for up to 4x faster inference compared to openai-whisper.
        - FFmpeg is bundled via the PyAV dependency — no separate installation needed.
    """
    # NOTE(review): the separator glyph was lost in extraction ("" * 46 in the
    # diff view); assuming a box-drawing dash — confirm against the original.
    SEP = "─" * 46

    # ── Step 1: Detect hardware ──────────────────────────────────────
    device, compute_type = _detect_device()
    print(f"⚙ Device: {device} | Compute: {compute_type}")

    # ── Step 2: Load model ───────────────────────────────────────────
    print(f"⏳ Loading model '{model}' — downloading if needed...")
    whisper_model = WhisperModel(model, device=device, compute_type=compute_type)
    print("✅ Model ready!")
    print(SEP)

    # ── Step 3: Transcribe files ─────────────────────────────────────
    total_files = len(glob_file)
    print(f"📂 Found {total_files} item(s) in folder")
    print(SEP)
    files_transcripted = []
    for file_num, file in enumerate(glob_file, start=1):
        # Transcript title = file name up to the first dot.
        title = os.path.basename(file).split('.')[0]
        print(f"\n{SEP}")
        print(f"📄 File {file_num}/{total_files}: {title}")
        try:
            segments, info = whisper_model.transcribe(
                file,
                language=language,
                beam_size=5
            )
            # Make folder if missing
            os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
            # `segments` is a lazy generator — iterating it performs the
            # decoding, so segments are streamed to disk as they arrive.
            segment_list = []
            with open("{}/transcriptions/{}.txt".format(path, title), 'w', encoding='utf-8') as f:
                f.write(title)
                for seg in segments:
                    start_ts = str(datetime.timedelta(seconds=seg.start))
                    end_ts = str(datetime.timedelta(seconds=seg.end))
                    f.write('\n[{} --> {}]:{}'.format(start_ts, end_ts, seg.text))
                    f.flush()
                    if verbose:
                        print(" [%.2fs → %.2fs] %s" % (seg.start, seg.end, seg.text))
                    else:
                        # '\r' keeps progress on one line.
                        print(" Transcribed up to %.0fs..." % seg.end, end='\r')
                    segment_list.append(seg)
            print(f"✅ Done — saved to transcriptions/{title}.txt")
            files_transcripted.append(segment_list)
        except Exception:
            # Skip anything FFmpeg/PyAV cannot decode instead of aborting the run.
            print('⚠ Not a valid audio/video file, skipping.')

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n{SEP}")
    if files_transcripted:
        output_text = f"✅ Finished! {len(files_transcripted)} file(s) transcribed.\n   Saved in: {path}/transcriptions"
    else:
        output_text = 'No files eligible for transcription — try another folder.'
    print(output_text)
    print(SEP)
    return output_text