Revamp: embedded console, faster-whisper, simplified install

This commit is contained in:
Kristofer Söderström
2026-03-02 17:02:16 +01:00
parent 7d3fe1ba26
commit f8cf42733d
12 changed files with 449 additions and 139 deletions

View File

@@ -1,11 +1,56 @@
import os
import sys
import datetime
import site
from glob import glob
import whisper
from torch import backends, cuda, Generator
import colorama
from colorama import Back,Fore
colorama.init(autoreset=True)
# ---------------------------------------------------------------------------
# CUDA setup — must happen before importing faster_whisper / ctranslate2
# ---------------------------------------------------------------------------
def _setup_cuda_dlls():
"""Add NVIDIA pip-package DLL dirs to the DLL search path (Windows only).
pip-installed nvidia-cublas-cu12 / nvidia-cudnn-cu12 place their .dll
files inside the site-packages tree. Python 3.8+ on Windows does NOT
search PATH for DLLs loaded via ctypes/LoadLibrary, so we must
explicitly register every nvidia/*/bin and nvidia/*/lib directory using
os.add_dll_directory *and* prepend them to PATH (some native extensions
still rely on PATH).
"""
if sys.platform != "win32":
return
try:
for sp in site.getsitepackages():
nvidia_root = os.path.join(sp, "nvidia")
if not os.path.isdir(nvidia_root):
continue
for pkg in os.listdir(nvidia_root):
for sub in ("bin", "lib"):
d = os.path.join(nvidia_root, pkg, sub)
if os.path.isdir(d):
os.environ["PATH"] = d + os.pathsep + os.environ.get("PATH", "")
try:
os.add_dll_directory(d)
except (OSError, AttributeError):
pass
except Exception:
pass
_setup_cuda_dlls()
from faster_whisper import WhisperModel
def _detect_device():
"""Return (device, compute_type) for the best available backend."""
try:
import ctranslate2
cuda_types = ctranslate2.get_supported_compute_types("cuda")
if "float16" in cuda_types:
return "cuda", "float16"
except Exception:
pass
return "cpu", "int8"
# Get the path
@@ -16,12 +61,12 @@ def get_path(path):
# Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False):
    """
    Transcribes audio files in a specified folder using faster-whisper (CTranslate2).

    Args:
        path (str): Path to the folder containing the audio files.
        glob_file (list): List of audio file paths to transcribe.
        model (str, optional): Name of the Whisper model size to use for
            transcription. Defaults to None, which uses the default model.
        language (str, optional): Language code for transcription. Defaults to
            None, which enables automatic language detection.
        verbose (bool): When True, print every decoded segment; otherwise show
            a one-line rolling progress indicator per file.

    Returns:
        str: Human-readable summary — how many files were transcribed and
        where the .txt transcripts were saved, or a message that no files
        were eligible.

    Notes:
        - The function downloads the specified model if not available locally.
        - The transcribed text files will be saved in a "transcriptions" folder
          within the specified path.
        - Uses CTranslate2 for up to 4x faster inference compared to openai-whisper.
        - FFmpeg is bundled via the PyAV dependency — no separate installation needed.
    """
    # NOTE(review): the separator glyph was lost in extraction ("" * 46 in the
    # diff view); assuming a box-drawing dash — confirm against the original.
    SEP = "─" * 46

    # ── Step 1: Detect hardware ──────────────────────────────────────
    device, compute_type = _detect_device()
    print(f"⚙ Device: {device} | Compute: {compute_type}")

    # ── Step 2: Load model ───────────────────────────────────────────
    print(f"⏳ Loading model '{model}' — downloading if needed...")
    whisper_model = WhisperModel(model, device=device, compute_type=compute_type)
    print("✅ Model ready!")
    print(SEP)

    # ── Step 3: Transcribe files ─────────────────────────────────────
    total_files = len(glob_file)
    print(f"📂 Found {total_files} item(s) in folder")
    print(SEP)
    files_transcripted = []
    for file_num, file in enumerate(glob_file, start=1):
        # Transcript title = file name up to the first dot.
        title = os.path.basename(file).split('.')[0]
        print(f"\n{SEP}")
        print(f"📄 File {file_num}/{total_files}: {title}")
        try:
            segments, info = whisper_model.transcribe(
                file,
                language=language,
                beam_size=5
            )
            # Make folder if missing
            os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
            # `segments` is a lazy generator — iterating it performs the
            # decoding, so segments are streamed to disk as they arrive.
            segment_list = []
            with open("{}/transcriptions/{}.txt".format(path, title), 'w', encoding='utf-8') as f:
                f.write(title)
                for seg in segments:
                    start_ts = str(datetime.timedelta(seconds=seg.start))
                    end_ts = str(datetime.timedelta(seconds=seg.end))
                    f.write('\n[{} --> {}]:{}'.format(start_ts, end_ts, seg.text))
                    f.flush()
                    if verbose:
                        print(" [%.2fs → %.2fs] %s" % (seg.start, seg.end, seg.text))
                    else:
                        # '\r' keeps progress on one line.
                        print(" Transcribed up to %.0fs..." % seg.end, end='\r')
                    segment_list.append(seg)
            print(f"✅ Done — saved to transcriptions/{title}.txt")
            files_transcripted.append(segment_list)
        except Exception:
            # Skip anything FFmpeg/PyAV cannot decode instead of aborting the run.
            print('⚠ Not a valid audio/video file, skipping.')

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n{SEP}")
    if files_transcripted:
        output_text = f"✅ Finished! {len(files_transcripted)} file(s) transcribed.\n   Saved in: {path}/transcriptions"
    else:
        output_text = 'No files eligible for transcription — try another folder.'
    print(output_text)
    print(SEP)
    return output_text