feat: enhance transcription capabilities with MLX support and backend detection

2026-04-04 00:32:36 +02:00
parent f7d621e510
commit e29572420e
3 changed files with 362 additions and 41 deletions

app.py

@@ -4,7 +4,7 @@ import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from src._LocalTranscribe import transcribe, get_path
from src._LocalTranscribe import transcribe, get_path, detect_backend
import customtkinter
import threading
@@ -46,11 +46,93 @@ HF_MODEL_MAP = {
'KB Swedish (large)': 'KBLab/kb-whisper-large',
}
# Per-model info shown in the UI description label
# (speed, size, quality stars, suggested use)
MODEL_INFO = {
'tiny': ('Very fast', '~75 MB', '★★☆☆☆', 'Quick drafts & testing'),
'tiny.en': ('Very fast', '~75 MB', '★★☆☆☆', 'Quick drafts & testing (English only)'),
'base': ('Fast', '~145 MB', '★★★☆☆', 'Notes & short podcasts'),
'base.en': ('Fast', '~145 MB', '★★★☆☆', 'Notes & short podcasts (English only)'),
'small': ('Balanced', '~485 MB', '★★★★☆', 'Everyday use'),
'small.en': ('Balanced', '~485 MB', '★★★★☆', 'Everyday use (English only)'),
'medium': ('Accurate', '~1.5 GB', '★★★★☆', 'Professional content'),
'medium.en': ('Accurate', '~1.5 GB', '★★★★☆', 'Professional content (English only)'),
'large-v2': ('Slow', '~3 GB', '★★★★★', 'Maximum accuracy'),
'large-v3': ('Slow', '~3 GB', '★★★★★', 'Maximum accuracy (recommended)'),
'KB Swedish (tiny)': ('Very fast', '~75 MB', '★★★☆☆', 'Swedish — optimised by KBLab'),
'KB Swedish (base)': ('Fast', '~145 MB', '★★★☆☆', 'Swedish — optimised by KBLab'),
'KB Swedish (small)': ('Balanced', '~485 MB', '★★★★☆', 'Swedish — optimised by KBLab'),
'KB Swedish (medium)': ('Accurate', '~1.5 GB', '★★★★☆', 'Swedish — optimised by KBLab'),
'KB Swedish (large)': ('Slow', '~3 GB', '★★★★★', 'Swedish — KBLab, best accuracy'),
}
customtkinter.set_appearance_mode("System")
customtkinter.set_default_color_theme("blue") # Themes: blue (default), dark-blue, green
firstclick = True
# All languages supported by Whisper (display label → ISO code; None = auto-detect)
WHISPER_LANGUAGES = {
'Auto-detect': None,
'Afrikaans (af)': 'af', 'Albanian (sq)': 'sq',
'Amharic (am)': 'am', 'Arabic (ar)': 'ar',
'Armenian (hy)': 'hy', 'Assamese (as)': 'as',
'Azerbaijani (az)': 'az', 'Bashkir (ba)': 'ba',
'Basque (eu)': 'eu', 'Belarusian (be)': 'be',
'Bengali (bn)': 'bn', 'Bosnian (bs)': 'bs',
'Breton (br)': 'br', 'Bulgarian (bg)': 'bg',
'Catalan (ca)': 'ca', 'Chinese (zh)': 'zh',
'Croatian (hr)': 'hr', 'Czech (cs)': 'cs',
'Danish (da)': 'da', 'Dutch (nl)': 'nl',
'English (en)': 'en', 'Estonian (et)': 'et',
'Faroese (fo)': 'fo', 'Finnish (fi)': 'fi',
'French (fr)': 'fr', 'Galician (gl)': 'gl',
'Georgian (ka)': 'ka', 'German (de)': 'de',
'Greek (el)': 'el', 'Gujarati (gu)': 'gu',
'Haitian Creole (ht)': 'ht', 'Hausa (ha)': 'ha',
'Hawaiian (haw)': 'haw', 'Hebrew (he)': 'he',
'Hindi (hi)': 'hi', 'Hungarian (hu)': 'hu',
'Icelandic (is)': 'is', 'Indonesian (id)': 'id',
'Italian (it)': 'it', 'Japanese (ja)': 'ja',
'Javanese (jw)': 'jw', 'Kannada (kn)': 'kn',
'Kazakh (kk)': 'kk', 'Khmer (km)': 'km',
'Korean (ko)': 'ko', 'Lao (lo)': 'lo',
'Latin (la)': 'la', 'Latvian (lv)': 'lv',
'Lingala (ln)': 'ln', 'Lithuanian (lt)': 'lt',
'Luxembourgish (lb)': 'lb', 'Macedonian (mk)': 'mk',
'Malagasy (mg)': 'mg', 'Malay (ms)': 'ms',
'Malayalam (ml)': 'ml', 'Maltese (mt)': 'mt',
'Maori (mi)': 'mi', 'Marathi (mr)': 'mr',
'Mongolian (mn)': 'mn', 'Myanmar (my)': 'my',
'Nepali (ne)': 'ne', 'Norwegian (no)': 'no',
'Occitan (oc)': 'oc', 'Pashto (ps)': 'ps',
'Persian (fa)': 'fa', 'Polish (pl)': 'pl',
'Portuguese (pt)': 'pt', 'Punjabi (pa)': 'pa',
'Romanian (ro)': 'ro', 'Russian (ru)': 'ru',
'Sanskrit (sa)': 'sa', 'Serbian (sr)': 'sr',
'Shona (sn)': 'sn', 'Sindhi (sd)': 'sd',
'Sinhala (si)': 'si', 'Slovak (sk)': 'sk',
'Slovenian (sl)': 'sl', 'Somali (so)': 'so',
'Spanish (es)': 'es', 'Sundanese (su)': 'su',
'Swahili (sw)': 'sw', 'Swedish (sv)': 'sv',
'Tagalog (tl)': 'tl', 'Tajik (tg)': 'tg',
'Tamil (ta)': 'ta', 'Tatar (tt)': 'tt',
'Telugu (te)': 'te', 'Thai (th)': 'th',
'Tibetan (bo)': 'bo', 'Turkish (tr)': 'tr',
'Turkmen (tk)': 'tk', 'Ukrainian (uk)': 'uk',
'Urdu (ur)': 'ur', 'Uzbek (uz)': 'uz',
'Vietnamese (vi)': 'vi', 'Welsh (cy)': 'cy',
'Yiddish (yi)': 'yi', 'Yoruba (yo)': 'yo',
}
def _language_options_for_model(model_name):
"""Return (values, default, state) for the language combobox given a model name."""
if model_name.endswith('.en'):
return ['English (en)'], 'English (en)', 'disabled'
if model_name.startswith('KB Swedish'):
return ['Swedish (sv)'], 'Swedish (sv)', 'disabled'
return list(WHISPER_LANGUAGES.keys()), 'Auto-detect', 'readonly'
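
For reference, a quick illustration of the three branches above (values mirror the dicts defined earlier):

# Illustrative check of _language_options_for_model:
assert _language_options_for_model('base.en') == (['English (en)'], 'English (en)', 'disabled')
assert _language_options_for_model('KB Swedish (large)') == (['Swedish (sv)'], 'Swedish (sv)', 'disabled')
values, default, state = _language_options_for_model('large-v3')
assert default == 'Auto-detect' and state == 'readonly' and 'Swedish (sv)' in values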
def _set_app_icon(root):
@@ -94,22 +176,16 @@ class App:
self.path_entry.insert(0, os.path.join(os.getcwd(), 'sample_audio'))
self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
customtkinter.CTkButton(path_frame, text="Browse", command=self.browse, font=font).pack(side=tk.LEFT, padx=5)
# Language frame
#thanks to pommicket from Stackoverflow for this fix
def on_entry_click(event):
"""function that gets called whenever entry is clicked"""
global firstclick
if firstclick: # if this is the first time they clicked it
firstclick = False
self.language_entry.delete(0, "end") # delete all the text in the entry
# Language frame
language_frame = customtkinter.CTkFrame(master)
language_frame.pack(fill=tk.BOTH, padx=10, pady=10)
customtkinter.CTkLabel(language_frame, text="Language:", font=font).pack(side=tk.LEFT, padx=5)
self.language_entry = customtkinter.CTkEntry(language_frame, width=50, font=('Roboto', 12, 'italic'))
self.default_language_text = "Enter language (or ignore to auto-detect)"
self.language_entry.insert(0, self.default_language_text)
self.language_entry.bind('<FocusIn>', on_entry_click)
self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
_lang_values, _lang_default, _lang_state = _language_options_for_model('medium')
self.language_combobox = customtkinter.CTkComboBox(
language_frame, width=50, state=_lang_state,
values=_lang_values, font=font_b)
self.language_combobox.set(_lang_default)
self.language_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)
# Model frame
models = ['tiny', 'tiny.en', 'base', 'base.en',
'small', 'small.en', 'medium', 'medium.en',
@@ -124,9 +200,16 @@ class App:
# ComboBox frame
self.model_combobox = customtkinter.CTkComboBox(
model_frame, width=50, state="readonly",
values=models, font=font_b)
values=models, font=font_b,
command=self._on_model_change)
self.model_combobox.set('medium') # Set the default value
self.model_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)
# Model description label
self.model_desc_label = customtkinter.CTkLabel(
master, text=self._model_desc_text('medium'),
font=('Roboto', 11), text_color=('#555555', '#aaaaaa'),
anchor='w')
self.model_desc_label.pack(fill=tk.X, padx=14, pady=(0, 4))
# Timestamps toggle
ts_frame = customtkinter.CTkFrame(master)
ts_frame.pack(fill=tk.BOTH, padx=10, pady=10)
@@ -137,11 +220,17 @@ class App:
self.timestamps_switch.pack(side=tk.LEFT, padx=5)
# Progress Bar
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
# Stop event for cancellation
self._stop_event = threading.Event()
# Button actions frame
button_frame = customtkinter.CTkFrame(master)
button_frame.pack(fill=tk.BOTH, padx=10, pady=10)
self.transcribe_button = customtkinter.CTkButton(button_frame, text="Transcribe", command=self.start_transcription, font=font)
self.transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
self.stop_button = customtkinter.CTkButton(
button_frame, text="Stop", command=self._stop_transcription, font=font,
fg_color="#c0392b", hover_color="#922b21", state=tk.DISABLED)
self.stop_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
customtkinter.CTkButton(button_frame, text="Quit", command=master.quit, font=font).pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)
# ── Embedded console / log panel ──────────────────────────────────
@@ -156,11 +245,40 @@ class App:
sys.stdout = _ConsoleRedirector(self.log_box)
sys.stderr = _ConsoleRedirector(self.log_box)
# Backend indicator
_bi = detect_backend()
backend_label = customtkinter.CTkLabel(
master,
text=f"Backend: {_bi['label']}",
font=('Roboto', 11),
text_color=("#555555", "#aaaaaa"),
anchor='e',
)
backend_label.pack(fill=tk.X, padx=12, pady=(0, 2))
# Welcome message (shown after redirect so it appears in the panel)
print("Welcome to Local Transcribe with Whisper! \U0001f600")
print("Transcriptions will be saved automatically.")
print("" * 46)
# Helper functions
def _stop_transcription(self):
self._stop_event.set()
self.stop_button.configure(state=tk.DISABLED)
print("⛔ Stop requested — finishing current file…")
def _model_desc_text(self, model_name):
info = MODEL_INFO.get(model_name)
if not info:
return ''
speed, size, stars, use = info
return f'{stars} {speed} · {size} · {use}'
def _on_model_change(self, selected):
self.model_desc_label.configure(text=self._model_desc_text(selected))
values, default, state = _language_options_for_model(selected)
self.language_combobox.configure(values=values, state=state)
self.language_combobox.set(default)
# Browsing
def browse(self):
initial_dir = os.getcwd()
@@ -169,10 +287,10 @@ class App:
self.path_entry.insert(0, folder_path)
# Start transcription
def start_transcription(self):
# Disable transcribe button
self._stop_event.clear()
self.transcribe_button.configure(state=tk.DISABLED)
# Start a new thread for the transcription process
threading.Thread(target=self.transcribe_thread).start()
self.stop_button.configure(state=tk.NORMAL)
threading.Thread(target=self.transcribe_thread, daemon=True).start()
# Threading
def transcribe_thread(self):
path = self.path_entry.get()
@@ -183,14 +301,8 @@ class App:
self.transcribe_button.configure(state=tk.NORMAL)
return
model = HF_MODEL_MAP.get(model_display, model_display)
language = self.language_entry.get()
# Auto-set Swedish for KB models
is_kb_model = model_display.startswith('KB Swedish')
# Check if the language field has the default text or is empty
if is_kb_model:
language = 'sv'
elif language == self.default_language_text or not language.strip():
language = None # This is the same as passing nothing
lang_label = self.language_combobox.get()
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
verbose = True # always show transcription progress in the console panel
timestamps = self.timestamps_var.get()
# Show progress bar
@@ -201,16 +313,17 @@ class App:
#messagebox.showinfo("Message", "Starting transcription!")
# Start transcription
try:
output_text = transcribe(path, glob_file, model, language, verbose, timestamps)
output_text = transcribe(path, glob_file, model, language, verbose, timestamps, stop_event=self._stop_event)
except UnboundLocalError:
messagebox.showinfo("Files not found error!", 'Nothing found, choose another folder.')
pass
except ValueError:
messagebox.showinfo("Invalid language name, you might have to clear the default text to continue!")
except ValueError as e:
messagebox.showinfo("Error", str(e))
# Hide progress bar
self.progress_bar.stop()
self.progress_bar.pack_forget()
# Enable transcribe button
# Restore buttons
self.stop_button.configure(state=tk.DISABLED)
self.transcribe_button.configure(state=tk.NORMAL)
# Recover output text
try:

requirements.txt

@@ -1,2 +1,3 @@
faster-whisper
mlx-whisper
customtkinter
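
Note that mlx-whisper is only importable on Apple Silicon; detect_backend() degrades gracefully elsewhere, but if the dependency should not even install on other platforms, a PEP 508 environment marker would do it (untested suggestion):

mlx-whisper; sys_platform == "darwin" and platform_machine == "arm64"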

src/_LocalTranscribe.py

@@ -1,5 +1,6 @@
import os
import sys
import platform
import datetime
import time
import site
@@ -66,16 +67,124 @@ SUPPORTED_EXTENSIONS = {
}
def _detect_device():
"""Return (device, compute_type) for the best available backend."""
# ---------------------------------------------------------------------------
# MLX model map (Apple Silicon only)
# ---------------------------------------------------------------------------
_MLX_MODEL_MAP = {
"tiny": "mlx-community/whisper-tiny-mlx",
"base": "mlx-community/whisper-base-mlx",
"small": "mlx-community/whisper-small-mlx",
"medium": "mlx-community/whisper-medium-mlx",
"large-v2": "mlx-community/whisper-large-v2-mlx",
"large-v3": "mlx-community/whisper-large-v3-mlx",
}
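
The map deliberately covers only the multilingual OpenAI checkpoints; the .en and KB Swedish variants have no mlx-community conversion listed here, so a lookup miss triggers the CPU fallback in transcribe() below:

_MLX_MODEL_MAP.get("large-v3")   # 'mlx-community/whisper-large-v3-mlx'
_MLX_MODEL_MAP.get("base.en")    # None → transcribe() falls back to faster-whisper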
def detect_backend():
"""Return the best available inference backend.
Returns a dict with keys:
backend : "mlx" | "cuda" | "cpu"
device : device string for WhisperModel (cuda / cpu)
compute_type : compute type string for WhisperModel
label : human-readable label for UI display
"""
# Apple Silicon → try MLX (GPU + Neural Engine via Apple MLX)
if sys.platform == "darwin" and platform.machine() == "arm64":
try:
import mlx_whisper # noqa: F401
return {
"backend": "mlx",
"device": "cpu",
"compute_type": "int8",
"label": "MLX · Apple GPU/NPU",
}
except ImportError:
pass
# NVIDIA CUDA
try:
import ctranslate2
cuda_types = ctranslate2.get_supported_compute_types("cuda")
if "float16" in cuda_types:
return "cuda", "float16"
return {
"backend": "cuda",
"device": "cuda",
"compute_type": "float16",
"label": "CUDA · GPU",
}
except Exception:
pass
return "cpu", "int8"
return {
"backend": "cpu",
"device": "cpu",
"compute_type": "int8",
"label": "CPU · int8",
}
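
A sketch of how the returned dict is consumed on the non-MLX path (this mirrors the wiring in transcribe() below; the model name is illustrative):

info = detect_backend()
print("Using:", info["label"])
if info["backend"] != "mlx":
    # faster-whisper path; the MLX branch is handled separately in transcribe()
    from faster_whisper import WhisperModel
    wm = WhisperModel("medium", device=info["device"], compute_type=info["compute_type"])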
def _decode_audio_pyav(file_path):
"""Decode any audio/video file to a float32 mono 16 kHz numpy array.
Uses PyAV (bundled FFmpeg) — no external ffmpeg binary required.
Returns (audio_array, duration_seconds).
"""
import av
import numpy as np
with av.open(file_path) as container:
duration = float(container.duration) / 1_000_000 # microseconds → seconds
stream = container.streams.audio[0]
resampler = av.AudioResampler(format="fltp", layout="mono", rate=16000)
chunks = []
for frame in container.decode(stream):
for out in resampler.resample(frame):
if out:
chunks.append(out.to_ndarray()[0])
# Flush resampler
for out in resampler.resample(None):
if out:
chunks.append(out.to_ndarray()[0])
if not chunks:
return np.zeros(0, dtype=np.float32), duration
return np.concatenate(chunks, axis=0), duration
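
The resulting array is in the form _transcribe_mlx_file() below feeds to mlx_whisper: mono float32 at 16 kHz. A quick sanity check (file path hypothetical):

audio, duration = _decode_audio_pyav("sample_audio/clip.mp3")  # hypothetical file
assert audio.ndim == 1 and audio.dtype.name == "float32"
print(f"{duration:.1f}s decoded into {len(audio)} samples ({len(audio) / 16000:.1f}s at 16 kHz)")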
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
"""Transcribe a single file with mlx-whisper (Apple GPU/NPU).
Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference.
Returns (segments_as_dicts, audio_duration_seconds).
Segments have dict keys: 'start', 'end', 'text'.
"""
import mlx_whisper
audio_array, duration = _decode_audio_pyav(file)
decode_opts = {}
if language:
decode_opts["language"] = language
result = mlx_whisper.transcribe(
audio_array,
path_or_hf_repo=mlx_model_id,
verbose=(True if verbose else None),
**decode_opts,
)
segments = result["segments"]
audio_duration = segments[-1]["end"] if segments else duration
return segments, audio_duration
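
Illustrative single-file use on Apple Silicon, combining the model map above (file path hypothetical):

segments, audio_dur = _transcribe_mlx_file(
    "sample_audio/clip.mp3", _MLX_MODEL_MAP["small"],
    language="sv", timestamps=True, verbose=False)
for seg in segments[:3]:
    print(f'[{seg["start"]:.2f} → {seg["end"]:.2f}] {seg["text"].strip()}')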
def _srt_timestamp(seconds):
"""Convert seconds (float) to SRT timestamp format HH:MM:SS,mmm."""
ms = round(seconds * 1000)
h, ms = divmod(ms, 3_600_000)
m, ms = divmod(ms, 60_000)
s, ms = divmod(ms, 1000)
return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
# Get the path
@@ -91,7 +200,7 @@ def get_path(path):
return sorted(media_files)
# Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True):
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None):
"""
Transcribes audio files in a specified folder using faster-whisper (CTranslate2), or mlx-whisper on Apple Silicon when available.
@@ -122,10 +231,98 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
SEP = "─" * 46
# ── Step 1: Detect hardware ──────────────────────────────────────
device, compute_type = _detect_device()
print(f"⚙ Device: {device} | Compute: {compute_type}")
backend_info = detect_backend()
backend = backend_info["backend"]
device = backend_info["device"]
compute_type = backend_info["compute_type"]
print(f"⚙ Backend: {backend_info['label']}")
# ── Step 2: Load model ───────────────────────────────────────────
# ── Step 1b: MLX path (Apple GPU/NPU) ───────────────────────────
if backend == "mlx":
mlx_model_id = _MLX_MODEL_MAP.get(model)
if mlx_model_id is None:
print(f"⚠ Model '{model}' is not available in MLX format.")
print(" Falling back to faster-whisper on CPU (int8).")
backend = "cpu"
device, compute_type = "cpu", "int8"
else:
# ── Step 2 (MLX): load + transcribe ─────────────────────
print(f"⏳ Loading MLX model '{model}' — downloading if needed...")
print("✅ Model ready!")
print(SEP)
total_files = len(glob_file)
print(f"📂 Found {total_files} supported media file(s) in folder")
print(SEP)
if total_files == 0:
output_text = '⚠ No supported media files found — try another folder.'
print(output_text)
print(SEP)
return output_text
files_transcripted = []
file_num = 0
for file in glob_file:
if stop_event and stop_event.is_set():
print("⛔ Transcription stopped by user.")
break
title = os.path.basename(file).split('.')[0]
file_num += 1
print(f"\n{'' * 46}")
print(f"📄 File {file_num}/{total_files}: {title}")
try:
t_start = time.time()
segments, audio_duration = _transcribe_mlx_file(
file, mlx_model_id, language, timestamps, verbose
)
os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
segment_list = []
txt_path = "{}/transcriptions/{}.txt".format(path, title)
srt_path = "{}/transcriptions/{}.srt".format(path, title)
with open(txt_path, 'w', encoding='utf-8') as f, \
open(srt_path, 'w', encoding='utf-8') as srt_f:
f.write(title)
f.write('\n' + '─' * 40 + '\n')
for idx, seg in enumerate(segments, start=1):
if stop_event and stop_event.is_set():
break
text = seg["text"].strip()
if timestamps:
start_ts = str(datetime.timedelta(seconds=seg["start"]))
end_ts = str(datetime.timedelta(seconds=seg["end"]))
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
else:
f.write('\n{}'.format(text))
srt_f.write(f'{idx}\n{_srt_timestamp(seg["start"])} --> {_srt_timestamp(seg["end"])}\n{text}\n\n')
f.flush()
srt_f.flush()
if verbose:
print(" [%.2fs → %.2fs] %s" % (seg["start"], seg["end"], seg["text"]))
else:
print(" Transcribed up to %.0fs..." % seg["end"], end='\r')
segment_list.append(seg)
elapsed = time.time() - t_start
elapsed_min = elapsed / 60.0
audio_min = audio_duration / 60.0
ratio = audio_duration / elapsed if elapsed > 0 else float('inf')
print(f"✅ Done — saved to transcriptions/{title}.txt")
print(f"⏱ Transcribed {audio_min:.1f} min of audio in {elapsed_min:.1f} min ({ratio:.1f}x realtime)")
files_transcripted.append(segment_list)
except Exception as exc:
print(f"⚠ Could not decode '{os.path.basename(file)}', skipping.")
print(f" Reason: {exc}")
print(f"\n{SEP}")
if files_transcripted:
output_text = f"✅ Finished! {len(files_transcripted)} file(s) transcribed.\n Saved in: {path}/transcriptions"
else:
output_text = '⚠ No files eligible for transcription — try another folder.'
print(output_text)
print(SEP)
return output_text
# ── Step 2: Load model (faster-whisper / CTranslate2) ───────────
print(f"⏳ Loading model '{model}' — downloading if needed...")
try:
whisper_model = WhisperModel(model, device=device, compute_type=compute_type)
@@ -164,6 +361,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
files_transcripted = []
file_num = 0
for file in glob_file:
if stop_event and stop_event.is_set():
print("⛔ Transcription stopped by user.")
break
title = os.path.basename(file).split('.')[0]
file_num += 1
print(f"\n{'' * 46}")
@@ -180,10 +380,15 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
# Stream segments as they are decoded
segment_list = []
with open("{}/transcriptions/{}.txt".format(path, title), 'w', encoding='utf-8') as f:
txt_path = "{}/transcriptions/{}.txt".format(path, title)
srt_path = "{}/transcriptions/{}.srt".format(path, title)
with open(txt_path, 'w', encoding='utf-8') as f, \
open(srt_path, 'w', encoding='utf-8') as srt_f:
f.write(title)
f.write('\n' + '─' * 40 + '\n')
for seg in segments:
for idx, seg in enumerate(segments, start=1):
if stop_event and stop_event.is_set():
break
text = seg.text.strip()
if timestamps:
start_ts = str(datetime.timedelta(seconds=seg.start))
@@ -191,7 +396,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
else:
f.write('\n{}'.format(text))
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
f.flush()
srt_f.flush()
if verbose:
print(" [%.2fs → %.2fs] %s" % (seg.start, seg.end, seg.text))
else: