feat: enhance transcription capabilities with MLX support and backend detection

This commit is contained in:
2026-04-04 00:32:36 +02:00
parent f7d621e510
commit e29572420e
3 changed files with 362 additions and 41 deletions

175
app.py
View File

@@ -4,7 +4,7 @@ import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from src._LocalTranscribe import transcribe, get_path
from src._LocalTranscribe import transcribe, get_path, detect_backend
import customtkinter
import threading
@@ -46,11 +46,93 @@ HF_MODEL_MAP = {
'KB Swedish (large)': 'KBLab/kb-whisper-large',
}
# Model facts rendered in the description label under the model selector.
# Each row is: display name, speed, download size, quality stars, suggested use.
# The mapping stores everything after the display name as a 4-tuple.
MODEL_INFO = {
    row[0]: row[1:]
    for row in (
        ('tiny', 'Very fast', '~75 MB', '★★☆☆☆', 'Quick drafts & testing'),
        ('tiny.en', 'Very fast', '~75 MB', '★★☆☆☆', 'Quick drafts & testing (English only)'),
        ('base', 'Fast', '~145 MB', '★★★☆☆', 'Notes & short podcasts'),
        ('base.en', 'Fast', '~145 MB', '★★★☆☆', 'Notes & short podcasts (English only)'),
        ('small', 'Balanced', '~485 MB', '★★★★☆', 'Everyday use'),
        ('small.en', 'Balanced', '~485 MB', '★★★★☆', 'Everyday use (English only)'),
        ('medium', 'Accurate', '~1.5 GB', '★★★★☆', 'Professional content'),
        ('medium.en', 'Accurate', '~1.5 GB', '★★★★☆', 'Professional content (English only)'),
        ('large-v2', 'Slow', '~3 GB', '★★★★★', 'Maximum accuracy'),
        ('large-v3', 'Slow', '~3 GB', '★★★★★', 'Maximum accuracy (recommended)'),
        ('KB Swedish (tiny)', 'Very fast', '~75 MB', '★★★☆☆', 'Swedish — optimised by KBLab'),
        ('KB Swedish (base)', 'Fast', '~145 MB', '★★★☆☆', 'Swedish — optimised by KBLab'),
        ('KB Swedish (small)', 'Balanced', '~485 MB', '★★★★☆', 'Swedish — optimised by KBLab'),
        ('KB Swedish (medium)', 'Accurate', '~1.5 GB', '★★★★☆', 'Swedish — optimised by KBLab'),
        ('KB Swedish (large)', 'Slow', '~3 GB', '★★★★★', 'Swedish — KBLab, best accuracy'),
    )
}
# Global CustomTkinter look, applied once at import time: appearance mode and colour theme.
customtkinter.set_appearance_mode("System")
customtkinter.set_default_color_theme("blue") # Themes: blue (default), dark-blue, green
# Module-level flag read through `global firstclick` by the language entry's first-focus handler.
# NOTE(review): that handler looks to be gone now that the language field is a combobox —
# confirm nothing else reads this flag, then remove it.
firstclick = True
# Languages offered in the language selector (display label → ISO code; None = auto-detect).
# Insertion order is the order shown in the combobox: 'Auto-detect' first, then alphabetical.
# Labels always end with the code in parentheses so the UI shows what is passed to the backend.
# NOTE(review): Cantonese (yue) is not listed — it appears to be known only to the
# large-v3 tokenizer, so offering it for every model could fail; confirm before adding.
WHISPER_LANGUAGES = {
    'Auto-detect': None,
    'Afrikaans (af)': 'af', 'Albanian (sq)': 'sq',
    'Amharic (am)': 'am', 'Arabic (ar)': 'ar',
    'Armenian (hy)': 'hy', 'Assamese (as)': 'as',
    'Azerbaijani (az)': 'az', 'Bashkir (ba)': 'ba',
    'Basque (eu)': 'eu', 'Belarusian (be)': 'be',
    'Bengali (bn)': 'bn', 'Bosnian (bs)': 'bs',
    'Breton (br)': 'br', 'Bulgarian (bg)': 'bg',
    'Catalan (ca)': 'ca', 'Chinese (zh)': 'zh',
    'Croatian (hr)': 'hr', 'Czech (cs)': 'cs',
    'Danish (da)': 'da', 'Dutch (nl)': 'nl',
    'English (en)': 'en', 'Estonian (et)': 'et',
    'Faroese (fo)': 'fo', 'Finnish (fi)': 'fi',
    'French (fr)': 'fr', 'Galician (gl)': 'gl',
    'Georgian (ka)': 'ka', 'German (de)': 'de',
    'Greek (el)': 'el', 'Gujarati (gu)': 'gu',
    'Haitian Creole (ht)': 'ht', 'Hausa (ha)': 'ha',
    'Hawaiian (haw)': 'haw', 'Hebrew (he)': 'he',
    'Hindi (hi)': 'hi', 'Hungarian (hu)': 'hu',
    'Icelandic (is)': 'is', 'Indonesian (id)': 'id',
    'Italian (it)': 'it', 'Japanese (ja)': 'ja',
    'Javanese (jw)': 'jw', 'Kannada (kn)': 'kn',
    'Kazakh (kk)': 'kk', 'Khmer (km)': 'km',
    'Korean (ko)': 'ko', 'Lao (lo)': 'lo',
    'Latin (la)': 'la', 'Latvian (lv)': 'lv',
    'Lingala (ln)': 'ln', 'Lithuanian (lt)': 'lt',
    'Luxembourgish (lb)': 'lb', 'Macedonian (mk)': 'mk',
    'Malagasy (mg)': 'mg', 'Malay (ms)': 'ms',
    'Malayalam (ml)': 'ml', 'Maltese (mt)': 'mt',
    'Maori (mi)': 'mi', 'Marathi (mr)': 'mr',
    'Mongolian (mn)': 'mn', 'Myanmar (my)': 'my',
    'Nepali (ne)': 'ne', 'Norwegian (no)': 'no',
    # Nynorsk has its own Whisper language code, separate from 'no'.
    'Nynorsk (nn)': 'nn',
    'Occitan (oc)': 'oc', 'Pashto (ps)': 'ps',
    'Persian (fa)': 'fa', 'Polish (pl)': 'pl',
    'Portuguese (pt)': 'pt', 'Punjabi (pa)': 'pa',
    'Romanian (ro)': 'ro', 'Russian (ru)': 'ru',
    'Sanskrit (sa)': 'sa', 'Serbian (sr)': 'sr',
    'Shona (sn)': 'sn', 'Sindhi (sd)': 'sd',
    'Sinhala (si)': 'si', 'Slovak (sk)': 'sk',
    'Slovenian (sl)': 'sl', 'Somali (so)': 'so',
    'Spanish (es)': 'es', 'Sundanese (su)': 'su',
    'Swahili (sw)': 'sw', 'Swedish (sv)': 'sv',
    'Tagalog (tl)': 'tl', 'Tajik (tg)': 'tg',
    'Tamil (ta)': 'ta', 'Tatar (tt)': 'tt',
    'Telugu (te)': 'te', 'Thai (th)': 'th',
    'Tibetan (bo)': 'bo', 'Turkish (tr)': 'tr',
    'Turkmen (tk)': 'tk', 'Ukrainian (uk)': 'uk',
    'Urdu (ur)': 'ur', 'Uzbek (uz)': 'uz',
    'Vietnamese (vi)': 'vi', 'Welsh (cy)': 'cy',
    'Yiddish (yi)': 'yi', 'Yoruba (yo)': 'yo',
}
def _language_options_for_model(model_name):
    """Work out how the language combobox should look for ``model_name``.

    Returns a ``(values, default, state)`` triple: the list of selectable
    labels, the label to preselect, and the widget state. Names ending in
    ``.en`` and names starting with ``KB Swedish`` are pinned to a single
    language with the widget disabled; every other model gets the full
    language list, preselected to auto-detect, in read-only mode.
    """
    # The '.en' suffix is checked first, matching the original precedence.
    pinned_label = None
    if model_name.endswith('.en'):
        pinned_label = 'English (en)'
    elif model_name.startswith('KB Swedish'):
        pinned_label = 'Swedish (sv)'

    if pinned_label is not None:
        return [pinned_label], pinned_label, 'disabled'

    return list(WHISPER_LANGUAGES), 'Auto-detect', 'readonly'
def _set_app_icon(root):
@@ -94,22 +176,16 @@ class App:
self.path_entry.insert(0, os.path.join(os.getcwd(), 'sample_audio'))
self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
customtkinter.CTkButton(path_frame, text="Browse", command=self.browse, font=font).pack(side=tk.LEFT, padx=5)
# Language frame
#thanks to pommicket from Stackoverflow for this fix
def on_entry_click(event):
"""function that gets called whenever entry is clicked"""
global firstclick
if firstclick: # if this is the first time they clicked it
firstclick = False
self.language_entry.delete(0, "end") # delete all the text in the entry
# Language frame
language_frame = customtkinter.CTkFrame(master)
language_frame.pack(fill=tk.BOTH, padx=10, pady=10)
customtkinter.CTkLabel(language_frame, text="Language:", font=font).pack(side=tk.LEFT, padx=5)
self.language_entry = customtkinter.CTkEntry(language_frame, width=50, font=('Roboto', 12, 'italic'))
self.default_language_text = "Enter language (or ignore to auto-detect)"
self.language_entry.insert(0, self.default_language_text)
self.language_entry.bind('<FocusIn>', on_entry_click)
self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
_lang_values, _lang_default, _lang_state = _language_options_for_model('medium')
self.language_combobox = customtkinter.CTkComboBox(
language_frame, width=50, state=_lang_state,
values=_lang_values, font=font_b)
self.language_combobox.set(_lang_default)
self.language_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)
# Model frame
models = ['tiny', 'tiny.en', 'base', 'base.en',
'small', 'small.en', 'medium', 'medium.en',
@@ -124,9 +200,16 @@ class App:
# ComboBox frame
self.model_combobox = customtkinter.CTkComboBox(
model_frame, width=50, state="readonly",
values=models, font=font_b)
values=models, font=font_b,
command=self._on_model_change)
self.model_combobox.set('medium') # Set the default value
self.model_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)
# Model description label
self.model_desc_label = customtkinter.CTkLabel(
master, text=self._model_desc_text('medium'),
font=('Roboto', 11), text_color=('#555555', '#aaaaaa'),
anchor='w')
self.model_desc_label.pack(fill=tk.X, padx=14, pady=(0, 4))
# Timestamps toggle
ts_frame = customtkinter.CTkFrame(master)
ts_frame.pack(fill=tk.BOTH, padx=10, pady=10)
@@ -137,11 +220,17 @@ class App:
self.timestamps_switch.pack(side=tk.LEFT, padx=5)
# Progress Bar
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
# Stop event for cancellation
self._stop_event = threading.Event()
# Button actions frame
button_frame = customtkinter.CTkFrame(master)
button_frame.pack(fill=tk.BOTH, padx=10, pady=10)
self.transcribe_button = customtkinter.CTkButton(button_frame, text="Transcribe", command=self.start_transcription, font=font)
self.transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
self.stop_button = customtkinter.CTkButton(
button_frame, text="Stop", command=self._stop_transcription, font=font,
fg_color="#c0392b", hover_color="#922b21", state=tk.DISABLED)
self.stop_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
customtkinter.CTkButton(button_frame, text="Quit", command=master.quit, font=font).pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)
# ── Embedded console / log panel ──────────────────────────────────
@@ -156,11 +245,40 @@ class App:
sys.stdout = _ConsoleRedirector(self.log_box)
sys.stderr = _ConsoleRedirector(self.log_box)
# Backend indicator
_bi = detect_backend()
backend_label = customtkinter.CTkLabel(
master,
text=f"Backend: {_bi['label']}",
font=('Roboto', 11),
text_color=("#555555", "#aaaaaa"),
anchor='e',
)
backend_label.pack(fill=tk.X, padx=12, pady=(0, 2))
# Welcome message (shown after redirect so it appears in the panel)
print("Welcome to Local Transcribe with Whisper! \U0001f600")
print("Transcriptions will be saved automatically.")
print("" * 46)
# Helper functions
def _stop_transcription(self):
    """Handle a click on the Stop button.

    Sets the shared stop event, greys out the Stop button so the request
    cannot be issued twice, and prints a notice for the user.
    """
    stop_flag = self._stop_event
    stop_flag.set()
    # One request is enough; the button stays disabled from here on.
    self.stop_button.configure(state=tk.DISABLED)
    notice = "⛔ Stop requested — finishing current file…"
    print(notice)
def _model_desc_text(self, model_name):
    """Build the one-line description shown for ``model_name``.

    Looks the model up in ``MODEL_INFO`` and formats its stars, speed, size
    and suggested use into a single string. Returns an empty string when
    the model has no entry.
    """
    entry = MODEL_INFO.get(model_name)
    if entry:
        speed, size, stars, use = entry
        return f'{stars} {speed} · {size} · {use}'
    return ''
def _on_model_change(self, selected):
    """React to a new selection in the model combobox.

    Updates the model description label, then rebuilds the language
    combobox (choices, preselected label and widget state) from
    ``_language_options_for_model(selected)``.

    Args:
        selected: Display name of the model that was just chosen.
    """
    self.model_desc_label.configure(text=self._model_desc_text(selected))
    values, default, state = _language_options_for_model(selected)
    # Write the new value while the widget still accepts it. A disabled Tk
    # entry ignores insert/delete, so disabling first and calling set()
    # afterwards can leave the previous language on screen, and get() would
    # then return that stale label (e.g. 'Auto-detect' instead of the
    # forced 'Swedish (sv)').
    self.language_combobox.configure(values=values, state='readonly')
    self.language_combobox.set(default)
    # Apply the final state last (e.g. 'disabled' for single-language models).
    if state != 'readonly':
        self.language_combobox.configure(state=state)
# Browsing
def browse(self):
initial_dir = os.getcwd()
@@ -169,10 +287,10 @@ class App:
self.path_entry.insert(0, folder_path)
# Start transcription
def start_transcription(self):
# Disable transcribe button
self._stop_event.clear()
self.transcribe_button.configure(state=tk.DISABLED)
# Start a new thread for the transcription process
threading.Thread(target=self.transcribe_thread).start()
self.stop_button.configure(state=tk.NORMAL)
threading.Thread(target=self.transcribe_thread, daemon=True).start()
# Threading
def transcribe_thread(self):
path = self.path_entry.get()
@@ -183,14 +301,8 @@ class App:
self.transcribe_button.configure(state=tk.NORMAL)
return
model = HF_MODEL_MAP.get(model_display, model_display)
language = self.language_entry.get()
# Auto-set Swedish for KB models
is_kb_model = model_display.startswith('KB Swedish')
# Check if the language field has the default text or is empty
if is_kb_model:
language = 'sv'
elif language == self.default_language_text or not language.strip():
language = None # This is the same as passing nothing
lang_label = self.language_combobox.get()
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
verbose = True # always show transcription progress in the console panel
timestamps = self.timestamps_var.get()
# Show progress bar
@@ -201,16 +313,17 @@ class App:
#messagebox.showinfo("Message", "Starting transcription!")
# Start transcription
try:
output_text = transcribe(path, glob_file, model, language, verbose, timestamps)
output_text = transcribe(path, glob_file, model, language, verbose, timestamps, stop_event=self._stop_event)
except UnboundLocalError:
messagebox.showinfo("Files not found error!", 'Nothing found, choose another folder.')
pass
except ValueError:
messagebox.showinfo("Invalid language name, you might have to clear the default text to continue!")
except ValueError as e:
messagebox.showinfo("Error", str(e))
# Hide progress bar
self.progress_bar.stop()
self.progress_bar.pack_forget()
# Enable transcribe button
# Restore buttons
self.stop_button.configure(state=tk.DISABLED)
self.transcribe_button.configure(state=tk.NORMAL)
# Recover output text
try: