feat: add advanced transcription options for VAD, word-level timestamps, and translation

This commit is contained in:
2026-04-11 14:06:04 +02:00
parent 8d5c8d6097
commit 0293a13177
2 changed files with 47 additions and 7 deletions

19
app.py
View File

@@ -219,6 +219,21 @@ class App:
ts_frame, text="Include timestamps in transcription", ts_frame, text="Include timestamps in transcription",
variable=self.timestamps_var, font=font_b) variable=self.timestamps_var, font=font_b)
self.timestamps_switch.pack(side=tk.LEFT, padx=5) self.timestamps_switch.pack(side=tk.LEFT, padx=5)
# Advanced options frame: a row of three independent on/off switches packed
# left-to-right. Each switch is bound to its own BooleanVar, which starts out
# False (option disabled). The switch widgets themselves are not stored; only
# the variables are kept on the instance.
adv_frame = customtkinter.CTkFrame(master)
adv_frame.pack(fill=tk.BOTH, padx=10, pady=10)
# Toggle for the VAD filter option.
self.vad_var = tk.BooleanVar(value=False)
customtkinter.CTkSwitch(
adv_frame, text="VAD filter (remove silence)",
variable=self.vad_var, font=font_b).pack(side=tk.LEFT, padx=5)
# Toggle for word-level timestamps.
self.word_ts_var = tk.BooleanVar(value=False)
customtkinter.CTkSwitch(
adv_frame, text="Word-level timestamps",
variable=self.word_ts_var, font=font_b).pack(side=tk.LEFT, padx=5)
# Toggle for translating the output to English.
self.translate_var = tk.BooleanVar(value=False)
customtkinter.CTkSwitch(
adv_frame, text="Translate to English",
variable=self.translate_var, font=font_b).pack(side=tk.LEFT, padx=5)
# Progress Bar # Progress Bar
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate') self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
# Worker process handle (replaces thread+stop_event for true immediate cancellation) # Worker process handle (replaces thread+stop_event for true immediate cancellation)
@@ -320,6 +335,9 @@ class App:
lang_label = self.language_combobox.get() lang_label = self.language_combobox.get()
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
timestamps = self.timestamps_var.get() timestamps = self.timestamps_var.get()
# Read the current state of the three advanced-option switches as plain
# values; they are handed to the worker process as keyword arguments below.
vad_filter = self.vad_var.get()
word_timestamps = self.word_ts_var.get()
translate = self.translate_var.get()
glob_file = get_path(path) glob_file = get_path(path)
self.progress_bar.pack(fill=tk.X, padx=5, pady=5) self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
self.progress_bar.start() self.progress_bar.start()
@@ -327,6 +345,7 @@ class App:
self._proc = mp.Process( self._proc = mp.Process(
target=_transcribe_worker_process, target=_transcribe_worker_process,
args=(self._child_conn, path, glob_file, model, language, True, timestamps), args=(self._child_conn, path, glob_file, model, language, True, timestamps),
kwargs={"vad_filter": vad_filter, "word_timestamps": word_timestamps, "translate": translate},
daemon=True, daemon=True,
) )
self._proc.start() self._proc.start()

View File

@@ -152,7 +152,7 @@ def _decode_audio_pyav(file_path):
return np.concatenate(chunks, axis=0), duration return np.concatenate(chunks, axis=0), duration
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose): def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose, vad_filter=False, word_timestamps=False, translate=False):
"""Transcribe a single file with mlx-whisper (Apple GPU/NPU). """Transcribe a single file with mlx-whisper (Apple GPU/NPU).
Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference. Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference.
@@ -166,6 +166,10 @@ def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
decode_opts = {} decode_opts = {}
if language: if language:
decode_opts["language"] = language decode_opts["language"] = language
# Select the "translate" task only when requested; otherwise the "task" key
# is left unset in decode_opts.
if translate:
decode_opts["task"] = "translate"
# Request per-word timing only when asked; the key is left unset otherwise.
if word_timestamps:
decode_opts["word_timestamps"] = True
# NOTE(review): vad_filter is accepted by this function's signature but is not
# consulted in this hunk — confirm whether the MLX path applies it elsewhere
# or silently ignores the option.
# NOTE(review): decode_opts is presumably forwarded to mlx_whisper.transcribe
# in the call that follows — confirm against the full function body.
result = mlx_whisper.transcribe( result = mlx_whisper.transcribe(
audio_array, audio_array,
@@ -200,7 +204,7 @@ def get_path(path):
return sorted(media_files) return sorted(media_files)
# Main function # Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None): def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None, vad_filter=False, word_timestamps=False, translate=False):
""" """
Transcribes audio files in a specified folder using faster-whisper (CTranslate2). Transcribes audio files in a specified folder using faster-whisper (CTranslate2).
@@ -274,7 +278,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
try: try:
t_start = time.time() t_start = time.time()
segments, audio_duration = _transcribe_mlx_file( segments, audio_duration = _transcribe_mlx_file(
file, mlx_model_id, language, timestamps, verbose file, mlx_model_id, language, timestamps, verbose,
vad_filter=vad_filter, word_timestamps=word_timestamps,
translate=translate
) )
os.makedirs('{}/transcriptions'.format(path), exist_ok=True) os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
segment_list = [] segment_list = []
@@ -373,7 +379,10 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
segments, info = whisper_model.transcribe( segments, info = whisper_model.transcribe(
file, file,
language=language, language=language,
beam_size=5 beam_size=5,
task="translate" if translate else "transcribe",
vad_filter=vad_filter,
word_timestamps=word_timestamps,
) )
audio_duration = info.duration # seconds audio_duration = info.duration # seconds
# Make folder if missing # Make folder if missing
@@ -396,7 +405,17 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text)) f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
else: else:
f.write('\n{}'.format(text)) f.write('\n{}'.format(text))
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n') # Use word-level timestamps for SRT if available
# When word-level timestamps were requested and this segment carries a
# non-empty `words` collection, emit one SRT cue per word; otherwise fall back
# to a single cue covering the whole segment.
if word_timestamps and hasattr(seg, 'words') and seg.words:
for w_idx, word in enumerate(seg.words, start=1):
w_text = word.word.strip()
# Skip entries that are empty after stripping whitespace. w_idx is still
# advanced by enumerate, so the per-word numbering can contain gaps.
if not w_text:
continue
w_start = _srt_timestamp(word.start)
w_end = _srt_timestamp(word.end)
# NOTE(review): the cue identifier is written as "<segment>.<word>"
# (e.g. "3.2"). SRT cue numbers are conventionally plain sequential
# integers — confirm that the intended players/parsers accept this form.
srt_f.write(f'{idx}.{w_idx}\n{w_start} --> {w_end}\n{w_text}\n\n')
else:
# Segment-level cue: one entry numbered idx spanning seg.start..seg.end.
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
f.flush() f.flush()
srt_f.flush() srt_f.flush()
if verbose: if verbose:
@@ -426,7 +445,7 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
return output_text return output_text
def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps): def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps, vad_filter=False, word_timestamps=False, translate=False):
"""Child-process entry point for the UI's multiprocessing backend. """Child-process entry point for the UI's multiprocessing backend.
Redirects stdout/stderr → pipe connection so the main process can display Redirects stdout/stderr → pipe connection so the main process can display
@@ -455,7 +474,9 @@ def _transcribe_worker_process(conn, path, glob_file, model, language, verbose,
result = '⚠ No output produced.' result = '⚠ No output produced.'
try: try:
result = transcribe(path, glob_file, model, language, verbose, timestamps) result = transcribe(path, glob_file, model, language, verbose, timestamps,
vad_filter=vad_filter, word_timestamps=word_timestamps,
translate=translate)
except Exception as exc: except Exception as exc:
result = f'⚠ Unexpected error: {exc}' result = f'⚠ Unexpected error: {exc}'
finally: finally: