feat: add advanced transcription options for VAD, word-level timestamps, and translation
This commit is contained in:
19
app.py
19
app.py
@@ -219,6 +219,21 @@ class App:
|
||||
ts_frame, text="Include timestamps in transcription",
|
||||
variable=self.timestamps_var, font=font_b)
|
||||
self.timestamps_switch.pack(side=tk.LEFT, padx=5)
|
||||
# Advanced options frame
|
||||
adv_frame = customtkinter.CTkFrame(master)
|
||||
adv_frame.pack(fill=tk.BOTH, padx=10, pady=10)
|
||||
self.vad_var = tk.BooleanVar(value=False)
|
||||
customtkinter.CTkSwitch(
|
||||
adv_frame, text="VAD filter (remove silence)",
|
||||
variable=self.vad_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||
self.word_ts_var = tk.BooleanVar(value=False)
|
||||
customtkinter.CTkSwitch(
|
||||
adv_frame, text="Word-level timestamps",
|
||||
variable=self.word_ts_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||
self.translate_var = tk.BooleanVar(value=False)
|
||||
customtkinter.CTkSwitch(
|
||||
adv_frame, text="Translate to English",
|
||||
variable=self.translate_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||
# Progress Bar
|
||||
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
|
||||
# Worker process handle (replaces thread+stop_event for true immediate cancellation)
|
||||
@@ -320,6 +335,9 @@ class App:
|
||||
lang_label = self.language_combobox.get()
|
||||
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
|
||||
timestamps = self.timestamps_var.get()
|
||||
vad_filter = self.vad_var.get()
|
||||
word_timestamps = self.word_ts_var.get()
|
||||
translate = self.translate_var.get()
|
||||
glob_file = get_path(path)
|
||||
self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
|
||||
self.progress_bar.start()
|
||||
@@ -327,6 +345,7 @@ class App:
|
||||
self._proc = mp.Process(
|
||||
target=_transcribe_worker_process,
|
||||
args=(self._child_conn, path, glob_file, model, language, True, timestamps),
|
||||
kwargs={"vad_filter": vad_filter, "word_timestamps": word_timestamps, "translate": translate},
|
||||
daemon=True,
|
||||
)
|
||||
self._proc.start()
|
||||
|
||||
@@ -152,7 +152,7 @@ def _decode_audio_pyav(file_path):
|
||||
return np.concatenate(chunks, axis=0), duration
|
||||
|
||||
|
||||
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
|
||||
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose, vad_filter=False, word_timestamps=False, translate=False):
|
||||
"""Transcribe a single file with mlx-whisper (Apple GPU/NPU).
|
||||
|
||||
Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference.
|
||||
@@ -166,6 +166,10 @@ def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
|
||||
decode_opts = {}
|
||||
if language:
|
||||
decode_opts["language"] = language
|
||||
if translate:
|
||||
decode_opts["task"] = "translate"
|
||||
if word_timestamps:
|
||||
decode_opts["word_timestamps"] = True
|
||||
|
||||
result = mlx_whisper.transcribe(
|
||||
audio_array,
|
||||
@@ -200,7 +204,7 @@ def get_path(path):
|
||||
return sorted(media_files)
|
||||
|
||||
# Main function
|
||||
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None):
|
||||
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None, vad_filter=False, word_timestamps=False, translate=False):
|
||||
"""
|
||||
Transcribes audio files in a specified folder using faster-whisper (CTranslate2).
|
||||
|
||||
@@ -274,7 +278,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
||||
try:
|
||||
t_start = time.time()
|
||||
segments, audio_duration = _transcribe_mlx_file(
|
||||
file, mlx_model_id, language, timestamps, verbose
|
||||
file, mlx_model_id, language, timestamps, verbose,
|
||||
vad_filter=vad_filter, word_timestamps=word_timestamps,
|
||||
translate=translate
|
||||
)
|
||||
os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
|
||||
segment_list = []
|
||||
@@ -373,7 +379,10 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
||||
segments, info = whisper_model.transcribe(
|
||||
file,
|
||||
language=language,
|
||||
beam_size=5
|
||||
beam_size=5,
|
||||
task="translate" if translate else "transcribe",
|
||||
vad_filter=vad_filter,
|
||||
word_timestamps=word_timestamps,
|
||||
)
|
||||
audio_duration = info.duration # seconds
|
||||
# Make folder if missing
|
||||
@@ -396,7 +405,17 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
||||
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
|
||||
else:
|
||||
f.write('\n{}'.format(text))
|
||||
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
|
||||
# Use word-level timestamps for SRT if available
|
||||
if word_timestamps and hasattr(seg, 'words') and seg.words:
|
||||
for w_idx, word in enumerate(seg.words, start=1):
|
||||
w_text = word.word.strip()
|
||||
if not w_text:
|
||||
continue
|
||||
w_start = _srt_timestamp(word.start)
|
||||
w_end = _srt_timestamp(word.end)
|
||||
srt_f.write(f'{idx}.{w_idx}\n{w_start} --> {w_end}\n{w_text}\n\n')
|
||||
else:
|
||||
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
|
||||
f.flush()
|
||||
srt_f.flush()
|
||||
if verbose:
|
||||
@@ -426,7 +445,7 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
||||
return output_text
|
||||
|
||||
|
||||
def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps):
|
||||
def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps, vad_filter=False, word_timestamps=False, translate=False):
|
||||
"""Child-process entry point for the UI's multiprocessing backend.
|
||||
|
||||
Redirects stdout/stderr → pipe connection so the main process can display
|
||||
@@ -455,7 +474,9 @@ def _transcribe_worker_process(conn, path, glob_file, model, language, verbose,
|
||||
|
||||
result = '⚠ No output produced.'
|
||||
try:
|
||||
result = transcribe(path, glob_file, model, language, verbose, timestamps)
|
||||
result = transcribe(path, glob_file, model, language, verbose, timestamps,
|
||||
vad_filter=vad_filter, word_timestamps=word_timestamps,
|
||||
translate=translate)
|
||||
except Exception as exc:
|
||||
result = f'⚠ Unexpected error: {exc}'
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user