feat: add advanced transcription options for VAD, word-level timestamps, and translation
This commit is contained in:
19
app.py
19
app.py
@@ -219,6 +219,21 @@ class App:
|
|||||||
ts_frame, text="Include timestamps in transcription",
|
ts_frame, text="Include timestamps in transcription",
|
||||||
variable=self.timestamps_var, font=font_b)
|
variable=self.timestamps_var, font=font_b)
|
||||||
self.timestamps_switch.pack(side=tk.LEFT, padx=5)
|
self.timestamps_switch.pack(side=tk.LEFT, padx=5)
|
||||||
|
# Advanced options frame
|
||||||
|
adv_frame = customtkinter.CTkFrame(master)
|
||||||
|
adv_frame.pack(fill=tk.BOTH, padx=10, pady=10)
|
||||||
|
self.vad_var = tk.BooleanVar(value=False)
|
||||||
|
customtkinter.CTkSwitch(
|
||||||
|
adv_frame, text="VAD filter (remove silence)",
|
||||||
|
variable=self.vad_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||||
|
self.word_ts_var = tk.BooleanVar(value=False)
|
||||||
|
customtkinter.CTkSwitch(
|
||||||
|
adv_frame, text="Word-level timestamps",
|
||||||
|
variable=self.word_ts_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||||
|
self.translate_var = tk.BooleanVar(value=False)
|
||||||
|
customtkinter.CTkSwitch(
|
||||||
|
adv_frame, text="Translate to English",
|
||||||
|
variable=self.translate_var, font=font_b).pack(side=tk.LEFT, padx=5)
|
||||||
# Progress Bar
|
# Progress Bar
|
||||||
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
|
self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
|
||||||
# Worker process handle (replaces thread+stop_event for true immediate cancellation)
|
# Worker process handle (replaces thread+stop_event for true immediate cancellation)
|
||||||
@@ -320,6 +335,9 @@ class App:
|
|||||||
lang_label = self.language_combobox.get()
|
lang_label = self.language_combobox.get()
|
||||||
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
|
language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None
|
||||||
timestamps = self.timestamps_var.get()
|
timestamps = self.timestamps_var.get()
|
||||||
|
vad_filter = self.vad_var.get()
|
||||||
|
word_timestamps = self.word_ts_var.get()
|
||||||
|
translate = self.translate_var.get()
|
||||||
glob_file = get_path(path)
|
glob_file = get_path(path)
|
||||||
self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
|
self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
|
||||||
self.progress_bar.start()
|
self.progress_bar.start()
|
||||||
@@ -327,6 +345,7 @@ class App:
|
|||||||
self._proc = mp.Process(
|
self._proc = mp.Process(
|
||||||
target=_transcribe_worker_process,
|
target=_transcribe_worker_process,
|
||||||
args=(self._child_conn, path, glob_file, model, language, True, timestamps),
|
args=(self._child_conn, path, glob_file, model, language, True, timestamps),
|
||||||
|
kwargs={"vad_filter": vad_filter, "word_timestamps": word_timestamps, "translate": translate},
|
||||||
daemon=True,
|
daemon=True,
|
||||||
)
|
)
|
||||||
self._proc.start()
|
self._proc.start()
|
||||||
|
|||||||
@@ -152,7 +152,7 @@ def _decode_audio_pyav(file_path):
|
|||||||
return np.concatenate(chunks, axis=0), duration
|
return np.concatenate(chunks, axis=0), duration
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
|
def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose, vad_filter=False, word_timestamps=False, translate=False):
|
||||||
"""Transcribe a single file with mlx-whisper (Apple GPU/NPU).
|
"""Transcribe a single file with mlx-whisper (Apple GPU/NPU).
|
||||||
|
|
||||||
Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference.
|
Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference.
|
||||||
@@ -166,6 +166,10 @@ def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose):
|
|||||||
decode_opts = {}
|
decode_opts = {}
|
||||||
if language:
|
if language:
|
||||||
decode_opts["language"] = language
|
decode_opts["language"] = language
|
||||||
|
if translate:
|
||||||
|
decode_opts["task"] = "translate"
|
||||||
|
if word_timestamps:
|
||||||
|
decode_opts["word_timestamps"] = True
|
||||||
|
|
||||||
result = mlx_whisper.transcribe(
|
result = mlx_whisper.transcribe(
|
||||||
audio_array,
|
audio_array,
|
||||||
@@ -200,7 +204,7 @@ def get_path(path):
|
|||||||
return sorted(media_files)
|
return sorted(media_files)
|
||||||
|
|
||||||
# Main function
|
# Main function
|
||||||
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None):
|
def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None, vad_filter=False, word_timestamps=False, translate=False):
|
||||||
"""
|
"""
|
||||||
Transcribes audio files in a specified folder using faster-whisper (CTranslate2).
|
Transcribes audio files in a specified folder using faster-whisper (CTranslate2).
|
||||||
|
|
||||||
@@ -274,7 +278,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
|||||||
try:
|
try:
|
||||||
t_start = time.time()
|
t_start = time.time()
|
||||||
segments, audio_duration = _transcribe_mlx_file(
|
segments, audio_duration = _transcribe_mlx_file(
|
||||||
file, mlx_model_id, language, timestamps, verbose
|
file, mlx_model_id, language, timestamps, verbose,
|
||||||
|
vad_filter=vad_filter, word_timestamps=word_timestamps,
|
||||||
|
translate=translate
|
||||||
)
|
)
|
||||||
os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
|
os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
|
||||||
segment_list = []
|
segment_list = []
|
||||||
@@ -373,7 +379,10 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
|||||||
segments, info = whisper_model.transcribe(
|
segments, info = whisper_model.transcribe(
|
||||||
file,
|
file,
|
||||||
language=language,
|
language=language,
|
||||||
beam_size=5
|
beam_size=5,
|
||||||
|
task="translate" if translate else "transcribe",
|
||||||
|
vad_filter=vad_filter,
|
||||||
|
word_timestamps=word_timestamps,
|
||||||
)
|
)
|
||||||
audio_duration = info.duration # seconds
|
audio_duration = info.duration # seconds
|
||||||
# Make folder if missing
|
# Make folder if missing
|
||||||
@@ -396,7 +405,17 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
|||||||
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
|
f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text))
|
||||||
else:
|
else:
|
||||||
f.write('\n{}'.format(text))
|
f.write('\n{}'.format(text))
|
||||||
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
|
# Use word-level timestamps for SRT if available
|
||||||
|
if word_timestamps and hasattr(seg, 'words') and seg.words:
|
||||||
|
for w_idx, word in enumerate(seg.words, start=1):
|
||||||
|
w_text = word.word.strip()
|
||||||
|
if not w_text:
|
||||||
|
continue
|
||||||
|
w_start = _srt_timestamp(word.start)
|
||||||
|
w_end = _srt_timestamp(word.end)
|
||||||
|
srt_f.write(f'{idx}.{w_idx}\n{w_start} --> {w_end}\n{w_text}\n\n')
|
||||||
|
else:
|
||||||
|
srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n')
|
||||||
f.flush()
|
f.flush()
|
||||||
srt_f.flush()
|
srt_f.flush()
|
||||||
if verbose:
|
if verbose:
|
||||||
@@ -426,7 +445,7 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest
|
|||||||
return output_text
|
return output_text
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps):
|
def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps, vad_filter=False, word_timestamps=False, translate=False):
|
||||||
"""Child-process entry point for the UI's multiprocessing backend.
|
"""Child-process entry point for the UI's multiprocessing backend.
|
||||||
|
|
||||||
Redirects stdout/stderr → pipe connection so the main process can display
|
Redirects stdout/stderr → pipe connection so the main process can display
|
||||||
@@ -455,7 +474,9 @@ def _transcribe_worker_process(conn, path, glob_file, model, language, verbose,
|
|||||||
|
|
||||||
result = '⚠ No output produced.'
|
result = '⚠ No output produced.'
|
||||||
try:
|
try:
|
||||||
result = transcribe(path, glob_file, model, language, verbose, timestamps)
|
result = transcribe(path, glob_file, model, language, verbose, timestamps,
|
||||||
|
vad_filter=vad_filter, word_timestamps=word_timestamps,
|
||||||
|
translate=translate)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
result = f'⚠ Unexpected error: {exc}'
|
result = f'⚠ Unexpected error: {exc}'
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
Reference in New Issue
Block a user