diff --git a/app.py b/app.py index 6ce065c..ccb9b9a 100644 --- a/app.py +++ b/app.py @@ -219,6 +219,21 @@ class App: ts_frame, text="Include timestamps in transcription", variable=self.timestamps_var, font=font_b) self.timestamps_switch.pack(side=tk.LEFT, padx=5) + # Advanced options frame + adv_frame = customtkinter.CTkFrame(master) + adv_frame.pack(fill=tk.BOTH, padx=10, pady=10) + self.vad_var = tk.BooleanVar(value=False) + customtkinter.CTkSwitch( + adv_frame, text="VAD filter (remove silence)", + variable=self.vad_var, font=font_b).pack(side=tk.LEFT, padx=5) + self.word_ts_var = tk.BooleanVar(value=False) + customtkinter.CTkSwitch( + adv_frame, text="Word-level timestamps", + variable=self.word_ts_var, font=font_b).pack(side=tk.LEFT, padx=5) + self.translate_var = tk.BooleanVar(value=False) + customtkinter.CTkSwitch( + adv_frame, text="Translate to English", + variable=self.translate_var, font=font_b).pack(side=tk.LEFT, padx=5) # Progress Bar self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate') # Worker process handle (replaces thread+stop_event for true immediate cancellation) @@ -320,6 +335,9 @@ class App: lang_label = self.language_combobox.get() language = WHISPER_LANGUAGES.get(lang_label, lang_label) if lang_label else None timestamps = self.timestamps_var.get() + vad_filter = self.vad_var.get() + word_timestamps = self.word_ts_var.get() + translate = self.translate_var.get() glob_file = get_path(path) self.progress_bar.pack(fill=tk.X, padx=5, pady=5) self.progress_bar.start() @@ -327,6 +345,7 @@ class App: self._proc = mp.Process( target=_transcribe_worker_process, args=(self._child_conn, path, glob_file, model, language, True, timestamps), + kwargs={"vad_filter": vad_filter, "word_timestamps": word_timestamps, "translate": translate}, daemon=True, ) self._proc.start() diff --git a/src/_LocalTranscribe.py b/src/_LocalTranscribe.py index 366d224..9766bea 100644 --- a/src/_LocalTranscribe.py +++ b/src/_LocalTranscribe.py @@ -152,7 
+152,7 @@ def _decode_audio_pyav(file_path): return np.concatenate(chunks, axis=0), duration -def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose): +def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose, vad_filter=False, word_timestamps=False, translate=False): """Transcribe a single file with mlx-whisper (Apple GPU/NPU). Decodes audio via PyAV (no system ffmpeg needed), then runs MLX inference. @@ -166,6 +166,11 @@ def _transcribe_mlx_file(file, mlx_model_id, language, timestamps, verbose): decode_opts = {} if language: decode_opts["language"] = language + if translate: + decode_opts["task"] = "translate" + if word_timestamps: + decode_opts["word_timestamps"] = True + # NOTE(review): vad_filter is accepted for signature parity with the faster-whisper path but is not applied here — mlx_whisper.transcribe exposes no VAD option, so enabling the switch is a silent no-op on this path. result = mlx_whisper.transcribe( audio_array, @@ -200,7 +204,7 @@ def get_path(path): return sorted(media_files) # Main function -def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None): +def transcribe(path, glob_file, model=None, language=None, verbose=False, timestamps=True, stop_event=None, vad_filter=False, word_timestamps=False, translate=False): """ Transcribes audio files in a specified folder using faster-whisper (CTranslate2). 
@@ -274,7 +278,9 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest try: t_start = time.time() segments, audio_duration = _transcribe_mlx_file( - file, mlx_model_id, language, timestamps, verbose + file, mlx_model_id, language, timestamps, verbose, + vad_filter=vad_filter, word_timestamps=word_timestamps, + translate=translate ) os.makedirs('{}/transcriptions'.format(path), exist_ok=True) segment_list = [] @@ -373,7 +379,10 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest segments, info = whisper_model.transcribe( file, language=language, - beam_size=5 + beam_size=5, + task="translate" if translate else "transcribe", + vad_filter=vad_filter, + word_timestamps=word_timestamps, ) audio_duration = info.duration # seconds # Make folder if missing @@ -396,7 +405,19 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest f.write('\n[{} --> {}] {}'.format(start_ts, end_ts, text)) else: f.write('\n{}'.format(text)) - srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n') + # Use word-level timestamps for SRT if available + if word_timestamps and hasattr(seg, 'words') and seg.words: + for w_idx, word in enumerate(seg.words, start=1): + w_text = word.word.strip() + if not w_text: + continue + w_start = _srt_timestamp(word.start) + w_end = _srt_timestamp(word.end) + # SRT cue identifiers must be plain integers — a dotted ID like "1.1" is malformed and rejected by strict parsers. + # Derive a strictly increasing integer from the segment/word positions (players do not require consecutive numbering). + srt_f.write(f'{idx * 1000 + w_idx}\n{w_start} --> {w_end}\n{w_text}\n\n') + else: + srt_f.write(f'{idx}\n{_srt_timestamp(seg.start)} --> {_srt_timestamp(seg.end)}\n{text}\n\n') f.flush() srt_f.flush() if verbose: @@ -426,7 +445,7 @@ def transcribe(path, glob_file, model=None, language=None, verbose=False, timest return output_text -def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps): +def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, timestamps, vad_filter=False, word_timestamps=False, translate=False): """Child-process 
entry point for the UI's multiprocessing backend. Redirects stdout/stderr → pipe connection so the main process can display @@ -455,7 +474,9 @@ def _transcribe_worker_process(conn, path, glob_file, model, language, verbose, result = '⚠ No output produced.' try: - result = transcribe(path, glob_file, model, language, verbose, timestamps) + result = transcribe(path, glob_file, model, language, verbose, timestamps, + vad_filter=vad_filter, word_timestamps=word_timestamps, + translate=translate) except Exception as exc: result = f'⚠ Unexpected error: {exc}' finally: