Compare commits

33 Commits

Author SHA1 Message Date
Kristofer Söderström f8cf42733d Revamp: embedded console, faster-whisper, simplified install 2026-03-02 17:02:16 +01:00
Kristofer Rolf Söderström 7d3fe1ba26 Merge pull request #11 from soderstromkr/copilot/update-whisper-device-parameter
Pass explicit device parameter to whisper.load_model() for MPS acceleration
2026-01-22 14:03:13 +01:00
copilot-swe-agent[bot] da42a6e4cc Add .gitignore and remove __pycache__ files
Co-authored-by: soderstromkr <23003509+soderstromkr@users.noreply.github.com>
2026-01-22 13:00:38 +00:00
copilot-swe-agent[bot] 0dab0d9bea Add explicit device parameter to whisper.load_model()
Co-authored-by: soderstromkr <23003509+soderstromkr@users.noreply.github.com>
2026-01-22 13:00:21 +00:00
copilot-swe-agent[bot] 953c71ab28 Initial plan 2026-01-22 12:57:09 +00:00
Kristofer Rolf Söderström 5522bdd575 Merge pull request #6
Merged pull request #6
2026-01-22 13:53:23 +01:00
Kristofer Rolf Söderström 861c470330 Merge pull request #10 from soderstromkr/copilot/add-readme-gpu-support
Add GPU support documentation to README
2026-01-22 13:44:11 +01:00
copilot-swe-agent[bot] 6de6d4b2ff Add GPU support section to README with CUDA PyTorch installation instructions
Co-authored-by: soderstromkr <23003509+soderstromkr@users.noreply.github.com>
2026-01-22 12:42:09 +00:00
copilot-swe-agent[bot] 01552cc7cb Initial plan 2026-01-22 12:40:19 +00:00
Yaroslav P 049a168c81 amd graphic card support 2025-03-05 16:23:10 +02:00
Kristofer Rolf Söderström 56a925463f Update README.md 2024-05-17 08:51:16 +02:00
Kristofer Rolf Söderström fe60b04020 Update README.md 2024-05-17 08:49:28 +02:00
Kristofer Rolf Söderström ff06a257f2 Update README.md 2024-05-17 08:47:57 +02:00
Kristofer Rolf Söderström 5e31129ea2 Create requirements.txt 2024-05-17 08:44:39 +02:00
Kristofer Rolf Söderström 3f0bca02b7 Update README.md 2024-05-17 08:44:09 +02:00
Kristofer Rolf Söderström 488e78a5ae Update README.md 2024-05-17 08:42:42 +02:00
Kristofer Rolf Söderström 829a054300 Update README.md 2024-05-17 08:40:42 +02:00
Kristofer Rolf Söderström 462aae12ca Update README.md 2024-05-17 08:09:30 +02:00
Kristofer Rolf Söderström fec9190ba1 Update README.md 2024-05-17 08:08:51 +02:00
Kristofer Rolf Söderström 0dde25204d Update README.md
removed other installation options from readme
2024-05-17 08:07:00 +02:00
Kristofer Söderström b611aa6b8c removed messagebox 2023-11-06 10:13:04 +01:00
Kristofer Söderström 7d50d5f4cf QOL improvements 2023-11-06 09:57:44 +01:00
Kristofer Söderström 7799d03960 bug fixes 2023-11-06 09:31:53 +01:00
Kristofer Rolf Söderström f88186dacc Update app.py 2023-10-19 09:26:43 +02:00
Kristofer Rolf Söderström 3f5c1491ac Delete build.zip 2023-10-19 09:20:55 +02:00
Kristofer Rolf Söderström c83e15bdba Update README.md 2023-10-19 09:20:29 +02:00
Kristofer Rolf Söderström ff16ad30e1 Merge pull request #2 from ValentinFunk/patch-1
Fix mac instructions link
2023-10-19 09:09:01 +02:00
Valentin 622165b3e6 Update Mac_instructions.md 2023-09-08 10:11:02 +02:00
Valentin 0e9cbdca58 Fix mac instructions link 2023-09-08 10:09:15 +02:00
Kristofer Söderström 87cb509b14 added windows exe in as zip 2023-06-30 17:26:24 +02:00
Kristofer Söderström ba935cafb7 formatting 2023-06-30 16:32:37 +02:00
Kristofer Söderström 6497508b7a fix formatting 2023-06-30 16:23:07 +02:00
Kristofer Söderström d96333a5a7 Complete rework for GUI, experimental EXE file and other minor changes, see readme for more info 2023-06-30 16:11:59 +02:00
24 changed files with 739 additions and 515 deletions
+1
View File
@@ -0,0 +1 @@
*.zip filter=lfs diff=lfs merge=lfs -text
+26
View File
@@ -0,0 +1,26 @@
# Python cache
__pycache__/
*.py[cod]
*$py.class
# Virtual environments
venv/
env/
ENV/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Build artifacts
dist/
build/
*.egg-info/
+3 -3
View File
@@ -4,8 +4,8 @@ authors:
- family-names: "Söderström"
given-names: "Kristofer Rolf"
orcid: "https://orcid.org/0000-0002-5322-3350"
title: "transcribe"
version: 1.1.1
doi: 10.5281/zenodo.7760511
title: "Local Transcribe"
version: 1.2
doi: 10.5281/zenodo.7760510
date-released: 2023-03-22
url: "https://github.com/soderstromkr/transcribe"
-100
View File
@@ -1,100 +0,0 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from transcribe import transcribe
from ttkthemes import ThemedTk
import whisper
import numpy as np
import glob, os
class App:
    """Main window of the legacy Local Transcribe GUI.

    Builds a themed ttk form (folder path, file type filter, model name,
    verbose flag) and wires two buttons: one that runs the blocking
    transcription and one that quits the application.
    """

    def __init__(self, master):
        # master: the root Tk/ThemedTk window that hosts all frames.
        self.master = master
        master.title("Local Transcribe")
        # Shared ttk style options so every widget uses the same font/padding.
        style = ttk.Style()
        style.configure('TLabel', font=('Arial', 10), padding=10)
        style.configure('TEntry', font=('Arial', 10), padding=10)
        style.configure('TButton', font=('Arial', 10), padding=10)
        style.configure('TCheckbutton', font=('Arial', 10), padding=10)
        # Folder Path row: label + entry (pre-filled with the sample folder) + Browse button.
        path_frame = ttk.Frame(master, padding=10)
        path_frame.pack(fill=tk.BOTH)
        path_label = ttk.Label(path_frame, text="Folder Path:")
        path_label.pack(side=tk.LEFT, padx=5)
        self.path_entry = ttk.Entry(path_frame, width=50)
        self.path_entry.insert(10, 'sample_audio/')
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        browse_button = ttk.Button(path_frame, text="Browse", command=self.browse)
        browse_button.pack(side=tk.LEFT, padx=5)
        # File Type row: extension filter (e.g. 'ogg') used when scanning the folder.
        file_type_frame = ttk.Frame(master, padding=10)
        file_type_frame.pack(fill=tk.BOTH)
        file_type_label = ttk.Label(file_type_frame, text="File Type:")
        file_type_label.pack(side=tk.LEFT, padx=5)
        self.file_type_entry = ttk.Entry(file_type_frame, width=50)
        self.file_type_entry.insert(10, 'ogg')
        self.file_type_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        # Model row: Whisper model name (defaults to 'small').
        model_frame = ttk.Frame(master, padding=10)
        model_frame.pack(fill=tk.BOTH)
        model_label = ttk.Label(model_frame, text="Model:")
        model_label.pack(side=tk.LEFT, padx=5)
        self.model_entry = ttk.Entry(model_frame, width=50)
        self.model_entry.insert(10, 'small')
        self.model_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        # Language (currently disabled)
        #language_frame = ttk.Frame(master, padding=10)
        #language_frame.pack(fill=tk.BOTH)
        #language_label = ttk.Label(language_frame, text="Language:")
        #language_label.pack(side=tk.LEFT, padx=5)
        #self.language_entry = ttk.Entry(language_frame, width=50)
        #self.language_entry.insert(10, np.nan)
        #self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        # Verbose row: checkbox toggling detailed output during transcription.
        verbose_frame = ttk.Frame(master, padding=10)
        verbose_frame.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        verbose_checkbutton = ttk.Checkbutton(verbose_frame, text="Verbose", variable=self.verbose_var)
        verbose_checkbutton.pack(side=tk.LEFT, padx=5)
        # Buttons row: start transcription / quit the app.
        button_frame = ttk.Frame(master, padding=10)
        button_frame.pack(fill=tk.BOTH)
        transcribe_button = ttk.Button(button_frame, text="Transcribe Audio", command=self.transcribe)
        transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        quit_button = ttk.Button(button_frame, text="Quit", command=master.quit)
        quit_button.pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    def browse(self):
        """Open a directory chooser and copy the selection into the path entry."""
        folder_path = filedialog.askdirectory()
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def transcribe(self):
        """Read the form fields, run the (blocking) transcription, and show the result."""
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        #language = self.language_entry.get()
        language = None  # set to auto-detect
        verbose = self.verbose_var.get()
        # Call the transcribe function with the appropriate arguments
        result = transcribe(path, file_type, model=model, language=language, verbose=verbose)
        # Show the result in a message box
        tk.messagebox.showinfo("Finished!", result)
if __name__ == "__main__":
    # Launch the themed main window and hand control to the Tk event loop.
    main_window = ThemedTk(theme="clearlooks")
    gui = App(main_window)
    main_window.mainloop()
+31
View File
@@ -0,0 +1,31 @@
### How to run on Mac / Linux
#### Quick start
1. Open Terminal and navigate to the project folder (or right-click the folder and select "Open in Terminal").
2. Make the script executable (only needed once):
```
chmod +x run_Mac.sh
```
3. Run it:
```
./run_Mac.sh
```
This will automatically:
- Create a virtual environment (`.venv`)
- Install all dependencies (no admin rights needed)
- Launch the app
#### Manual steps (alternative)
If you prefer to do it manually:
```
python3 -m venv .venv
.venv/bin/python install.py
.venv/bin/python app.py
```
#### Notes
- **Python 3.10+** is required. macOS users can install it from [python.org](https://www.python.org/downloads/) or via `brew install python`.
- **No FFmpeg install needed** — audio decoding is bundled.
- **GPU acceleration** is not available on macOS (Apple Silicon MPS is not supported by CTranslate2). CPU with int8 quantization is still fast.
- On Apple Silicon (M1/M2/M3/M4), the `small` or `base` models run well. `medium` works but is slower.
-5
View File
@@ -1,5 +0,0 @@
### How to run on Mac
Unfortunately, I have not found a permanent solution for this; not being a Mac user has limited the ways I can test it. For now, these are the recommended steps for a beginner user:
1. Open a terminal and navigate to the root folder (transcribe-main if you downloaded the folder). You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
2. Run the following command:
python GUI.py
+69 -51
View File
@@ -1,72 +1,90 @@
## Local Transcribe
## Local Transcribe with Whisper
Local Transcribe uses OpenAI's Whisper to transcribe audio files from your local folders, creating text files on disk.
> **⚠ Note for Mac users (Apple Silicon):** This version uses `faster-whisper` (CTranslate2), which does **not** support Apple M-chip GPU acceleration. Transcription will run on CPU, which is slower than OpenAI's Whisper with Metal/CoreML support. The trade-off is a much simpler installation — no conda, no PyTorch, no admin rights. If you'd prefer M-chip GPU acceleration and don't mind a more involved setup, switch to the **classic** release:
> ```
> git checkout classic
> ```
## Note
Local Transcribe with Whisper is a user-friendly desktop application that allows you to transcribe audio and video files using the Whisper ASR system, powered by [faster-whisper](https://github.com/SYSTRAN/faster-whisper) (CTranslate2). This application provides a graphical user interface (GUI) built with Python and the Tkinter library, making it easy to use even for those not familiar with programming.
This implementation and guide are mostly made for researchers who are not familiar with programming and want a way to transcribe their files locally, without an internet connection — a requirement commonly found in ethical data practices and frameworks. Two examples are shown: a normal workflow with an internet connection, and one in which the model is loaded first, via openai-whisper, so that transcription can then be done without being connected to the internet. There is now also a GUI implementation; read below for more information.
## New in version 2.0!
1. **Switched to faster-whisper** — up to 4× faster transcription with lower memory usage.
2. **No separate FFmpeg installation needed** — audio decoding is handled by the bundled PyAV library.
3. **No admin rights required** — a plain `pip install` covers everything.
4. **No PyTorch dependency** — dramatically smaller install footprint.
5. **`tiny` model added** — smallest and fastest option for quick drafts.
### Instructions
#### Requirements
1. This script was made and tested in an Anaconda environment with Python 3.10. I recommend this method if you're not familiar with Python.
See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights.
2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files."
Users might not need to specifically install Transformers. However, a conda installation might be needed for ffmpeg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following:
```
conda install -c conda-forge ffmpeg-python
```
3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via:
```
pip install -U openai-whisper
```
4. There is an option to run a batch file, which launches a GUI built on TKinter and TTKthemes. If using these options, make sure they are installed in your Python build. You can install them via pip.
```
pip install tk
```
and
```
pip install ttkthemes
```
#### Using the script
This is a simple script with no installation. You can download the zip folder and extract it to your preferred working folder.
![](Picture1.png)
## Features
* Select the folder containing the audio or video files you want to transcribe. Tested with m4a video.
* Choose the language of the files you are transcribing. You can either select a specific language or let the application automatically detect the language.
* Select the Whisper model to use for the transcription. Available models include "tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v2", and "large-v3". Models with .en ending are better if you're transcribing English, especially the base and small models.
* **Swedish-optimised models** — [KB-Whisper](https://huggingface.co/collections/KBLab/kb-whisper) from the National Library of Sweden (KBLab) is available in all sizes (tiny → large). These models reduce Word Error Rate by up to 47 % compared to OpenAI Whisper on Swedish speech. The language is set to Swedish automatically when a KB model is selected.
* Enable the verbose mode to receive detailed information during the transcription process.
* Monitor the progress of the transcription with the progress bar and terminal.
* Confirmation dialog before starting the transcription to ensure you have selected the correct folder.
* View the transcribed text in a message box once the transcription is completed.
## Installation
### Get the files
Download the zip folder and extract it to your preferred working folder.
![](images/Picture1.png)
Or by cloning the repository with:
```
git clone https://github.com/soderstromkr/transcribe.git
```
### Python Version **(any platform including Mac users)**
1. Install Python 3.10 or later. You can download it from [python.org](https://www.python.org/downloads/). During installation, **check "Add Python to PATH"**. No administrator rights are needed if you install for your user only.
2. Run the installer. Open a terminal (Command Prompt on Windows, Terminal on Mac/Linux) in the project folder and run:
```
python install.py
```
This will:
- Install all required packages (including bundled FFmpeg — no separate install needed)
- **Auto-detect your NVIDIA GPU** and ask if you want GPU acceleration
- No conda, no admin rights required
#### Example with Jupyter Notebook
Alternatively, you can install manually with `pip install -r requirements.txt`.
See [example](example.ipynb) for an implementation on Jupyter Notebook, also added an example for a simple [workaround](example_no_internet.ipynb) to transcribe while offline.
3. Run the app:
1. For **Windows**: double-click `run_Windows.bat` (it will auto-install on first run) or run:
```
python app.py
```
2. For **Mac / Linux**: run `./run_Mac.sh` (auto-installs on first run). See [Mac instructions](Mac_instructions.md) for details.
#### Using the GUI
**Note** The first run with a given model will download it (~75 MB for base, ~500 MB for medium). After that, everything works offline.
You can also run the GUI version from your terminal running ```python GUI.py``` or with the batch file called run_Windows.bat (for Windows users), just make sure to add your conda path to it. If you want to download a model first, and then go offline for transcription, I recommend running the model with the default sample folder, which will download the model locally.
## GPU Support
This program **does support running on NVIDIA GPUs**, which can significantly speed up transcription times. faster-whisper uses CTranslate2, which requires NVIDIA CUDA libraries for GPU acceleration.
The GUI should look like this:
### Automatic Detection
The `install.py` script **automatically detects NVIDIA GPUs** and will ask if you want to install GPU support. If you skipped it during installation, you can add it anytime:
```
pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
```
![python GUI.py](gui_jpeg.jpg?raw=true)
**Note:** Make sure your NVIDIA GPU drivers are up to date. You can check by running `nvidia-smi` in your terminal. The program will automatically detect and use your GPU if available, otherwise it falls back to CPU.
or this, on a Mac, by running `python GUI.py` or `python3 GUI.py`:
### Verifying GPU Support
After installation, you can verify that your GPU is available by running:
```python
import ctranslate2
print(ctranslate2.get_supported_compute_types("cuda"))
```
If this returns a list containing `"float16"`, GPU acceleration is working.
![python GUI Mac.py](gui-mac.png)
## Usage
1. Launch the app — the built-in console panel at the bottom shows a welcome message and all progress updates.
2. Select the folder containing the audio or video files you want to transcribe by clicking the "Browse" button next to the "Folder" label. This will open a file dialog where you can navigate to the desired folder. Remember, you won't be choosing individual files but whole folders!
3. Enter the desired language for the transcription in the "Language" field. You can either select a language or leave it blank to enable automatic language detection.
4. Choose the Whisper model to use for the transcription from the dropdown list next to the "Model" label.
5. Click the "Transcribe" button to start the transcription. The button will be disabled during the process to prevent multiple transcriptions at once.
6. Monitor progress in the embedded console panel — it shows model loading, per-file progress, and segment timestamps in real time.
7. Once the transcription is completed, a message box will appear displaying the result. Click "OK" to close it.
8. You can run the application again or quit at any time by clicking the "Quit" button.
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
## Jupyter Notebook
Don't want fancy EXEs or GUIs? Use the function as is. See [example](example.ipynb) for an implementation on Jupyter Notebook.
[![DOI](https://zenodo.org/badge/617404576.svg)](https://zenodo.org/badge/latestdoi/617404576)
+196
View File
@@ -0,0 +1,196 @@
import os
import sys
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from src._LocalTranscribe import transcribe, get_path
import customtkinter
import threading
# ── Helper: redirect stdout/stderr into a CTkTextbox ──────────────────────
import re
_ANSI_RE = re.compile(r'\x1b\[[0-9;]*m') # strip colour codes
class _ConsoleRedirector:
"""Redirects output exclusively to the in-app console panel."""
def __init__(self, text_widget):
self.widget = text_widget
def write(self, text):
clean = _ANSI_RE.sub('', text) # strip ANSI colours
if clean.strip() == '':
return
# Schedule UI update on the main thread
try:
self.widget.after(0, self._append, clean)
except Exception:
pass
def _append(self, text):
self.widget.configure(state='normal')
self.widget.insert('end', text + ('\n' if not text.endswith('\n') else ''))
self.widget.see('end')
self.widget.configure(state='disabled')
def flush(self):
pass
# HuggingFace model IDs for non-standard models.
# Maps the display names shown in the model dropdown to the HuggingFace
# repository IDs (KBLab's Swedish-optimised Whisper models).
HF_MODEL_MAP = {
    'KB Swedish (tiny)': 'KBLab/kb-whisper-tiny',
    'KB Swedish (base)': 'KBLab/kb-whisper-base',
    'KB Swedish (small)': 'KBLab/kb-whisper-small',
    'KB Swedish (medium)': 'KBLab/kb-whisper-medium',
    'KB Swedish (large)': 'KBLab/kb-whisper-large',
}

# Global customtkinter appearance settings (applied before any widget is built).
customtkinter.set_appearance_mode("System")
customtkinter.set_default_color_theme("blue")  # Themes: blue (default), dark-blue, green
# Tracks whether the language entry still shows its placeholder text;
# cleared on the first click (see on_entry_click in App.__init__).
firstclick = True
class App:
    """Main window for Local Transcribe with Whisper.

    Builds the folder/language/model form plus an embedded console panel that
    captures stdout/stderr, and runs transcription on a worker thread so the
    Tk event loop stays responsive.
    """

    def __init__(self, master):
        # master: the CTk root window that hosts all frames.
        self.master = master
        # Fonts used across the form (labels vs. entry text).
        font = ('Roboto', 13, 'bold')  # Change the font and size here
        font_b = ('Roboto', 12)  # Change the font and size here
        # Folder Path row: label + entry (pre-filled with ./sample_audio) + Browse.
        path_frame = customtkinter.CTkFrame(master)
        path_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(path_frame, text="Folder:", font=font).pack(side=tk.LEFT, padx=5)
        self.path_entry = customtkinter.CTkEntry(path_frame, width=50, font=font_b)
        self.path_entry.insert(0, os.path.join(os.getcwd(), 'sample_audio'))
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        customtkinter.CTkButton(path_frame, text="Browse", command=self.browse, font=font).pack(side=tk.LEFT, padx=5)

        # Language frame
        # thanks to pommicket from Stackoverflow for this fix
        def on_entry_click(event):
            """Clear the placeholder text the first time the entry is clicked."""
            global firstclick
            if firstclick:  # if this is the first time they clicked it
                firstclick = False
                self.language_entry.delete(0, "end")  # delete all the text in the entry

        language_frame = customtkinter.CTkFrame(master)
        language_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(language_frame, text="Language:", font=font).pack(side=tk.LEFT, padx=5)
        self.language_entry = customtkinter.CTkEntry(language_frame, width=50, font=('Roboto', 12, 'italic'))
        self.default_language_text = "Enter language (or ignore to auto-detect)"
        self.language_entry.insert(0, self.default_language_text)
        self.language_entry.bind('<FocusIn>', on_entry_click)
        self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Model frame: standard Whisper sizes, a visual separator row, then
        # the KBLab Swedish models (mapped to HF IDs via HF_MODEL_MAP).
        models = ['tiny', 'tiny.en', 'base', 'base.en',
                  'small', 'small.en', 'medium', 'medium.en',
                  'large-v2', 'large-v3',
                  '───────────────',
                  'KB Swedish (tiny)', 'KB Swedish (base)',
                  'KB Swedish (small)', 'KB Swedish (medium)',
                  'KB Swedish (large)']
        model_frame = customtkinter.CTkFrame(master)
        model_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(model_frame, text="Model:", font=font).pack(side=tk.LEFT, padx=5)
        # ComboBox frame
        self.model_combobox = customtkinter.CTkComboBox(
            model_frame, width=50, state="readonly",
            values=models, font=font_b)
        self.model_combobox.set('medium')  # Set the default value
        self.model_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Progress Bar — created here, packed only while a job is running.
        self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')

        # Button actions frame
        button_frame = customtkinter.CTkFrame(master)
        button_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        self.transcribe_button = customtkinter.CTkButton(button_frame, text="Transcribe", command=self.start_transcription, font=font)
        self.transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        customtkinter.CTkButton(button_frame, text="Quit", command=master.quit, font=font).pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

        # ── Embedded console / log panel ──────────────────────────────────
        log_label = customtkinter.CTkLabel(master, text="Console output", font=font, anchor='w')
        log_label.pack(fill=tk.X, padx=12, pady=(8, 0))
        self.log_box = customtkinter.CTkTextbox(master, height=220, font=('Consolas', 14),
                                                wrap='word', state='disabled',
                                                fg_color='#1e1e1e', text_color='#e0e0e0')
        self.log_box.pack(fill=tk.BOTH, expand=True, padx=10, pady=(2, 10))
        # Redirect stdout & stderr into the log panel (no backend console)
        sys.stdout = _ConsoleRedirector(self.log_box)
        sys.stderr = _ConsoleRedirector(self.log_box)
        # Welcome message (shown after redirect so it appears in the panel)
        print("Welcome to Local Transcribe with Whisper! \U0001f600")
        print("Transcriptions will be saved automatically.")
        # BUG FIX: was print("" * 46), which prints an empty line; a 46-char
        # horizontal rule was clearly intended.
        print("─" * 46)

    def browse(self):
        """Open a directory chooser (starting at the CWD) and fill the path entry."""
        initial_dir = os.getcwd()
        folder_path = filedialog.askdirectory(initialdir=initial_dir)
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def start_transcription(self):
        """Disable the Transcribe button and run the job on a worker thread."""
        self.transcribe_button.configure(state=tk.DISABLED)
        # A separate thread keeps the Tk event loop (and console panel) responsive.
        threading.Thread(target=self.transcribe_thread).start()

    def transcribe_thread(self):
        """Worker: read the form values, run the transcription, report the result."""
        path = self.path_entry.get()
        model_display = self.model_combobox.get()
        # Reject the visual separator row in the dropdown.
        # BUG FIX: the original tested model_display.startswith(''), which is
        # True for EVERY string, so every selection was rejected; test for the
        # separator glyph instead.
        if model_display.startswith('─'):
            messagebox.showinfo("Invalid selection", "Please select a model, not the separator line.")
            self.transcribe_button.configure(state=tk.NORMAL)
            return
        # Map dropdown display names (e.g. 'KB Swedish (small)') to HF IDs;
        # plain Whisper sizes pass through unchanged.
        model = HF_MODEL_MAP.get(model_display, model_display)
        language = self.language_entry.get()
        # KB models are Swedish-only, so force the language.
        is_kb_model = model_display.startswith('KB Swedish')
        # Check if the language field has the default text or is empty
        if is_kb_model:
            language = 'sv'
        elif language == self.default_language_text or not language.strip():
            language = None  # Same as passing nothing: auto-detect.
        verbose = True  # always show transcription progress in the console panel
        # Show progress bar
        self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
        self.progress_bar.start()
        # Setting path and files
        glob_file = get_path(path)
        # Start transcription. output_text stays None when the run fails, so we
        # don't rely on catching UnboundLocalError to detect failure below.
        output_text = None
        try:
            output_text = transcribe(path, glob_file, model, language, verbose)
        except UnboundLocalError:
            messagebox.showinfo("Files not found error!", 'Nothing found, choose another folder.')
        except ValueError:
            # BUG FIX: the original passed the whole text as the dialog *title*
            # with no message body; give the dialog both a title and a message.
            messagebox.showinfo(
                "Invalid language",
                "Invalid language name, you might have to clear the default text to continue!")
        # Hide progress bar
        self.progress_bar.stop()
        self.progress_bar.pack_forget()
        # Enable transcribe button
        self.transcribe_button.configure(state=tk.NORMAL)
        # Show the result only when the transcription produced one.
        if output_text is not None:
            messagebox.showinfo("Finished!", output_text)
if __name__ == "__main__":
    # Build the themed root window, size it to fit the console panel, and run.
    window = customtkinter.CTk()
    window.title("Local Transcribe with Whisper")
    # Taller than the classic layout to accommodate the embedded console.
    win_width, win_height = 550, 560
    window.geometry(f'{win_width}x{win_height}')
    window.minsize(450, 480)
    # Window icon
    window.iconbitmap('images/icon.ico')
    # Hand control to the Tk event loop.
    gui = App(window)
    window.mainloop()
+20
View File
@@ -0,0 +1,20 @@
# cx_Freeze build script: freezes app.py into a standalone executable.
from cx_Freeze import setup, Executable

# Packages cx_Freeze must bundle explicitly (not discovered automatically).
freeze_options = {
    "packages": ['faster_whisper', 'tkinter', 'customtkinter'],
}

# The single GUI entry point, shipped with the application icon.
app_executable = Executable("app.py", icon='images/icon.ico')

setup(
    name="Local Transcribe with Whisper",
    version="2.0",
    author="Kristofer Rolf Söderström",
    options={"build_exe": freeze_options},
    executables=[app_executable],
)
+61 -58
View File
@@ -1,123 +1,125 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a2cd4050",
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from transcribe import transcribe"
"# Local Transcribe with Whisper\n",
"## Example"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "24e1d24e",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on function transcribe in module transcribe:\n",
"Help on function transcribe in module src._LocalTranscribe:\n",
"\n",
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
"transcribe(path, glob_file, model=None, language=None, verbose=False)\n",
" Transcribes audio files in a specified folder using OpenAI's Whisper model.\n",
" \n",
" Args:\n",
" path (str): Path to the folder containing the audio files.\n",
" glob_file (list): List of audio file paths to transcribe.\n",
" model (str, optional): Name of the Whisper model to use for transcription.\n",
" Defaults to None, which uses the default model.\n",
" language (str, optional): Language code for transcription. Defaults to None,\n",
" which enables automatic language detection.\n",
" verbose (bool, optional): If True, enables verbose mode with detailed information\n",
" during the transcription process. Defaults to False.\n",
" \n",
" Returns:\n",
" str: A message indicating the result of the transcription process.\n",
" \n",
" Raises:\n",
" RuntimeError: If an invalid file is encountered, it will be skipped.\n",
" \n",
" Notes:\n",
" - The function downloads the specified model if not available locally.\n",
" - The transcribed text files will be saved in a \"transcriptions\" folder\n",
" within the specified path.\n",
"\n"
]
}
],
"source": [
"# Import the modules and get the docstring\n",
"from src._LocalTranscribe import transcribe, get_path\n",
"help(transcribe)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e52477fb",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate"
"# Set the variables\n",
"path='sample_audio/'# Folder path\n",
"model='small' # Model size\n",
"language= None # Preset language, None for automatic detection\n",
"verbose = True # Output transcription in realtime\n",
"\n",
"# Get glob file, additional step for app version.\n",
"\n",
"glob_file = get_path(path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d66866af",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
"\n",
"There are 2 ogg files in path: sample_audio/\n",
"\n",
"\n",
"Loading model...\n",
"Transcribing file number number 1: Armstrong_Small_Step\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Trying to transcribe file named: Armstrong_Small_Step🕐\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: English\n",
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
"\n",
"Finished file number 1.\n",
"\n",
"\n",
"\n",
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"[00:00.000 --> 00:07.000] I'm going to step off the limb now.\n",
"[00:07.000 --> 00:18.000] That's one small step for man.\n",
"[00:18.000 --> 00:24.000] One giant leap for mankind.\n",
"\n",
"Trying to transcribe file named: Axel_Pettersson_röstinspelning🕐\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: Swedish\n",
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
"[00:00.000 --> 00:06.140] Hej, jag heter Axel Pettersson. Jag följer bror 1976.\n",
"[00:06.400 --> 00:15.100] Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
"\n",
"Finished file number 2.\n",
"Trying to transcribe file named: readme🕐\n",
"Not a valid file, skipping.\n",
"\n",
"\n",
"\n"
"Trying to transcribe file named: transcriptions🕐\n",
"Not a valid file, skipping.\n"
]
},
{
"data": {
"text/plain": [
"'Finished transcription, files can be found in sample_audio/transcriptions'"
"'Finished transcription, 2 files can be found in sample_audio//transcriptions'"
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transcribe(path, file_type, model, language, verbose)"
"# Run the script\n",
"transcribe(path, glob_file, model, language, verbose)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bc67265",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "venv",
"language": "python",
"name": "python3"
},
@@ -132,8 +134,9 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 5
"nbformat_minor": 2
}
-231
View File
@@ -1,231 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "eba9e610",
"metadata": {},
"source": [
"A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "85cd2d12",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Whisper(\n",
" (encoder): AudioEncoder(\n",
" (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
" (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (decoder): TextDecoder(\n",
" (token_embedding): Embedding(51865, 1024)\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (cross_attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import whisper\n",
"#change to model size, bigger is more accurate but slower\n",
"whisper.load_model(\"medium\") #base, small, medium, large"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0d2acd54",
"metadata": {},
"outputs": [],
"source": [
"#after it loads, you can disconnect from the internet and run the rest"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a2cd4050",
"metadata": {},
"outputs": [],
"source": [
"from transcribe import transcribe"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "24e1d24e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on function transcribe in module transcribe:\n",
"\n",
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
"\n"
]
}
],
"source": [
"help(transcribe)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e52477fb",
"metadata": {},
"outputs": [],
"source": [
"path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d66866af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
"\n",
"There are 2 ogg files in path: sample_audio/\n",
"\n",
"\n",
"Loading model...\n",
"Transcribing file number number 1: Armstrong_Small_Step\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: English\n",
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
"\n",
"Finished file number 1.\n",
"\n",
"\n",
"\n",
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: Swedish\n",
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
"\n",
"Finished file number 2.\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"'Finished transcription, files can be found in sample_audio/transcriptions'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transcribe(path, file_type, model, language, verbose)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bc67265",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
BIN
View File
Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

Before

Width:  |  Height:  |  Size: 135 KiB

After

Width:  |  Height:  |  Size: 135 KiB

View File

Before

Width:  |  Height:  |  Size: 324 KiB

After

Width:  |  Height:  |  Size: 324 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

+128
View File
@@ -0,0 +1,128 @@
"""
Installer script for Local Transcribe with Whisper.
Detects NVIDIA GPU and offers to install GPU acceleration support.
Usage:
python install.py
"""
import os
import subprocess
import sys
import shutil
import site
def detect_nvidia_gpu():
    """Return ``(found, name)`` for the first NVIDIA GPU visible to nvidia-smi.

    Probes ``nvidia-smi`` on PATH plus the usual Windows install locations.
    Any failure (missing binary, timeout, non-zero exit, empty output) is
    treated as "no GPU present".
    """
    probe_locations = (
        shutil.which("nvidia-smi"),
        r"C:\Windows\System32\nvidia-smi.exe",
        r"C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe",
    )
    query_args = ["--query-gpu=name", "--format=csv,noheader"]
    for exe in probe_locations:
        if not exe or not os.path.isfile(exe):
            continue
        try:
            proc = subprocess.run(
                [exe] + query_args,
                capture_output=True, text=True, timeout=10,
            )
        except Exception:
            # Broken/hung driver tooling: try the next candidate.
            continue
        gpu_names = proc.stdout.strip()
        if proc.returncode == 0 and gpu_names:
            # Report only the first GPU when several are listed.
            return True, gpu_names.splitlines()[0]
    return False, None
def pip_install(*packages):
    """Run ``pip install`` for *packages* using the current interpreter.

    Echoes the full command line first; raises
    ``subprocess.CalledProcessError`` if pip exits non-zero.
    """
    command = [sys.executable, "-m", "pip", "install", *packages]
    print(f"\n> {' '.join(command)}\n")
    subprocess.check_call(command)
def get_site_packages():
    """Return the active site-packages directory.

    Prefers the first entry whose path literally ends in ``site-packages``;
    falls back to the first directory reported by the ``site`` module.
    """
    paths = site.getsitepackages()
    matches = [p for p in paths if p.endswith("site-packages")]
    return matches[0] if matches else paths[0]
def create_nvidia_pth():
    """Create a .pth startup hook that registers NVIDIA DLL directories.

    The ``site`` module executes any line of a ``.pth`` file that starts
    with ``import`` at interpreter startup, before user code runs.  The
    generated one-liner globs ``site-packages/nvidia/*/bin`` and
    ``.../lib`` and — on Windows only (``os.name == 'nt'``) — registers
    each existing directory via ``os.add_dll_directory`` and prepends it
    to ``PATH``, so ctranslate2 can locate the pip-installed CUDA DLLs.
    """
    sp = get_site_packages()
    pth_path = os.path.join(sp, "nvidia_cuda_path.pth")
    # This one-liner runs at Python startup, before any user code.
    # NOTE(review): the replace() below assumes the site-packages path
    # contains no single quotes — inside an r'...' literal, \' keeps the
    # backslash in the resulting string. TODO confirm on unusual paths.
    pth_content = (
        "import os, glob as g; "
        "any(os.add_dll_directory(d) or os.environ.__setitem__('PATH', d + os.pathsep + os.environ.get('PATH','')) "
        "for d in g.glob(os.path.join(r'" + sp.replace("'", "\\'") + "', 'nvidia', '*', 'bin')) "
        "+ g.glob(os.path.join(r'" + sp.replace("'", "\\'") + "', 'nvidia', '*', 'lib')) "
        "if os.path.isdir(d)) if os.name == 'nt' else None\n"
    )
    with open(pth_path, "w") as f:
        f.write(pth_content)
    print(f" Created CUDA startup hook: {pth_path}")
def verify_cuda():
    """Return True when ctranslate2 reports float16 CUDA support.

    The probe runs in a fresh subprocess so DLL-path changes made by the
    installer take effect; any error or timeout counts as failure.
    """
    probe = (
        "import ctranslate2; "
        "print('float16' in ctranslate2.get_supported_compute_types('cuda'))"
    )
    try:
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True, text=True, timeout=30,
        )
    except Exception:
        return False
    return result.stdout.strip() == "True"
def main():
    """Interactive installer: base requirements first, then optional GPU support."""
    banner = "=" * 55
    print(banner)
    print(" Local Transcribe with Whisper — Installer")
    print(banner)

    # Step 1: Base packages
    print("\n[1/2] Installing base requirements...")
    pip_install("-r", "requirements.txt")
    print("\n Base requirements installed!")

    # Step 2: GPU (only offered when nvidia-smi reports a device)
    print("\n[2/2] Checking for NVIDIA GPU...")
    gpu_found, gpu_name = detect_nvidia_gpu()
    if not gpu_found:
        print("\n No NVIDIA GPU detected — using CPU mode.")
    else:
        print(f"\n NVIDIA GPU detected: {gpu_name}")
        print(" GPU acceleration can make transcription 2-5x faster.")
        print(" This will install ~300 MB of additional CUDA libraries.\n")
        while True:
            choice = input(" Install GPU support? [Y/n]: ").strip().lower()
            if choice in ("", "y", "yes"):
                print("\n Installing CUDA libraries...")
                pip_install("nvidia-cublas-cu12", "nvidia-cudnn-cu12")
                create_nvidia_pth()
                print("\n Verifying CUDA...")
                if verify_cuda():
                    print(" GPU support verified and working!")
                else:
                    print(" WARNING: CUDA installed but not detected.")
                    print(" Update your NVIDIA drivers and try again.")
                break
            if choice in ("n", "no"):
                print("\n Skipping GPU. Re-run install.py to add it later.")
                break
            print(" Please enter Y or N.")

    print("\n" + banner)
    print(" Done! Run the app with: python app.py")
    print(banner)


if __name__ == "__main__":
    main()
+2
View File
@@ -0,0 +1,2 @@
faster-whisper
customtkinter
+29
View File
@@ -0,0 +1,29 @@
#!/bin/bash
# ============================================================
# Local Transcribe with Whisper — macOS / Linux launcher
# ============================================================
# Double-click this file or run: ./run_Mac.sh
# On first run it creates a venv and installs dependencies.
# ============================================================
set -e

# Always run from the directory this script lives in.
cd "$(dirname "$0")"

VENV_PY=".venv/bin/python"

# Bootstrap the virtual environment on first launch.
if [ ! -f "$VENV_PY" ]; then
    echo "Creating virtual environment..."
    python3 -m venv .venv
fi

# Run the installer when faster_whisper is not importable yet.
if ! "$VENV_PY" -c "import faster_whisper" 2>/dev/null; then
    echo "First run detected — running installer..."
    "$VENV_PY" install.py
    echo
fi

echo "Starting Local Transcribe..."
"$VENV_PY" app.py
+22 -4
View File
@@ -1,5 +1,23 @@
@echo off
echo Starting...
call conda activate base
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
call python GUI.py
REM Create .venv on first run if it doesn't exist
if not exist ".venv\Scripts\python.exe" (
    echo Creating virtual environment...
    python -m venv .venv
    REM "if errorlevel 1" is true for any exit code >= 1, i.e. venv creation failed
    if errorlevel 1 (
        echo ERROR: Failed to create virtual environment. Is Python installed and on PATH?
        pause
        exit /b 1
    )
)
REM All later commands go through the venv's interpreter
set PYTHON=.venv\Scripts\python.exe
REM Check if dependencies are installed
REM Probe for faster_whisper; 2>nul hides the ImportError traceback
%PYTHON% -c "import faster_whisper" 2>nul
if errorlevel 1 (
    echo First run detected - running installer...
    %PYTHON% install.py
    echo.
)
echo Starting Local Transcribe...
%PYTHON% app.py
@@ -1,5 +1,2 @@
Armstrong_Small_Step
In seconds:
[0.00 --> 7.00]: I'm going to step off the limb now.
[7.00 --> 18.00]: That's one small step for man.
[18.00 --> 24.00]: One giant leap for mankind.
[0:00:00 --> 0:00:07]: That's one small step for man, one giant leap for mankind.
@@ -1,4 +1,2 @@
Axel_Pettersson_röstinspelning
In seconds:
[0.00 --> 6.14]: Hej, jag heter Axel Pettersson. Jag följer bror 1976.
[6.40 --> 15.10]: Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
[0:00:00 --> 0:00:15]: Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
+149
View File
@@ -0,0 +1,149 @@
import os
import sys
import datetime
import site
from glob import glob
# ---------------------------------------------------------------------------
# CUDA setup — must happen before importing faster_whisper / ctranslate2
# ---------------------------------------------------------------------------
def _setup_cuda_dlls():
"""Add NVIDIA pip-package DLL dirs to the DLL search path (Windows only).
pip-installed nvidia-cublas-cu12 / nvidia-cudnn-cu12 place their .dll
files inside the site-packages tree. Python 3.8+ on Windows does NOT
search PATH for DLLs loaded via ctypes/LoadLibrary, so we must
explicitly register every nvidia/*/bin and nvidia/*/lib directory using
os.add_dll_directory *and* prepend them to PATH (some native extensions
still rely on PATH).
"""
if sys.platform != "win32":
return
try:
for sp in site.getsitepackages():
nvidia_root = os.path.join(sp, "nvidia")
if not os.path.isdir(nvidia_root):
continue
for pkg in os.listdir(nvidia_root):
for sub in ("bin", "lib"):
d = os.path.join(nvidia_root, pkg, sub)
if os.path.isdir(d):
os.environ["PATH"] = d + os.pathsep + os.environ.get("PATH", "")
try:
os.add_dll_directory(d)
except (OSError, AttributeError):
pass
except Exception:
pass
_setup_cuda_dlls()
from faster_whisper import WhisperModel
def _detect_device():
"""Return (device, compute_type) for the best available backend."""
try:
import ctranslate2
cuda_types = ctranslate2.get_supported_compute_types("cuda")
if "float16" in cuda_types:
return "cuda", "float16"
except Exception:
pass
return "cpu", "int8"
# Folder listing helper
def get_path(path):
    """Return every entry (files and subfolders) directly inside *path*."""
    return glob(path + '/*')
# Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False):
    """
    Transcribes audio files in a specified folder using faster-whisper (CTranslate2).

    Args:
        path (str): Path to the folder containing the audio files.
        glob_file (list): List of audio file paths to transcribe.
        model (str, optional): Name of the Whisper model size to use for transcription.
            Defaults to None, which uses the default model.
        language (str, optional): Language code for transcription. Defaults to None,
            which enables automatic language detection.
        verbose (bool, optional): If True, prints each segment as it is decoded;
            otherwise shows a one-line progress counter. Defaults to False.

    Returns:
        str: A message indicating the result of the transcription process.

    Notes:
        - The function downloads the specified model if not available locally.
        - The transcribed text files will be saved in a "transcriptions" folder
          within the specified path.
        - Invalid (non-audio) entries are skipped rather than raising.
        - Uses CTranslate2 for up to 4x faster inference compared to openai-whisper.
        - FFmpeg is bundled via the PyAV dependency — no separate installation needed.
    """
    # Fix: the previous `"" * 46` evaluated to an empty string, so the
    # console section separators were invisible.
    SEP = "─" * 46
    # os.path.join avoids the "sample_audio//transcriptions" double slash
    # produced by string formatting when `path` ends with a separator.
    out_dir = os.path.join(path, "transcriptions")

    # ── Step 1: Detect hardware ──────────────────────────────────────
    device, compute_type = _detect_device()
    print(f"⚙ Device: {device} | Compute: {compute_type}")

    # ── Step 2: Load model ───────────────────────────────────────────
    print(f"⏳ Loading model '{model}' — downloading if needed...")
    whisper_model = WhisperModel(model, device=device, compute_type=compute_type)
    print("✅ Model ready!")
    print(SEP)

    # ── Step 3: Transcribe files ─────────────────────────────────────
    total_files = len(glob_file)
    print(f"📂 Found {total_files} item(s) in folder")
    print(SEP)

    files_transcripted = []
    for file_num, file in enumerate(glob_file, start=1):
        # splitext keeps interior dots ("my.talk.ogg" -> "my.talk"),
        # unlike split('.')[0] which truncated such names.
        title = os.path.splitext(os.path.basename(file))[0]
        print(f"\n{SEP}")
        print(f"📄 File {file_num}/{total_files}: {title}")
        try:
            segments, _info = whisper_model.transcribe(
                file,
                language=language,
                beam_size=5,
            )
            # Make the output folder if missing
            os.makedirs(out_dir, exist_ok=True)
            # Stream segments to disk as they are decoded so partial
            # results survive an interruption.
            segment_list = []
            with open(os.path.join(out_dir, f"{title}.txt"), "w", encoding="utf-8") as f:
                f.write(title)
                for seg in segments:
                    start_ts = str(datetime.timedelta(seconds=seg.start))
                    end_ts = str(datetime.timedelta(seconds=seg.end))
                    f.write('\n[{} --> {}]:{}'.format(start_ts, end_ts, seg.text))
                    f.flush()
                    if verbose:
                        print(" [%.2fs → %.2fs] %s" % (seg.start, seg.end, seg.text))
                    else:
                        print(" Transcribed up to %.0fs..." % seg.end, end='\r')
                    segment_list.append(seg)
            print(f"✅ Done — saved to transcriptions/{title}.txt")
            files_transcripted.append(segment_list)
        except Exception:
            # Deliberate best-effort: folders and non-audio files are
            # expected in the input directory and simply skipped.
            print('⚠ Not a valid audio/video file, skipping.')

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n{SEP}")
    if files_transcripted:
        output_text = (
            f"✅ Finished! {len(files_transcripted)} file(s) transcribed.\n"
            f" Saved in: {out_dir}"
        )
    else:
        output_text = '⚠ No files eligible for transcription — try another folder.'
    print(output_text)
    print(SEP)
    return output_text
-56
View File
@@ -1,56 +0,0 @@
import whisper
import glob, os
#import torch #uncomment if using torch with cuda, below too
import datetime
def transcribe(path, file_type, model=None, language=None, verbose=False):
    """Transcribe every *file_type* audio file in *path* with OpenAI's whisper.

    Downloads the requested model if needed, writes one
    ``<path>/transcriptions/<title>.txt`` per audio file containing
    timestamped segments, and returns a summary message.

    Args:
        path (str): Folder containing the audio files.
        file_type (str): File extension to look for (e.g. 'ogg', 'mp3').
        model (str, optional): Whisper model size ('base', 'small', 'medium', 'large').
        language (str, optional): Language hint; None auto-detects per file.
        verbose (bool, optional): Forwarded to whisper; prints text while transcribing.

    Returns:
        str: Completion message naming the output folder.
    """
    # exist_ok replaces the previous try/except FileExistsError dance.
    os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
    glob_file = glob.glob(path + '/*{}'.format(file_type))
    print('Using {} model'.format(model))
    print('File type is {}'.format(file_type))
    print('Language is being detected automatically for each file')
    print('Verbosity is set to {}'.format(verbose))
    print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path))
    print('Loading model...')
    # Keep the loaded model in its own name instead of rebinding `model`.
    whisper_model = whisper.load_model(model)
    for idx, audio_path in enumerate(glob_file):
        title = os.path.basename(audio_path).split('.')[0]
        # Fixed duplicated word in the progress message ("number number").
        print('Transcribing file number {}: {}'.format(idx + 1, title))
        print('Model and file loaded...\nStarting transcription...\n')
        result = whisper_model.transcribe(
            audio_path,
            language=language,
            verbose=verbose
        )
        # `out_file` no longer shadows the loop variable as the old
        # `with open(...) as file:` did.
        with open("{}/transcriptions/{}.txt".format(path, title), 'w', encoding='utf-8') as out_file:
            out_file.write(title)
            out_file.write('\nIn seconds:')
            for segment in result['segments']:
                start = str(datetime.timedelta(seconds=segment['start']))
                end = str(datetime.timedelta(seconds=segment['end']))
                out_file.writelines('\n[{} --> {}]:{}'.format(start, end, segment['text']))
        print('\nFinished file number {}.\n\n\n'.format(idx + 1))
    return 'Finished transcription, files can be found in {}/transcriptions'.format(path)