Compare commits
33 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| f8cf42733d | |||
| 7d3fe1ba26 | |||
| da42a6e4cc | |||
| 0dab0d9bea | |||
| 953c71ab28 | |||
| 5522bdd575 | |||
| 861c470330 | |||
| 6de6d4b2ff | |||
| 01552cc7cb | |||
| 049a168c81 | |||
| 56a925463f | |||
| fe60b04020 | |||
| ff06a257f2 | |||
| 5e31129ea2 | |||
| 3f0bca02b7 | |||
| 488e78a5ae | |||
| 829a054300 | |||
| 462aae12ca | |||
| fec9190ba1 | |||
| 0dde25204d | |||
| b611aa6b8c | |||
| 7d50d5f4cf | |||
| 7799d03960 | |||
| f88186dacc | |||
| 3f5c1491ac | |||
| c83e15bdba | |||
| ff16ad30e1 | |||
| 622165b3e6 | |||
| 0e9cbdca58 | |||
| 87cb509b14 | |||
| ba935cafb7 | |||
| 6497508b7a | |||
| d96333a5a7 |
@@ -0,0 +1 @@
|
||||
*.zip filter=lfs diff=lfs merge=lfs -text
|
||||
@@ -0,0 +1,26 @@
|
||||
# Python cache
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
env/
|
||||
ENV/
|
||||
.venv/
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Build artifacts
|
||||
dist/
|
||||
build/
|
||||
*.egg-info/
|
||||
@@ -4,8 +4,8 @@ authors:
|
||||
- family-names: "Söderström"
|
||||
given-names: "Kristofer Rolf"
|
||||
orcid: "https://orcid.org/0000-0002-5322-3350"
|
||||
title: "transcribe"
|
||||
version: 1.1.1
|
||||
doi: 10.5281/zenodo.7760511
|
||||
title: "Local Transcribe"
|
||||
version: 1.2
|
||||
doi: 10.5281/zenodo.7760510
|
||||
date-released: 2023-03-22
|
||||
url: "https://github.com/soderstromkr/transcribe"
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
from tkinter import filedialog
|
||||
from tkinter import messagebox
|
||||
from transcribe import transcribe
|
||||
from ttkthemes import ThemedTk
|
||||
import whisper
|
||||
import numpy as np
|
||||
import glob, os
|
||||
|
||||
|
||||
class App:
    """Tkinter front-end for the local Whisper transcription function.

    Builds a small form (folder path, file type filter, model name and a
    verbosity toggle) and wires the "Transcribe Audio" button to the
    module-level ``transcribe`` helper.
    """

    def __init__(self, master):
        self.master = master
        master.title("Local Transcribe")

        # Apply one shared look to every ttk widget class used below.
        theme = ttk.Style()
        for widget_class in ('TLabel', 'TEntry', 'TButton', 'TCheckbutton'):
            theme.configure(widget_class, font=('Arial', 10), padding=10)

        # Row 1: folder containing the audio files, plus a directory picker.
        row_path = ttk.Frame(master, padding=10)
        row_path.pack(fill=tk.BOTH)
        ttk.Label(row_path, text="Folder Path:").pack(side=tk.LEFT, padx=5)
        self.path_entry = ttk.Entry(row_path, width=50)
        # NOTE: index 10 on an empty Entry clamps to the end, so this is
        # equivalent to inserting at 0 (kept exactly as before).
        self.path_entry.insert(10, 'sample_audio/')
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        ttk.Button(row_path, text="Browse", command=self.browse).pack(side=tk.LEFT, padx=5)

        # Row 2: file-extension filter — only matching files get transcribed.
        row_type = ttk.Frame(master, padding=10)
        row_type.pack(fill=tk.BOTH)
        ttk.Label(row_type, text="File Type:").pack(side=tk.LEFT, padx=5)
        self.file_type_entry = ttk.Entry(row_type, width=50)
        self.file_type_entry.insert(10, 'ogg')
        self.file_type_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Row 3: Whisper model size.
        row_model = ttk.Frame(master, padding=10)
        row_model.pack(fill=tk.BOTH)
        ttk.Label(row_model, text="Model:").pack(side=tk.LEFT, padx=5)
        self.model_entry = ttk.Entry(row_model, width=50)
        self.model_entry.insert(10, 'small')
        self.model_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # (A language selector once lived here but is disabled; the
        # language is forced to auto-detect in transcribe() below.)

        # Row 4: verbosity toggle.
        row_verbose = ttk.Frame(master, padding=10)
        row_verbose.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        ttk.Checkbutton(row_verbose, text="Verbose",
                        variable=self.verbose_var).pack(side=tk.LEFT, padx=5)

        # Row 5: action buttons.
        row_actions = ttk.Frame(master, padding=10)
        row_actions.pack(fill=tk.BOTH)
        ttk.Button(row_actions, text="Transcribe Audio",
                   command=self.transcribe).pack(side=tk.LEFT, padx=5, pady=10,
                                                 fill=tk.X, expand=True)
        ttk.Button(row_actions, text="Quit",
                   command=master.quit).pack(side=tk.RIGHT, padx=5, pady=10,
                                             fill=tk.X, expand=True)

    def browse(self):
        """Ask the user for a directory and drop it into the path entry."""
        chosen = filedialog.askdirectory()
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, chosen)

    def transcribe(self):
        """Read the form, run the transcription, and report the result."""
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        language = None  # always auto-detect (language field is disabled)
        verbose = self.verbose_var.get()

        # Delegate the actual work to the module-level transcribe helper.
        result = transcribe(path, file_type, model=model,
                            language=language, verbose=verbose)

        # Blocks until the user dismisses the completion dialog.
        tk.messagebox.showinfo("Finished!", result)
|
||||
|
||||
if __name__ == "__main__":
    # Use a themed root window (instead of a bare tk.Tk()) so the ttk
    # widgets pick up the "clearlooks" look.
    themed_root = ThemedTk(theme="clearlooks")
    app = App(themed_root)
    themed_root.mainloop()
|
||||
@@ -0,0 +1,31 @@
|
||||
### How to run on Mac / Linux
|
||||
|
||||
#### Quick start
|
||||
1. Open Terminal and navigate to the project folder (or right-click the folder and select "Open in Terminal").
|
||||
2. Make the script executable (only needed once):
|
||||
```
|
||||
chmod +x run_Mac.sh
|
||||
```
|
||||
3. Run it:
|
||||
```
|
||||
./run_Mac.sh
|
||||
```
|
||||
|
||||
This will automatically:
|
||||
- Create a virtual environment (`.venv`)
|
||||
- Install all dependencies (no admin rights needed)
|
||||
- Launch the app
|
||||
|
||||
#### Manual steps (alternative)
|
||||
If you prefer to do it manually:
|
||||
```
|
||||
python3 -m venv .venv
|
||||
.venv/bin/python install.py
|
||||
.venv/bin/python app.py
|
||||
```
|
||||
|
||||
#### Notes
|
||||
- **Python 3.10+** is required. macOS users can install it from [python.org](https://www.python.org/downloads/) or via `brew install python`.
|
||||
- **No FFmpeg install needed** — audio decoding is bundled.
|
||||
- **GPU acceleration** is not available on macOS (Apple Silicon MPS is not supported by CTranslate2). CPU with int8 quantization is still fast.
|
||||
- On Apple Silicon (M1/M2/M3/M4), the `small` or `base` models run well. `medium` works but is slower.
|
||||
@@ -1,5 +0,0 @@
|
||||
### How to run on Mac
|
||||
Unfortunately, I have not found a permanent solution for this; not being a Mac user has limited the ways I can test it. For now, these are the recommended steps for a beginner user:
|
||||
1. Open a terminal and navigate to the root folder (transcribe-main if you downloaded the folder). You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
|
||||
2. Run the following command:
|
||||
python GUI.py
|
||||
@@ -1,72 +1,90 @@
|
||||
## Local Transcribe
|
||||
## Local Transcribe with Whisper
|
||||
|
||||
Local Transcribe uses OpenAI's Whisper to transcribe audio files from your local folders, creating text files on disk.
|
||||
> **⚠ Note for Mac users (Apple Silicon):** This version uses `faster-whisper` (CTranslate2), which does **not** support Apple M-chip GPU acceleration. Transcription will run on CPU, which is slower than OpenAI's Whisper with Metal/CoreML support. The trade-off is a much simpler installation — no conda, no PyTorch, no admin rights. If you'd prefer M-chip GPU acceleration and don't mind a more involved setup, switch to the **classic** release:
|
||||
> ```
|
||||
> git checkout classic
|
||||
> ```
|
||||
|
||||
## Note
|
||||
Local Transcribe with Whisper is a user-friendly desktop application that allows you to transcribe audio and video files using the Whisper ASR system, powered by [faster-whisper](https://github.com/SYSTRAN/faster-whisper) (CTranslate2). This application provides a graphical user interface (GUI) built with Python and the Tkinter library, making it easy to use even for those not familiar with programming.
|
||||
|
||||
This implementation and guide are mostly made for researchers who are not familiar with programming and want a way to transcribe their files locally, without an internet connection — a requirement common to ethical data practices and frameworks. Two examples are shown: a normal workflow with an internet connection, and one in which the model is loaded first, via openai-whisper, after which the transcription can be done without being connected to the internet. There is now also a GUI implementation; read below for more information.
|
||||
## New in version 2.0!
|
||||
1. **Switched to faster-whisper** — up to 4× faster transcription with lower memory usage.
|
||||
2. **No separate FFmpeg installation needed** — audio decoding is handled by the bundled PyAV library.
|
||||
3. **No admin rights required** — a plain `pip install` covers everything.
|
||||
4. **No PyTorch dependency** — dramatically smaller install footprint.
|
||||
5. **`tiny` model added** — smallest and fastest option for quick drafts.
|
||||
|
||||
### Instructions
|
||||
|
||||
#### Requirements
|
||||
|
||||
1. This script was made and tested in an Anaconda environment with Python 3.10. I recommend this method if you're not familiar with Python.
|
||||
See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights.
|
||||
|
||||
2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files."
|
||||
Users might not need to specifically install Transformers. However, a conda installation might be needed for ffmpeg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following:
|
||||
|
||||
```
|
||||
conda install -c conda-forge ffmpeg-python
|
||||
```
|
||||
|
||||
3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via:
|
||||
|
||||
```
|
||||
pip install -U openai-whisper
|
||||
```
|
||||
|
||||
4. There is an option to run a batch file, which launches a GUI built on TKinter and TTKthemes. If using these options, make sure they are installed in your Python build. You can install them via pip.
|
||||
|
||||
```
|
||||
pip install tk
|
||||
```
|
||||
|
||||
and
|
||||
|
||||
```
|
||||
pip install ttkthemes
|
||||
```
|
||||
|
||||
#### Using the script
|
||||
|
||||
This is a simple script with no installation. You can download the zip folder and extract it to your preferred working folder.
|
||||
|
||||

|
||||
## Features
|
||||
* Select the folder containing the audio or video files you want to transcribe. Tested with m4a video.
|
||||
* Choose the language of the files you are transcribing. You can either select a specific language or let the application automatically detect the language.
|
||||
* Select the Whisper model to use for the transcription. Available models include "tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v2", and "large-v3". Models with .en ending are better if you're transcribing English, especially the base and small models.
|
||||
* **Swedish-optimised models** — [KB-Whisper](https://huggingface.co/collections/KBLab/kb-whisper) from the National Library of Sweden (KBLab) is available in all sizes (tiny → large). These models reduce Word Error Rate by up to 47 % compared to OpenAI Whisper on Swedish speech. The language is set to Swedish automatically when a KB model is selected.
|
||||
* Enable the verbose mode to receive detailed information during the transcription process.
|
||||
* Monitor the progress of the transcription with the progress bar and terminal.
|
||||
* Confirmation dialog before starting the transcription to ensure you have selected the correct folder.
|
||||
* View the transcribed text in a message box once the transcription is completed.
|
||||
|
||||
## Installation
|
||||
### Get the files
|
||||
Download the zip folder and extract it to your preferred working folder.
|
||||

|
||||
Or by cloning the repository with:
|
||||
|
||||
```
|
||||
git clone https://github.com/soderstromkr/transcribe.git
|
||||
```
|
||||
### Python Version **(any platform including Mac users)**
|
||||
1. Install Python 3.10 or later. You can download it from [python.org](https://www.python.org/downloads/). During installation, **check "Add Python to PATH"**. No administrator rights are needed if you install for your user only.
|
||||
|
||||
2. Run the installer. Open a terminal (Command Prompt on Windows, Terminal on Mac/Linux) in the project folder and run:
|
||||
```
|
||||
python install.py
|
||||
```
|
||||
This will:
|
||||
- Install all required packages (including bundled FFmpeg — no separate install needed)
|
||||
- **Auto-detect your NVIDIA GPU** and ask if you want GPU acceleration
|
||||
- No conda, no admin rights required
|
||||
|
||||
#### Example with Jupyter Notebook
|
||||
Alternatively, you can install manually with `pip install -r requirements.txt`.
|
||||
|
||||
See [example](example.ipynb) for an implementation on Jupyter Notebook, also added an example for a simple [workaround](example_no_internet.ipynb) to transcribe while offline.
|
||||
3. Run the app:
|
||||
1. For **Windows**: double-click `run_Windows.bat` (it will auto-install on first run) or run:
|
||||
```
|
||||
python app.py
|
||||
```
|
||||
2. For **Mac / Linux**: run `./run_Mac.sh` (auto-installs on first run). See [Mac instructions](Mac_instructions.md) for details.
|
||||
|
||||
#### Using the GUI
|
||||
**Note** The first run with a given model will download it (~75 MB for base, ~500 MB for medium). After that, everything works offline.
|
||||
|
||||
You can also run the GUI version from your terminal running ```python GUI.py``` or with the batch file called run_Windows.bat (for Windows users), just make sure to add your conda path to it. If you want to download a model first, and then go offline for transcription, I recommend running the model with the default sample folder, which will download the model locally.
|
||||
## GPU Support
|
||||
This program **does support running on NVIDIA GPUs**, which can significantly speed up transcription times. faster-whisper uses CTranslate2, which requires NVIDIA CUDA libraries for GPU acceleration.
|
||||
|
||||
The GUI should look like this:
|
||||
### Automatic Detection
|
||||
The `install.py` script **automatically detects NVIDIA GPUs** and will ask if you want to install GPU support. If you skipped it during installation, you can add it anytime:
|
||||
```
|
||||
pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
|
||||
```
|
||||
|
||||

|
||||
**Note:** Make sure your NVIDIA GPU drivers are up to date. You can check by running `nvidia-smi` in your terminal. The program will automatically detect and use your GPU if available, otherwise it falls back to CPU.
|
||||
|
||||
or this, on a Mac, by running `python GUI.py` or `python3 GUI.py`:
|
||||
### Verifying GPU Support
|
||||
After installation, you can verify that your GPU is available by running:
|
||||
```python
|
||||
import ctranslate2
|
||||
print(ctranslate2.get_supported_compute_types("cuda"))
|
||||
```
|
||||
If this returns a list containing `"float16"`, GPU acceleration is working.
|
||||
|
||||

|
||||
## Usage
|
||||
1. Launch the app — the built-in console panel at the bottom shows a welcome message and all progress updates.
|
||||
2. Select the folder containing the audio or video files you want to transcribe by clicking the "Browse" button next to the "Folder" label. This will open a file dialog where you can navigate to the desired folder. Remember, you won't be choosing individual files but whole folders!
|
||||
3. Enter the desired language for the transcription in the "Language" field. You can either select a language or leave it blank to enable automatic language detection.
|
||||
4. Choose the Whisper model to use for the transcription from the dropdown list next to the "Model" label.
|
||||
5. Click the "Transcribe" button to start the transcription. The button will be disabled during the process to prevent multiple transcriptions at once.
|
||||
6. Monitor progress in the embedded console panel — it shows model loading, per-file progress, and segment timestamps in real time.
|
||||
7. Once the transcription is completed, a message box will appear displaying the result. Click "OK" to close it.
|
||||
8. You can run the application again or quit at any time by clicking the "Quit" button.
|
||||
|
||||
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
|
||||
## Jupyter Notebook
|
||||
Don't want fancy EXEs or GUIs? Use the function as is. See [example](example.ipynb) for an implementation on Jupyter Notebook.
|
||||
|
||||
[](https://zenodo.org/badge/latestdoi/617404576)
|
||||
|
||||
@@ -0,0 +1,196 @@
|
||||
import os
|
||||
import sys
|
||||
import tkinter as tk
|
||||
from tkinter import ttk
|
||||
from tkinter import filedialog
|
||||
from tkinter import messagebox
|
||||
from src._LocalTranscribe import transcribe, get_path
|
||||
import customtkinter
|
||||
import threading
|
||||
|
||||
|
||||
# ── Helper: redirect stdout/stderr into a CTkTextbox ──────────────────────
import re

# Matches ANSI colour/SGR escape sequences so they can be stripped.
_ANSI_RE = re.compile(r'\x1b\[[0-9;]*m')


class _ConsoleRedirector:
    """File-like object that forwards writes to the in-app console panel.

    Instances are installed as ``sys.stdout``/``sys.stderr``; every
    non-blank write is scheduled onto the Tk main loop so background
    threads can print safely.
    """

    def __init__(self, text_widget):
        self.widget = text_widget

    def write(self, text):
        """Strip ANSI colour codes and schedule the text for display."""
        plain = _ANSI_RE.sub('', text)
        if not plain.strip():
            # Skip whitespace-only writes (e.g. the bare newline that
            # print() emits as a separate call).
            return
        try:
            # after(0, ...) hops to the Tk main thread for the UI update.
            self.widget.after(0, self._append, plain)
        except Exception:
            # The widget may already be destroyed during shutdown; drop it.
            pass

    def _append(self, text):
        # Runs on the main thread: briefly re-enable the read-only box,
        # append the text (newline-terminated), and scroll to the end.
        self.widget.configure(state='normal')
        suffix = '' if text.endswith('\n') else '\n'
        self.widget.insert('end', text + suffix)
        self.widget.see('end')
        self.widget.configure(state='disabled')

    def flush(self):
        """No-op; present only to satisfy the file-object protocol."""
        pass
|
||||
|
||||
# HuggingFace model IDs for non-standard models: maps the display name
# shown in the model dropdown to the hub repo id passed to faster-whisper.
HF_MODEL_MAP = {
    'KB Swedish ({})'.format(size): 'KBLab/kb-whisper-{}'.format(size)
    for size in ('tiny', 'base', 'small', 'medium', 'large')
}
|
||||
|
||||
|
||||
|
||||
customtkinter.set_appearance_mode("System")
|
||||
customtkinter.set_default_color_theme("blue") # Themes: blue (default), dark-blue, green
|
||||
firstclick = True
|
||||
|
||||
class App:
    """customtkinter front-end for faster-whisper transcription.

    Builds the form (folder, language, model dropdown), an embedded
    console panel that captures stdout/stderr, and runs the transcription
    on a worker thread so the UI stays responsive.
    """

    def __init__(self, master):
        self.master = master
        # Shared fonts for labels/buttons (font) and entry text (font_b).
        font = ('Roboto', 13, 'bold')
        font_b = ('Roboto', 12)

        # Folder Path row: entry pre-filled with ./sample_audio + Browse.
        path_frame = customtkinter.CTkFrame(master)
        path_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(path_frame, text="Folder:", font=font).pack(side=tk.LEFT, padx=5)
        self.path_entry = customtkinter.CTkEntry(path_frame, width=50, font=font_b)
        self.path_entry.insert(0, os.path.join(os.getcwd(), 'sample_audio'))
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        customtkinter.CTkButton(path_frame, text="Browse", command=self.browse, font=font).pack(side=tk.LEFT, padx=5)

        # Language row. The entry shows placeholder text that is cleared
        # on first focus (thanks to pommicket from Stackoverflow for this fix).
        def on_entry_click(event):
            """Clear the placeholder the first time the entry is clicked."""
            global firstclick
            if firstclick:  # module-level flag: only clear once per run
                firstclick = False
                self.language_entry.delete(0, "end")

        language_frame = customtkinter.CTkFrame(master)
        language_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(language_frame, text="Language:", font=font).pack(side=tk.LEFT, padx=5)
        self.language_entry = customtkinter.CTkEntry(language_frame, width=50, font=('Roboto', 12, 'italic'))
        self.default_language_text = "Enter language (or ignore to auto-detect)"
        self.language_entry.insert(0, self.default_language_text)
        self.language_entry.bind('<FocusIn>', on_entry_click)
        self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Model row: standard Whisper sizes plus KB-Whisper Swedish models,
        # separated by a visual (non-selectable) divider entry.
        models = ['tiny', 'tiny.en', 'base', 'base.en',
                  'small', 'small.en', 'medium', 'medium.en',
                  'large-v2', 'large-v3',
                  '───────────────',
                  'KB Swedish (tiny)', 'KB Swedish (base)',
                  'KB Swedish (small)', 'KB Swedish (medium)',
                  'KB Swedish (large)']
        model_frame = customtkinter.CTkFrame(master)
        model_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(model_frame, text="Model:", font=font).pack(side=tk.LEFT, padx=5)
        self.model_combobox = customtkinter.CTkComboBox(
            model_frame, width=50, state="readonly",
            values=models, font=font_b)
        self.model_combobox.set('medium')  # Set the default value
        self.model_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Indeterminate progress bar; packed only while transcribing.
        self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')

        # Action buttons.
        button_frame = customtkinter.CTkFrame(master)
        button_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        self.transcribe_button = customtkinter.CTkButton(button_frame, text="Transcribe", command=self.start_transcription, font=font)
        self.transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        customtkinter.CTkButton(button_frame, text="Quit", command=master.quit, font=font).pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

        # ── Embedded console / log panel ──────────────────────────────────
        log_label = customtkinter.CTkLabel(master, text="Console output", font=font, anchor='w')
        log_label.pack(fill=tk.X, padx=12, pady=(8, 0))
        self.log_box = customtkinter.CTkTextbox(master, height=220, font=('Consolas', 14),
                                                wrap='word', state='disabled',
                                                fg_color='#1e1e1e', text_color='#e0e0e0')
        self.log_box.pack(fill=tk.BOTH, expand=True, padx=10, pady=(2, 10))

        # Redirect stdout & stderr into the log panel (no backend console).
        sys.stdout = _ConsoleRedirector(self.log_box)
        sys.stderr = _ConsoleRedirector(self.log_box)

        # Welcome message (printed after the redirect so it lands in the panel).
        print("Welcome to Local Transcribe with Whisper! \U0001f600")
        print("Transcriptions will be saved automatically.")
        print("─" * 46)

    def browse(self):
        """Open a directory picker and put the choice into the path entry."""
        initial_dir = os.getcwd()
        folder_path = filedialog.askdirectory(initialdir=initial_dir)
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def start_transcription(self):
        """Disable the button and run the transcription on a worker thread."""
        self.transcribe_button.configure(state=tk.DISABLED)
        # NOTE(review): the thread is non-daemon, so quitting mid-run waits
        # for the transcription to finish — confirm this is intended.
        threading.Thread(target=self.transcribe_thread).start()

    def transcribe_thread(self):
        """Worker: read the form, call transcribe(), and report the outcome."""
        path = self.path_entry.get()
        model_display = self.model_combobox.get()
        # Ignore the visual separator entry in the model dropdown.
        if model_display.startswith('─'):
            messagebox.showinfo("Invalid selection", "Please select a model, not the separator line.")
            self.transcribe_button.configure(state=tk.NORMAL)
            return
        # Map display names (KB models) to HF hub ids; plain names pass through.
        model = HF_MODEL_MAP.get(model_display, model_display)
        language = self.language_entry.get()
        is_kb_model = model_display.startswith('KB Swedish')
        if is_kb_model:
            # KB-Whisper models are Swedish-only; force the language.
            language = 'sv'
        elif language == self.default_language_text or not language.strip():
            language = None  # same as passing nothing: auto-detect
        verbose = True  # always show transcription progress in the console panel
        # Show the (indeterminate) progress bar while the work runs.
        self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
        self.progress_bar.start()
        # Resolve the folder into the list of candidate files.
        glob_file = get_path(path)
        try:
            output_text = transcribe(path, glob_file, model, language, verbose)
        except UnboundLocalError:
            messagebox.showinfo("Files not found error!", 'Nothing found, choose another folder.')
        except ValueError:
            # BUG FIX: the message was previously passed as the dialog
            # *title*, leaving the dialog body empty.
            messagebox.showinfo(
                "Invalid language",
                "Invalid language name, you might have to clear the default text to continue!")
        # Hide the progress bar and re-enable the button in every case.
        self.progress_bar.stop()
        self.progress_bar.pack_forget()
        self.transcribe_button.configure(state=tk.NORMAL)
        # output_text is unbound when transcribe() raised above — skip the
        # success dialog in that case.
        try:
            messagebox.showinfo("Finished!", output_text)
        except UnboundLocalError:
            pass
|
||||
|
||||
if __name__ == "__main__":
    # Build the themed root window.
    main_window = customtkinter.CTk()
    main_window.title("Local Transcribe with Whisper")
    # Taller default size to accommodate the embedded console panel.
    initial_width, initial_height = 550, 560
    main_window.geometry(f"{initial_width}x{initial_height}")
    main_window.minsize(450, 480)
    # Window icon.
    main_window.iconbitmap('images/icon.ico')
    # Hand control to the app and enter the Tk event loop.
    app = App(main_window)
    main_window.mainloop()
|
||||
@@ -0,0 +1,20 @@
|
||||
from cx_Freeze import setup, Executable

# cx_Freeze build configuration: explicitly bundle the packages the GUI
# imports so the frozen executable is self-contained.
build_exe_options = {
    "packages": ['faster_whisper', 'tkinter', 'customtkinter'],
}

# A single executable: the customtkinter GUI entry point.
executables = [
    Executable(
        "app.py",
        icon='images/icon.ico',
    ),
]

setup(
    name="Local Transcribe with Whisper",
    version="2.0",
    author="Kristofer Rolf Söderström",
    options={"build_exe": build_exe_options},
    executables=executables,
)
|
||||
@@ -1,123 +1,125 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "a2cd4050",
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transcribe import transcribe"
|
||||
"# Local Transcribe with Whisper\n",
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "24e1d24e",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Help on function transcribe in module transcribe:\n",
|
||||
"Help on function transcribe in module src._LocalTranscribe:\n",
|
||||
"\n",
|
||||
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
|
||||
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
|
||||
"transcribe(path, glob_file, model=None, language=None, verbose=False)\n",
|
||||
" Transcribes audio files in a specified folder using OpenAI's Whisper model.\n",
|
||||
" \n",
|
||||
" Args:\n",
|
||||
" path (str): Path to the folder containing the audio files.\n",
|
||||
" glob_file (list): List of audio file paths to transcribe.\n",
|
||||
" model (str, optional): Name of the Whisper model to use for transcription.\n",
|
||||
" Defaults to None, which uses the default model.\n",
|
||||
" language (str, optional): Language code for transcription. Defaults to None,\n",
|
||||
" which enables automatic language detection.\n",
|
||||
" verbose (bool, optional): If True, enables verbose mode with detailed information\n",
|
||||
" during the transcription process. Defaults to False.\n",
|
||||
" \n",
|
||||
" Returns:\n",
|
||||
" str: A message indicating the result of the transcription process.\n",
|
||||
" \n",
|
||||
" Raises:\n",
|
||||
" RuntimeError: If an invalid file is encountered, it will be skipped.\n",
|
||||
" \n",
|
||||
" Notes:\n",
|
||||
" - The function downloads the specified model if not available locally.\n",
|
||||
" - The transcribed text files will be saved in a \"transcriptions\" folder\n",
|
||||
" within the specified path.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Import the modules and get the docstring\n",
|
||||
"from src._LocalTranscribe import transcribe, get_path\n",
|
||||
"help(transcribe)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e52477fb",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path='sample_audio/'#folder path\n",
|
||||
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
|
||||
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
|
||||
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
|
||||
"verbose = True # prints output while transcribing, False to deactivate"
|
||||
"# Set the variables\n",
|
||||
"path='sample_audio/'# Folder path\n",
|
||||
"model='small' # Model size\n",
|
||||
"language= None # Preset language, None for automatic detection\n",
|
||||
"verbose = True # Output transcription in realtime\n",
|
||||
"\n",
|
||||
"# Get glob file, additional step for app version.\n",
|
||||
"\n",
|
||||
"glob_file = get_path(path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "d66866af",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
|
||||
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
|
||||
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
|
||||
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
|
||||
"\n",
|
||||
"There are 2 ogg files in path: sample_audio/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Loading model...\n",
|
||||
"Transcribing file number number 1: Armstrong_Small_Step\n",
|
||||
"Model and file loaded...\n",
|
||||
"Starting transcription...\n",
|
||||
"\n",
|
||||
"Trying to transcribe file named: Armstrong_Small_Step🕐\n",
|
||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||
"Detected language: English\n",
|
||||
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
|
||||
"\n",
|
||||
"Finished file number 1.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
|
||||
"Model and file loaded...\n",
|
||||
"Starting transcription...\n",
|
||||
"[00:00.000 --> 00:07.000] I'm going to step off the limb now.\n",
|
||||
"[00:07.000 --> 00:18.000] That's one small step for man.\n",
|
||||
"[00:18.000 --> 00:24.000] One giant leap for mankind.\n",
|
||||
"\n",
|
||||
"Trying to transcribe file named: Axel_Pettersson_röstinspelning🕐\n",
|
||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||
"Detected language: Swedish\n",
|
||||
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
||||
"[00:00.000 --> 00:06.140] Hej, jag heter Axel Pettersson. Jag följer bror 1976.\n",
|
||||
"[00:06.400 --> 00:15.100] Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
||||
"\n",
|
||||
"Finished file number 2.\n",
|
||||
"Trying to transcribe file named: readme🕐\n",
|
||||
"Not a valid file, skipping.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
"Trying to transcribe file named: transcriptions🕐\n",
|
||||
"Not a valid file, skipping.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Finished transcription, files can be found in sample_audio/transcriptions'"
|
||||
"'Finished transcription, 2 files can be found in sample_audio//transcriptions'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"transcribe(path, file_type, model, language, verbose)"
|
||||
"# Run the script\n",
|
||||
"transcribe(path, glob_file, model, language, verbose)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0bc67265",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@@ -132,8 +134,9 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
||||
@@ -1,231 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "eba9e610",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "85cd2d12",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Whisper(\n",
|
||||
" (encoder): AudioEncoder(\n",
|
||||
" (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
|
||||
" (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
|
||||
" (blocks): ModuleList(\n",
|
||||
" (0-23): 24 x ResidualAttentionBlock(\n",
|
||||
" (attn): MultiHeadAttention(\n",
|
||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (mlp): Sequential(\n",
|
||||
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
||||
" (1): GELU(approximate='none')\n",
|
||||
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" )\n",
|
||||
" (decoder): TextDecoder(\n",
|
||||
" (token_embedding): Embedding(51865, 1024)\n",
|
||||
" (blocks): ModuleList(\n",
|
||||
" (0-23): 24 x ResidualAttentionBlock(\n",
|
||||
" (attn): MultiHeadAttention(\n",
|
||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (cross_attn): MultiHeadAttention(\n",
|
||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (mlp): Sequential(\n",
|
||||
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
||||
" (1): GELU(approximate='none')\n",
|
||||
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
||||
" )\n",
|
||||
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import whisper\n",
|
||||
"#change to model size, bigger is more accurate but slower\n",
|
||||
"whisper.load_model(\"medium\") #base, small, medium, large"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0d2acd54",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#after it loads, you can disconnect from the internet and run the rest"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "a2cd4050",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transcribe import transcribe"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "24e1d24e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Help on function transcribe in module transcribe:\n",
|
||||
"\n",
|
||||
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
|
||||
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"help(transcribe)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "e52477fb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path='sample_audio/'#folder path\n",
|
||||
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
|
||||
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
|
||||
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
|
||||
"verbose = True # prints output while transcribing, False to deactivate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "d66866af",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
|
||||
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
|
||||
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
|
||||
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
|
||||
"\n",
|
||||
"There are 2 ogg files in path: sample_audio/\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Loading model...\n",
|
||||
"Transcribing file number number 1: Armstrong_Small_Step\n",
|
||||
"Model and file loaded...\n",
|
||||
"Starting transcription...\n",
|
||||
"\n",
|
||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||
"Detected language: English\n",
|
||||
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
|
||||
"\n",
|
||||
"Finished file number 1.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
|
||||
"Model and file loaded...\n",
|
||||
"Starting transcription...\n",
|
||||
"\n",
|
||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||
"Detected language: Swedish\n",
|
||||
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
||||
"\n",
|
||||
"Finished file number 2.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Finished transcription, files can be found in sample_audio/transcriptions'"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"transcribe(path, file_type, model, language, verbose)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0bc67265",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
|
Before Width: | Height: | Size: 29 KiB |
|
Before Width: | Height: | Size: 135 KiB After Width: | Height: | Size: 135 KiB |
|
Before Width: | Height: | Size: 324 KiB After Width: | Height: | Size: 324 KiB |
|
After Width: | Height: | Size: 12 KiB |
|
After Width: | Height: | Size: 1.8 KiB |
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
Installer script for Local Transcribe with Whisper.
|
||||
Detects NVIDIA GPU and offers to install GPU acceleration support.
|
||||
|
||||
Usage:
|
||||
python install.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import shutil
|
||||
import site
|
||||
|
||||
|
||||
def detect_nvidia_gpu():
    """Probe for an NVIDIA GPU via ``nvidia-smi``.

    Returns:
        tuple: ``(True, gpu_name)`` when a GPU answers the query,
        otherwise ``(False, None)``.
    """
    locations = (
        shutil.which("nvidia-smi"),
        r"C:\Windows\System32\nvidia-smi.exe",
        r"C:\Program Files\NVIDIA Corporation\NVSMI\nvidia-smi.exe",
    )
    # Keep only candidates that actually exist on disk.
    usable = [loc for loc in locations if loc and os.path.isfile(loc)]
    for smi in usable:
        try:
            proc = subprocess.run(
                [smi, "--query-gpu=name", "--format=csv,noheader"],
                capture_output=True, text=True, timeout=10,
            )
        except Exception:
            # Broken/hung binary — try the next candidate.
            continue
        name = proc.stdout.strip()
        if proc.returncode == 0 and name:
            # First line is the name of GPU 0.
            return True, name.splitlines()[0]
    return False, None
|
||||
|
||||
|
||||
def pip_install(*packages):
    """Run ``pip install`` for *packages* with the current interpreter.

    Echoes the full command before running it; raises
    ``subprocess.CalledProcessError`` if pip exits non-zero.
    """
    cmd = [sys.executable, "-m", "pip", "install", *packages]
    banner = " ".join(cmd)
    print(f"\n> {banner}\n")
    subprocess.check_call(cmd)
|
||||
|
||||
|
||||
def get_site_packages():
    """Return this interpreter's preferred site-packages directory.

    Prefers the entry literally named ``site-packages``; falls back to the
    first entry reported by :func:`site.getsitepackages`.
    """
    dirs = site.getsitepackages()
    return next((d for d in dirs if d.endswith("site-packages")), dirs[0])
|
||||
|
||||
|
||||
def create_nvidia_pth():
    """Create a .pth startup hook that registers NVIDIA DLL directories.

    ``site.py`` executes any ``.pth`` line that starts with ``import`` at
    interpreter startup, before user code runs.  The generated one-liner
    scans ``site-packages/nvidia/*/bin`` and ``.../lib`` and, on Windows,
    registers each directory with ``os.add_dll_directory`` AND prepends it
    to ``PATH`` (some native extensions still resolve DLLs via PATH).

    Bug fixed: the previous one-liner chained the two actions with ``or``,
    but ``os.add_dll_directory`` returns a truthy handle, so the PATH
    update was always short-circuited away.  Both actions are now collected
    into a tuple so each one executes.
    """
    sp = get_site_packages()
    pth_path = os.path.join(sp, "nvidia_cuda_path.pth")
    # repr() yields a correctly-escaped Python string literal for the path
    # (the old r'...' + manual quote-escaping broke on quotes/backslashes).
    sp_literal = repr(sp)
    # This one-liner runs at Python startup, before any user code.
    pth_content = (
        "import os, glob as g; "
        "[(os.add_dll_directory(d), "
        "os.environ.__setitem__('PATH', d + os.pathsep + os.environ.get('PATH',''))) "
        "for d in g.glob(os.path.join(" + sp_literal + ", 'nvidia', '*', 'bin')) "
        "+ g.glob(os.path.join(" + sp_literal + ", 'nvidia', '*', 'lib')) "
        "if os.path.isdir(d)] if os.name == 'nt' else None\n"
    )
    with open(pth_path, "w") as f:
        f.write(pth_content)
    print(f" Created CUDA startup hook: {pth_path}")
|
||||
|
||||
|
||||
def verify_cuda():
    """Verify CUDA works by probing ctranslate2 in a fresh subprocess.

    A clean child process is used so the check reflects what a newly
    started app will see (DLL hooks, drivers), not this process's state.
    """
    probe = (
        "import ctranslate2; "
        "print('float16' in ctranslate2.get_supported_compute_types('cuda'))"
    )
    try:
        result = subprocess.run(
            [sys.executable, "-c", probe],
            capture_output=True, text=True, timeout=30,
        )
    except Exception:
        return False
    return result.stdout.strip() == "True"
|
||||
|
||||
|
||||
def main():
    """Interactive installer: base requirements first, optional GPU extras second."""
    header = "=" * 55
    print(header)
    print(" Local Transcribe with Whisper — Installer")
    print(header)

    # Step 1: Base packages
    print("\n[1/2] Installing base requirements...")
    pip_install("-r", "requirements.txt")
    print("\n Base requirements installed!")

    # Step 2: GPU
    print("\n[2/2] Checking for NVIDIA GPU...")
    has_gpu, gpu_name = detect_nvidia_gpu()

    if not has_gpu:
        print("\n No NVIDIA GPU detected — using CPU mode.")
    else:
        print(f"\n NVIDIA GPU detected: {gpu_name}")
        print(" GPU acceleration can make transcription 2-5x faster.")
        print(" This will install ~300 MB of additional CUDA libraries.\n")
        _prompt_and_install_gpu()

    print("\n" + "=" * 55)
    print(" Done! Run the app with: python app.py")
    print("=" * 55)


def _prompt_and_install_gpu():
    """Ask the user until they answer yes/no; install CUDA extras on yes."""
    while True:
        answer = input(" Install GPU support? [Y/n]: ").strip().lower()
        if answer in ("", "y", "yes"):
            print("\n Installing CUDA libraries...")
            pip_install("nvidia-cublas-cu12", "nvidia-cudnn-cu12")
            create_nvidia_pth()
            print("\n Verifying CUDA...")
            if verify_cuda():
                print(" GPU support verified and working!")
            else:
                print(" WARNING: CUDA installed but not detected.")
                print(" Update your NVIDIA drivers and try again.")
            return
        if answer in ("n", "no"):
            print("\n Skipping GPU. Re-run install.py to add it later.")
            return
        print(" Please enter Y or N.")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,2 @@
|
||||
faster-whisper
|
||||
customtkinter
|
||||
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
# ============================================================
# Local Transcribe with Whisper — macOS / Linux launcher
# ============================================================
# Double-click this file or run: ./run_Mac.sh
# On first run it creates a venv and installs dependencies.
# ============================================================

# Abort immediately if any command below fails.
set -e

# Run from the script's own directory so the relative paths
# (.venv, install.py, app.py) resolve regardless of the caller's cwd.
cd "$(dirname "$0")"

# Create .venv if it doesn't exist
if [ ! -f ".venv/bin/python" ]; then
    echo "Creating virtual environment..."
    python3 -m venv .venv
fi

# Always use the venv's interpreter, never the system one.
PYTHON=".venv/bin/python"

# Install dependencies on first run.
# faster_whisper being importable is used as the marker that
# install.py has already completed successfully.
if ! "$PYTHON" -c "import faster_whisper" 2>/dev/null; then
    echo "First run detected — running installer..."
    "$PYTHON" install.py
    echo
fi

echo "Starting Local Transcribe..."
"$PYTHON" app.py
||||
@@ -1,5 +1,23 @@
|
||||
@echo off
|
||||
echo Starting...
|
||||
call conda activate base
|
||||
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
|
||||
call python GUI.py
|
||||
REM Create .venv on first run if it doesn't exist
|
||||
if not exist ".venv\Scripts\python.exe" (
|
||||
echo Creating virtual environment...
|
||||
python -m venv .venv
|
||||
if errorlevel 1 (
|
||||
echo ERROR: Failed to create virtual environment. Is Python installed and on PATH?
|
||||
pause
|
||||
exit /b 1
|
||||
)
|
||||
)
|
||||
|
||||
set PYTHON=.venv\Scripts\python.exe
|
||||
|
||||
REM Check if dependencies are installed
|
||||
%PYTHON% -c "import faster_whisper" 2>nul
|
||||
if errorlevel 1 (
|
||||
echo First run detected - running installer...
|
||||
%PYTHON% install.py
|
||||
echo.
|
||||
)
|
||||
echo Starting Local Transcribe...
|
||||
%PYTHON% app.py
|
||||
@@ -1,5 +1,2 @@
|
||||
Armstrong_Small_Step
|
||||
In seconds:
|
||||
[0.00 --> 7.00]: I'm going to step off the limb now.
|
||||
[7.00 --> 18.00]: That's one small step for man.
|
||||
[18.00 --> 24.00]: One giant leap for mankind.
|
||||
[0:00:00 --> 0:00:07]: That's one small step for man, one giant leap for mankind.
|
||||
@@ -1,4 +1,2 @@
|
||||
Axel_Pettersson_röstinspelning
|
||||
In seconds:
|
||||
[0.00 --> 6.14]: Hej, jag heter Axel Pettersson. Jag följer bror 1976.
|
||||
[6.40 --> 15.10]: Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
|
||||
[0:00:00 --> 0:00:15]: Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
|
||||
@@ -0,0 +1,149 @@
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import site
|
||||
from glob import glob
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CUDA setup — must happen before importing faster_whisper / ctranslate2
|
||||
# ---------------------------------------------------------------------------
|
||||
def _setup_cuda_dlls():
|
||||
"""Add NVIDIA pip-package DLL dirs to the DLL search path (Windows only).
|
||||
|
||||
pip-installed nvidia-cublas-cu12 / nvidia-cudnn-cu12 place their .dll
|
||||
files inside the site-packages tree. Python 3.8+ on Windows does NOT
|
||||
search PATH for DLLs loaded via ctypes/LoadLibrary, so we must
|
||||
explicitly register every nvidia/*/bin and nvidia/*/lib directory using
|
||||
os.add_dll_directory *and* prepend them to PATH (some native extensions
|
||||
still rely on PATH).
|
||||
"""
|
||||
if sys.platform != "win32":
|
||||
return
|
||||
try:
|
||||
for sp in site.getsitepackages():
|
||||
nvidia_root = os.path.join(sp, "nvidia")
|
||||
if not os.path.isdir(nvidia_root):
|
||||
continue
|
||||
for pkg in os.listdir(nvidia_root):
|
||||
for sub in ("bin", "lib"):
|
||||
d = os.path.join(nvidia_root, pkg, sub)
|
||||
if os.path.isdir(d):
|
||||
os.environ["PATH"] = d + os.pathsep + os.environ.get("PATH", "")
|
||||
try:
|
||||
os.add_dll_directory(d)
|
||||
except (OSError, AttributeError):
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
_setup_cuda_dlls()
|
||||
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
|
||||
def _detect_device():
|
||||
"""Return (device, compute_type) for the best available backend."""
|
||||
try:
|
||||
import ctranslate2
|
||||
cuda_types = ctranslate2.get_supported_compute_types("cuda")
|
||||
if "float16" in cuda_types:
|
||||
return "cuda", "float16"
|
||||
except Exception:
|
||||
pass
|
||||
return "cpu", "int8"
|
||||
|
||||
|
||||
# Get the path
def get_path(path):
    """Return every entry directly inside *path* (files and folders alike)."""
    pattern = path + '/*'
    return glob(pattern)
|
||||
|
||||
# Main function
def transcribe(path, glob_file, model=None, language=None, verbose=False):
    """
    Transcribes audio files in a specified folder using faster-whisper (CTranslate2).

    Args:
        path (str): Path to the folder containing the audio files; the
            output folder ``<path>/transcriptions`` is created inside it.
        glob_file (list): List of audio file paths to transcribe.
        model (str, optional): Name of the Whisper model size to use for
            transcription (e.g. 'small', 'medium'). Defaults to None.
        language (str, optional): Language code for transcription. Defaults
            to None, which enables automatic language detection.
        verbose (bool, optional): If True, prints each segment as it is
            decoded; otherwise shows a one-line progress counter.
            Defaults to False.

    Returns:
        str: A message indicating the result of the transcription process.

    Notes:
        - The function downloads the specified model if not available locally.
        - One timestamped .txt per input is saved in ``<path>/transcriptions``.
        - Entries that cannot be decoded (directories, non-audio files) are
          skipped with a warning; no exception propagates to the caller.
        - Uses CTranslate2 for up to 4x faster inference compared to openai-whisper.
        - FFmpeg is bundled via the PyAV dependency — no separate installation needed.
    """
    SEP = "─" * 46

    # ── Step 1: Detect hardware ──────────────────────────────────────
    device, compute_type = _detect_device()
    print(f"⚙ Device: {device} | Compute: {compute_type}")

    # ── Step 2: Load model ───────────────────────────────────────────
    print(f"⏳ Loading model '{model}' — downloading if needed...")
    whisper_model = WhisperModel(model, device=device, compute_type=compute_type)
    print("✅ Model ready!")
    print(SEP)

    # ── Step 3: Transcribe files ─────────────────────────────────────
    total_files = len(glob_file)
    print(f"📂 Found {total_files} item(s) in folder")
    print(SEP)

    files_transcripted = []
    for file_num, file in enumerate(glob_file, start=1):
        # splitext (not split('.')) keeps the full stem of dotted names
        # such as "interview.part1.ogg" -> "interview.part1".
        title = os.path.splitext(os.path.basename(file))[0]
        print(f"\n{'─' * 46}")
        print(f"📄 File {file_num}/{total_files}: {title}")
        try:
            segments, info = whisper_model.transcribe(
                file,
                language=language,
                beam_size=5
            )
            # Make folder if missing
            os.makedirs('{}/transcriptions'.format(path), exist_ok=True)
            # Stream segments as they are decoded
            segment_list = []
            with open("{}/transcriptions/{}.txt".format(path, title), 'w', encoding='utf-8') as f:
                f.write(title)
                for seg in segments:
                    start_ts = str(datetime.timedelta(seconds=seg.start))
                    end_ts = str(datetime.timedelta(seconds=seg.end))
                    f.write('\n[{} --> {}]:{}'.format(start_ts, end_ts, seg.text))
                    # Flush so a partially-written transcript survives interruption.
                    f.flush()
                    if verbose:
                        print(" [%.2fs → %.2fs] %s" % (seg.start, seg.end, seg.text))
                    else:
                        print(" Transcribed up to %.0fs..." % seg.end, end='\r')
                    segment_list.append(seg)
            print(f"✅ Done — saved to transcriptions/{title}.txt")
            files_transcripted.append(segment_list)
        except Exception:
            # Best-effort: directories and undecodable files land here.
            print('⚠ Not a valid audio/video file, skipping.')

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n{SEP}")
    if files_transcripted:
        output_text = f"✅ Finished! {len(files_transcripted)} file(s) transcribed.\n Saved in: {path}/transcriptions"
    else:
        output_text = '⚠ No files eligible for transcription — try another folder.'
    print(output_text)
    print(SEP)
    return output_text
|
||||
@@ -1,56 +0,0 @@
|
||||
import whisper
|
||||
import glob, os
|
||||
#import torch #uncomment if using torch with cuda, below too
|
||||
import datetime
|
||||
|
||||
def transcribe(path, file_type, model=None, language=None, verbose=False):
    '''Transcribe every *file_type* audio file in *path* with OpenAI's whisper.

    Downloads the requested model size if not cached, transcribes each
    matching file, and writes one timestamped .txt per input into
    ``<path>/transcriptions``.

    Args:
        path (str): folder containing the audio files.
        file_type (str): filename suffix to match, e.g. 'ogg' or 'mp3'.
        model (str, optional): whisper model size name passed to
            whisper.load_model ('base', 'small', 'medium', 'large').
        language (str, optional): language hint; None lets whisper
            auto-detect per file.
        verbose (bool, optional): forwarded to whisper; True prints
            segments as they are decoded.

    Returns:
        str: summary message naming the output folder.
    '''

    # Ensure the output folder exists; an already-existing folder is fine.
    try:
        os.mkdir('{}/transcriptions'.format(path))
    except FileExistsError:
        pass

    # Collect every file in `path` whose name ends with `file_type`.
    glob_file = glob.glob(path+'/*{}'.format(file_type))

    #if torch.cuda.is_available():
    #    generator = torch.Generator('cuda').manual_seed(42)
    #else:
    #    generator = torch.Generator().manual_seed(42)

    print('Using {} model'.format(model))
    print('File type is {}'.format(file_type))
    print('Language is being detected automatically for each file')
    print('Verbosity is set to {}'.format(verbose))
    print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path))

    print('Loading model...')
    # NOTE(review): rebinds the `model` parameter (a size name) to the
    # loaded model object from here on.
    model = whisper.load_model(model)

    for idx,file in enumerate(glob_file):
        # Title = basename up to the FIRST dot, so dotted names are truncated.
        title = os.path.basename(file).split('.')[0]

        print('Transcribing file number number {}: {}'.format(idx+1,title))
        print('Model and file loaded...\nStarting transcription...\n')
        result = model.transcribe(
            file,
            language=language,
            verbose=verbose
        )
        # Collect per-segment start/end timestamps (rendered as H:MM:SS
        # timedelta strings) and the segment text.
        start=[]
        end=[]
        text=[]
        for i in range(len(result['segments'])):
            start.append(str(datetime.timedelta(seconds=(result['segments'][i]['start']))))
            end.append(str(datetime.timedelta(seconds=(result['segments'][i]['end']))))
            text.append(result['segments'][i]['text'])

        # Write the transcript: title header, then one "[start --> end]: text"
        # line per segment.  NOTE(review): the header says "In seconds" but the
        # timestamps written below are H:MM:SS timedeltas.
        # NOTE(review): `file` (the loop variable) is rebound to the handle
        # here; harmless, since the loop reassigns it on the next iteration.
        with open("{}/transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
            file.write(title)
            file.write('\nIn seconds:')
            for i in range(len(result['segments'])):
                file.writelines('\n[{} --> {}]:{}'.format(start[i], end[i], text[i]))

        print('\nFinished file number {}.\n\n\n'.format(idx+1))

    return 'Finished transcription, files can be found in {}/transcriptions'.format(path)
|
||||