Complete rework for GUI, experimental EXE file and other minor changes, see readme for more info
@@ -4,8 +4,8 @@ authors:
|
|||||||
- family-names: "Söderström"
|
- family-names: "Söderström"
|
||||||
given-names: "Kristofer Rolf"
|
given-names: "Kristofer Rolf"
|
||||||
orcid: "https://orcid.org/0000-0002-5322-3350"
|
orcid: "https://orcid.org/0000-0002-5322-3350"
|
||||||
title: "transcribe"
|
title: "Local Transcribe"
|
||||||
version: 1.1.1
|
version: 1.2
|
||||||
doi: 10.5281/zenodo.7760511
|
doi: 10.5281/zenodo.7760510
|
||||||
date-released: 2023-03-22
|
date-released: 2023-03-22
|
||||||
url: "https://github.com/soderstromkr/transcribe"
|
url: "https://github.com/soderstromkr/transcribe"
|
||||||
|
|||||||
100
GUI.py
@@ -1,100 +0,0 @@
|
|||||||
import tkinter as tk
|
|
||||||
from tkinter import ttk
|
|
||||||
from tkinter import filedialog
|
|
||||||
from tkinter import messagebox
|
|
||||||
from transcribe import transcribe
|
|
||||||
from ttkthemes import ThemedTk
|
|
||||||
import whisper
|
|
||||||
import numpy as np
|
|
||||||
import glob, os
|
|
||||||
|
|
||||||
|
|
||||||
class App:
    """Tkinter front end for the folder-based ``transcribe`` function.

    The window offers a folder path, a file type, a model name and a
    verbosity switch, plus buttons to run the transcription or quit.
    """

    def __init__(self, master):
        """Build every widget inside *master* (the root window)."""
        self.master = master
        master.title("Local Transcribe")

        # One shared look for every ttk widget class used below.
        style = ttk.Style()
        for widget_class in ('TLabel', 'TEntry', 'TButton', 'TCheckbutton'):
            style.configure(widget_class, font=('Arial', 10), padding=10)

        # Folder path row; the Browse button sits to the right of the entry.
        self.path_entry = self._add_entry_row("Folder Path:", 'sample_audio/')
        browse_button = ttk.Button(
            self.path_entry.master, text="Browse", command=self.browse)
        browse_button.pack(side=tk.LEFT, padx=5)

        # File type and model rows.
        self.file_type_entry = self._add_entry_row("File Type:", 'ogg')
        self.model_entry = self._add_entry_row("Model:", 'small')

        # There is no language row: transcribe() below always passes
        # language=None, which requests automatic detection.

        # Verbose switch
        verbose_frame = ttk.Frame(master, padding=10)
        verbose_frame.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        verbose_checkbutton = ttk.Checkbutton(
            verbose_frame, text="Verbose", variable=self.verbose_var)
        verbose_checkbutton.pack(side=tk.LEFT, padx=5)

        # Action buttons
        button_frame = ttk.Frame(master, padding=10)
        button_frame.pack(fill=tk.BOTH)
        transcribe_button = ttk.Button(
            button_frame, text="Transcribe Audio", command=self.transcribe)
        transcribe_button.pack(
            side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        quit_button = ttk.Button(button_frame, text="Quit", command=master.quit)
        quit_button.pack(
            side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    def _add_entry_row(self, label_text, default_text):
        """Pack a frame holding a label and a pre-filled entry; return the entry."""
        row = ttk.Frame(self.master, padding=10)
        row.pack(fill=tk.BOTH)
        ttk.Label(row, text=label_text).pack(side=tk.LEFT, padx=5)
        entry = ttk.Entry(row, width=50)
        entry.insert(0, default_text)
        entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        return entry

    def browse(self):
        """Let the user pick a folder and put it in the path entry."""
        folder_path = filedialog.askdirectory()
        if not folder_path:
            # Dialog was cancelled: keep whatever path is already entered
            # instead of replacing it with an empty string.
            return
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def transcribe(self):
        """Run the transcription with the current form values and show the result.

        The call runs on the GUI thread, so the window does not respond
        until it returns.
        """
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        language = None  # set to auto-detect
        verbose = self.verbose_var.get()

        # Inside this method the bare name still resolves to the
        # module-level transcribe function imported at the top of the file.
        result = transcribe(path, file_type, model=model,
                            language=language, verbose=verbose)

        # Show the result in a message box
        messagebox.showinfo("Finished!", result)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Themed root window; the un-themed alternative would be tk.Tk().
    window = ThemedTk(theme="clearlooks")
    app = App(window)
    window.mainloop()
|
|
||||||
9
Mac_instructions.md
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
### How to run on Mac
|
||||||
|
Unfortunately, I have not found a permanent solution for this; not being a Mac user has limited the ways I can test this.
|
||||||
|
#### Recommended steps
|
||||||
|
1. Open a terminal and navigate to the root folder (the downloaded folder).
|
||||||
|
1. You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
|
||||||
|
2. Run the following command:
|
||||||
|
```
|
||||||
|
python app.py
|
||||||
|
```
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
### How to run on Mac
|
|
||||||
Unfortunately, I have not found a permament solution for this, not being a Mac user has limited the ways I can test this. For now, these are the recommended steps for a beginner user:
|
|
||||||
1. Open a terminal and navigate to the root folder (transcribe-main if you downloaded the folder). You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
|
|
||||||
2. Run the following command:
|
|
||||||
python GUI.py
|
|
||||||
122
README.md
@@ -1,71 +1,75 @@
|
|||||||
## Local Transcribe
|
## Local Transcribe with Whisper
|
||||||
|
Local Transcribe with Whisper is a user-friendly desktop application that allows you to transcribe audio and video files using the Whisper ASR system. This application provides a graphical user interface (GUI) built with Python and the Tkinter library, making it easy to use even for those not familiar with programming.
|
||||||
|
|
||||||
Local Transcribe uses OpenAI's Whisper to transcribe audio files from your local folders, creating text files on disk.
|
## New in version 1.2!
|
||||||
|
1. Simpler usage:
|
||||||
|
1. File type: You no longer need to specify file type. The program will only transcribe eligible files.
|
||||||
|
2. Language: Added option to specify language, which might help in some cases. Clear the default text to run automatic language recognition.
|
||||||
|
3. Model selection: Now a dropdown option that includes most models for typical use.
|
||||||
|
2. New and improved GUI.
|
||||||
|

|
||||||
|
3. Executable: On Windows and don't want to install python? Try the Exe file! See below for instructions (Experimental)
|
||||||
|
|
||||||
## Note
|
## Features
|
||||||
|
* Select the folder containing the audio or video files you want to transcribe. Tested with m4a video.
|
||||||
This implementation and guide is mostly made for researchers not familiar with programming that want a way to transcribe their files locally, without internet connection, usually required within ethical data practices and frameworks. Two examples are shown, a normal workflow with internet connection. And one in which the model is loaded first, via openai-whisper, and then the transcription can be done without being connected to the internet. There is now also a GUI implementation, read below for more information.
|
* Choose the language of the files you are transcribing. You can either select a specific language or let the application automatically detect the language.
|
||||||
|
* Select the Whisper model to use for the transcription. Available models include "base.en", "base", "small.en", "small", "medium.en", "medium", and "large". Models with .en ending are better if you're transcribing English, especially the base and small models.
|
||||||
### Instructions
|
* Enable the verbose mode to receive detailed information during the transcription process.
|
||||||
|
* Monitor the progress of the transcription with the progress bar and terminal.
|
||||||
#### Requirements
|
* Confirmation dialog before starting the transcription to ensure you have selected the correct folder.
|
||||||
|
* View the transcribed text in a message box once the transcription is completed.
|
||||||
1. This script was made and tested in an Anaconda environment with Python 3.10. I recommend this method if you're not familiar with Python.
|
|
||||||
See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights.
|
|
||||||
|
|
||||||
2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files."
|
|
||||||
Users might not need to specifically install Transfomers. However, a conda installation might be needed for ffmpeg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following:
|
|
||||||
|
|
||||||
```
|
|
||||||
conda install -c conda-forge ffmpeg-python
|
|
||||||
```
|
|
||||||
|
|
||||||
3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via:
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install -U openai-whisper
|
|
||||||
```
|
|
||||||
|
|
||||||
4. There is an option to run a batch file, which launches a GUI built on TKinter and TTKthemes. If using these options, make sure they are installed in your Python build. You can install them via pip.
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install tk
|
|
||||||
```
|
|
||||||
|
|
||||||
and
|
|
||||||
|
|
||||||
```
|
|
||||||
pip install ttkthemes
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Using the script
|
|
||||||
|
|
||||||
This is a simple script with no installation. You can download the zip folder and extract it to your preferred working folder.
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
### Get the files
|
||||||
|
Download the zip folder and extract it to your preferred working folder.
|
||||||

|

|
||||||
|
|
||||||
Or by cloning the repository with:
|
Or by cloning the repository with:
|
||||||
|
|
||||||
```
|
```
|
||||||
git clone https://github.com/soderstromkr/transcribe.git
|
git clone https://github.com/soderstromkr/transcribe.git
|
||||||
```
|
```
|
||||||
|
### Executable Version **(Experimental. Windows only)**
|
||||||
|
The executable version of Local Transcribe with Whisper is a standalone program and should work out of the box. This experimental version is available if you have Windows, and do not have (or don't want to install) python and additional dependencies. However, it requires more disk space (around 1Gb), has no GPU acceleration and has only been lightly tested for bugs, etc. Let me know if you run into any issues!
|
||||||
|
1. Download the project folder, as the image above shows.
|
||||||
|
2. Navigate to build.
|
||||||
|
3. Unzip the folder (get a coffee or a tea, this might take a while depending on your computer)
|
||||||
|
4. Run the executable (app.exe) file.
|
||||||
|
### Python Version **(any platform including Mac users)**
|
||||||
|
This is recommended if you don't have Windows, if you have Windows and already use Python, or if you want to use GPU acceleration (PyTorch and CUDA) for faster transcriptions. I would generally recommend this method anyway, but I can understand not everyone wants to go through the installation process for Python, Anaconda and the other required packages.
|
||||||
|
1. This script was made and tested in an Anaconda environment with Python 3.10. I recommend this method if you're not familiar with Python.
|
||||||
|
See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights.
|
||||||
|
2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files."
|
||||||
|
Users might not need to specifically install Transformers. However, a conda installation might be needed for ffmpeg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following:
|
||||||
|
```
|
||||||
|
conda install -c conda-forge ffmpeg-python
|
||||||
|
```
|
||||||
|
3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via:
|
||||||
|
```
|
||||||
|
pip install -U openai-whisper
|
||||||
|
```
|
||||||
|
4. The app is built on Tkinter and CustomTkinter, so make sure both are installed in your Python build. You can install them via pip.
|
||||||
|
```
|
||||||
|
pip install tkinter
|
||||||
|
```
|
||||||
|
and
|
||||||
|
```
|
||||||
|
pip install customtkinter
|
||||||
|
```
|
||||||
|
5. Run the app:
|
||||||
|
1. For **Windows**: In the same folder as the *app.py* file, run the app from a terminal with ```python app.py```, or use the batch file called run_Windows.bat, which assumes you have conda installed and are using the base environment. (This is for simplicity; users are usually advised to create a separate environment, see [here](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands) for more info.) If you use a different environment, right-click the batch file and press edit to change it. If you want to download a model first and then go offline for transcription, I recommend running the model with the default sample folder, which will download the model locally.
|
||||||
|
2. For **Mac**: Haven't figured out a better way to do this, see [the instructions here](Mac_instructions.md)
|
||||||
|
## Usage
|
||||||
|
1. When launched, the app will also open a terminal that shows some additional information.
|
||||||
|
2. Select the folder containing the audio or video files you want to transcribe by clicking the "Browse" button next to the "Folder" label. This will open a file dialog where you can navigate to the desired folder. Remember, you won't be choosing individual files but whole folders!
|
||||||
|
3. Enter the desired language for the transcription in the "Language" field. You can either select a language or leave it blank to enable automatic language detection.
|
||||||
|
4. Choose the Whisper model to use for the transcription from the dropdown list next to the "Model" label.
|
||||||
|
5. Enable the verbose mode by checking the "Verbose" checkbox if you want to receive detailed information during the transcription process.
|
||||||
|
6. Click the "Transcribe" button to start the transcription. The button will be disabled during the process to prevent multiple transcriptions at once.
|
||||||
|
7. Monitor the progress of the transcription with the progress bar.
|
||||||
|
8. Once the transcription is completed, a message box will appear displaying the transcribed text. Click "OK" to close the message box.
|
||||||
|
9. You can run the application again or quit the application at any time by clicking the "Quit" button.
|
||||||
|
|
||||||
|
## Jupyter Notebook
|
||||||
#### Example with Jupyter Notebook
|
Don't want fancy EXEs or GUIs? Use the function as is. See [example](example.ipynb) for an implementation on Jupyter Notebook.
|
||||||
|
|
||||||
See [example](example.ipynb) for an implementation on Jupyter Notebook, also added an example for a simple [workaround](example_no_internet.ipynb) to transcribe while offline.
|
|
||||||
|
|
||||||
#### Using the GUI
|
|
||||||
|
|
||||||
You can also run the GUI version from your terminal running ```python GUI.py``` or with the batch file called run_Windows.bat (for Windows users), just make sure to add your conda path to it. If you want to download a model first, and then go offline for transcription, I recommend running the model with the default sample folder, which will download the model locally.
|
|
||||||
|
|
||||||
The GUI should look like this:
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
or this, on a Mac, by running `python GUI.py` or `python3 GUI.py`:
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
|
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
|
||||||
|
|
||||||
|
|||||||
133
app.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
import tkinter as tk
|
||||||
|
from tkinter import ttk
|
||||||
|
from tkinter import filedialog
|
||||||
|
from tkinter import messagebox
|
||||||
|
from src._LocalTranscribe import transcribe, get_path
|
||||||
|
import customtkinter
|
||||||
|
import threading
|
||||||
|
from colorama import Back, Fore
|
||||||
|
import colorama
|
||||||
|
colorama.init(autoreset=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
customtkinter.set_appearance_mode("System")
|
||||||
|
customtkinter.set_default_color_theme("blue") # Themes: blue (default), dark-blue, green
|
||||||
|
firstclick = True
|
||||||
|
|
||||||
|
class App:
    """Main window of Local Transcribe with Whisper.

    The form collects a folder, an optional language, a model name and a
    verbosity flag. The transcription itself runs on a worker thread so
    the window stays responsive; every widget and message box is handled
    on the GUI thread, which polls the worker until it has finished.
    """

    # Hint text pre-filled in the language entry. It is not a language, so
    # it must never be handed to transcribe().
    LANGUAGE_PLACEHOLDER = 'Select language or clear to detect automatically'
    # Page shown to the user when the chosen language is rejected.
    LANGUAGES_URL = 'https://github.com/openai/whisper#available-models-and-languages'
    # Delay between two checks on the worker thread, in milliseconds.
    POLL_INTERVAL_MS = 100

    def __init__(self, master):
        """Build every widget inside *master* (the root window)."""
        print(Back.CYAN + "Welcome to Local Transcribe with Whisper!\U0001f600\nCheck back here to see some output from your transcriptions.\nDon't worry, they will also be saved on the computer!\U0001f64f")
        self.master = master
        # State shared between the GUI thread and the worker thread.
        self._job = None      # (path, model, language, verbose) of the current run
        self._outcome = None  # (title, message) left behind by the worker
        self._worker = None   # the running threading.Thread, if any
        # Fonts
        font = ('Roboto', 13, 'bold')  # labels and buttons
        font_b = ('Roboto', 12)        # entry and combobox text
        # Folder Path
        path_frame = customtkinter.CTkFrame(master)
        path_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(path_frame, text="Folder:", font=font).pack(side=tk.LEFT, padx=5)
        self.path_entry = customtkinter.CTkEntry(path_frame, width=50, font=font_b)
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        customtkinter.CTkButton(path_frame, text="Browse", command=self.browse, font=font).pack(side=tk.LEFT, padx=5)

        # Language frame
        # thanks to pommicket from Stackoverflow for this fix
        def on_entry_click(event):
            """Clear the hint text the first time the entry gets focus."""
            global firstclick
            if firstclick:  # if this is the first time they clicked it
                firstclick = False
                self.language_entry.delete(0, "end")  # delete all the text in the entry

        language_frame = customtkinter.CTkFrame(master)
        language_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(language_frame, text="Language:", font=font).pack(side=tk.LEFT, padx=5)
        self.language_entry = customtkinter.CTkEntry(language_frame, width=50, font=('Roboto', 12, 'italic'))
        self.language_entry.insert(0, self.LANGUAGE_PLACEHOLDER)
        self.language_entry.bind('<FocusIn>', on_entry_click)
        self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        # Model frame
        models = ['base.en', 'base', 'small.en',
                  'small', 'medium.en', 'medium', 'large']
        model_frame = customtkinter.CTkFrame(master)
        model_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        customtkinter.CTkLabel(model_frame, text="Model:", font=font).pack(side=tk.LEFT, padx=5)
        # ComboBox
        self.model_combobox = customtkinter.CTkComboBox(
            model_frame, width=50, state="readonly",
            values=models, font=font_b)
        self.model_combobox.set(models[1])  # Set the default value
        self.model_combobox.pack(side=tk.LEFT, fill=tk.X, expand=True)
        # Verbose frame
        verbose_frame = customtkinter.CTkFrame(master)
        verbose_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        self.verbose_var = tk.BooleanVar()
        customtkinter.CTkCheckBox(verbose_frame, text="Output transcription to terminal", variable=self.verbose_var, font=font).pack(side=tk.LEFT, padx=5)
        # Progress bar: created here, only packed while a transcription runs.
        self.progress_bar = ttk.Progressbar(master, length=200, mode='indeterminate')
        # Button actions frame
        button_frame = customtkinter.CTkFrame(master)
        button_frame.pack(fill=tk.BOTH, padx=10, pady=10)
        self.transcribe_button = customtkinter.CTkButton(button_frame, text="Transcribe", command=self.start_transcription, font=font)
        self.transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        customtkinter.CTkButton(button_frame, text="Quit", command=master.quit, font=font).pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    # Helper functions
    def _selected_language(self):
        """Return the language typed by the user, or None for auto-detection.

        An empty or whitespace-only field and the untouched hint text both
        mean "detect automatically".
        """
        text = self.language_entry.get().strip()
        if not text or text == self.LANGUAGE_PLACEHOLDER:
            return None
        return text

    # Browsing
    def browse(self):
        """Let the user pick a folder and put it in the path entry."""
        folder_path = filedialog.askdirectory()
        if not folder_path:
            # Dialog was cancelled: keep the path that is already entered.
            return
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    # Start transcription
    def start_transcription(self):
        """Confirm with the user, then launch the worker thread.

        Everything that touches a widget happens here, on the GUI thread;
        the worker only receives plain values through ``self._job``.
        """
        # Disable transcribe button so a second run cannot be started.
        self.transcribe_button.configure(state=tk.DISABLED)
        path = self.path_entry.get()
        job = (path, self.model_combobox.get(),
               self._selected_language(), self.verbose_var.get())
        info_path = 'I will transcribe all eligible audio/video files in the path: {}\n\nContinue?'.format(path)
        if not messagebox.askyesno("Confirmation", info_path):
            self.transcribe_button.configure(state=tk.NORMAL)
            return
        # Show progress bar
        self.progress_bar.pack(fill=tk.X, padx=5, pady=5)
        self.progress_bar.start()
        self._job = job
        self._outcome = None
        # Daemon thread: quitting the app must not leave the process
        # waiting for a long transcription to finish.
        self._worker = threading.Thread(target=self.transcribe_thread, daemon=True)
        self._worker.start()
        self.master.after(self.POLL_INTERVAL_MS, self._poll_worker)

    # Threading
    def transcribe_thread(self):
        """Worker-thread body: transcribe the job prepared by start_transcription.

        No widget is touched here. The (title, message) pair to display is
        left in ``self._outcome`` for the GUI thread to pick up.
        """
        path, model, language, verbose = self._job
        # Setting path and files
        glob_file = get_path(path)
        try:
            output_text = transcribe(path, glob_file, model, language, verbose)
        except UnboundLocalError:
            # NOTE(review): presumably raised by transcribe() when no
            # eligible file was found - confirm in src._LocalTranscribe.
            self._outcome = ("Files not found error!",
                             'Nothing found, choose another folder.')
        except ValueError:
            self._outcome = ("Language error!",
                             'See {} for supported languages'.format(self.LANGUAGES_URL))
        else:
            self._outcome = ("Finished!", output_text)

    def _poll_worker(self):
        """GUI-thread callback: restore the window once the worker is done."""
        if self._worker is not None and self._worker.is_alive():
            self.master.after(self.POLL_INTERVAL_MS, self._poll_worker)
            return
        # Runs even when the worker died from an unexpected exception, so
        # the button is never left disabled.
        self.progress_bar.stop()
        self.progress_bar.pack_forget()
        self.transcribe_button.configure(state=tk.NORMAL)
        outcome, self._outcome = self._outcome, None
        if outcome is not None:
            messagebox.showinfo(*outcome)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Setting custom themes
    root = customtkinter.CTk()
    root.title("Local Transcribe with Whisper")
    # Geometry
    width, height = 450, 275
    root.geometry('{}x{}'.format(width, height))
    # Icon: purely cosmetic, so a load failure must not stop the app.
    # Tk raises TclError when it cannot load the bitmap, e.g. when the
    # relative path does not resolve from the current working directory.
    try:
        root.iconbitmap('images/icon.ico')
    except tk.TclError:
        print("Could not load images/icon.ico; continuing without a window icon.")
    # Run
    app = App(root)
    root.mainloop()
|
||||||
20
build_setup.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
from cx_Freeze import setup, Executable
|
||||||
|
|
||||||
|
# Options for cx_Freeze's build_exe command.
build_exe_options = {
    # Packages bundled in full with the frozen application.
    "packages": ['whisper', 'tkinter', 'customtkinter'],
    # app.py loads 'images/icon.ico' at runtime for the window icon, so the
    # file has to be copied next to the executable under the same path.
    "include_files": [("images/icon.ico", "images/icon.ico")],
}

# A single executable built from app.py, using the project icon.
executables = [
    Executable(
        "app.py",
        icon='images/icon.ico',
    ),
]

setup(
    name="Local Transcribe with Whisper",
    version="1.2",
    author="Kristofer Rolf Söderström",
    options={"build_exe": build_exe_options},
    executables=executables,
)
|
||||||
119
example.ipynb
@@ -1,123 +1,125 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"attachments": {},
|
||||||
"execution_count": 1,
|
"cell_type": "markdown",
|
||||||
"id": "a2cd4050",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from transcribe import transcribe"
|
"# Local Transcribe with Whisper\n",
|
||||||
|
"## Example"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 1,
|
||||||
"id": "24e1d24e",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Help on function transcribe in module transcribe:\n",
|
"Help on function transcribe in module src._LocalTranscribe:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
|
"transcribe(path, glob_file, model=None, language=None, verbose=False)\n",
|
||||||
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
|
" Transcribes audio files in a specified folder using OpenAI's Whisper model.\n",
|
||||||
|
" \n",
|
||||||
|
" Args:\n",
|
||||||
|
" path (str): Path to the folder containing the audio files.\n",
|
||||||
|
" glob_file (list): List of audio file paths to transcribe.\n",
|
||||||
|
" model (str, optional): Name of the Whisper model to use for transcription.\n",
|
||||||
|
" Defaults to None, which uses the default model.\n",
|
||||||
|
" language (str, optional): Language code for transcription. Defaults to None,\n",
|
||||||
|
" which enables automatic language detection.\n",
|
||||||
|
" verbose (bool, optional): If True, enables verbose mode with detailed information\n",
|
||||||
|
" during the transcription process. Defaults to False.\n",
|
||||||
|
" \n",
|
||||||
|
" Returns:\n",
|
||||||
|
" str: A message indicating the result of the transcription process.\n",
|
||||||
|
" \n",
|
||||||
|
" Raises:\n",
|
||||||
|
" RuntimeError: If an invalid file is encountered, it will be skipped.\n",
|
||||||
|
" \n",
|
||||||
|
" Notes:\n",
|
||||||
|
" - The function downloads the specified model if not available locally.\n",
|
||||||
|
" - The transcribed text files will be saved in a \"transcriptions\" folder\n",
|
||||||
|
" within the specified path.\n",
|
||||||
"\n"
|
"\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# Import the modules and get the docstring\n",
|
||||||
|
"from src._LocalTranscribe import transcribe, get_path\n",
|
||||||
"help(transcribe)"
|
"help(transcribe)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 2,
|
||||||
"id": "e52477fb",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"path='sample_audio/'#folder path\n",
|
"# Set the variables\n",
|
||||||
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
|
"path='sample_audio/'# Folder path\n",
|
||||||
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
|
"model='small' # Model size\n",
|
||||||
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
|
"language= None # Preset language, None for automatic detection\n",
|
||||||
"verbose = True # prints output while transcribing, False to deactivate"
|
"verbose = True # Output transcription in realtime\n",
|
||||||
|
"\n",
|
||||||
|
"# Get glob file, additional step for app version.\n",
|
||||||
|
"\n",
|
||||||
|
"glob_file = get_path(path)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 3,
|
||||||
"id": "d66866af",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
|
|
||||||
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
|
|
||||||
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
|
|
||||||
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
|
|
||||||
"\n",
|
|
||||||
"There are 2 ogg files in path: sample_audio/\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"Loading model...\n",
|
|
||||||
"Transcribing file number number 1: Armstrong_Small_Step\n",
|
|
||||||
"Model and file loaded...\n",
|
|
||||||
"Starting transcription...\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
|
"Trying to transcribe file named: Armstrong_Small_Step🕐\n",
|
||||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||||
"Detected language: English\n",
|
"Detected language: English\n",
|
||||||
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
|
"[00:00.000 --> 00:07.000] I'm going to step off the limb now.\n",
|
||||||
"\n",
|
"[00:07.000 --> 00:18.000] That's one small step for man.\n",
|
||||||
"Finished file number 1.\n",
|
"[00:18.000 --> 00:24.000] One giant leap for mankind.\n",
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
|
|
||||||
"Model and file loaded...\n",
|
|
||||||
"Starting transcription...\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
|
"Trying to transcribe file named: Axel_Pettersson_röstinspelning🕐\n",
|
||||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
||||||
"Detected language: Swedish\n",
|
"Detected language: Swedish\n",
|
||||||
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
"[00:00.000 --> 00:06.140] Hej, jag heter Axel Pettersson. Jag följer bror 1976.\n",
|
||||||
|
"[00:06.400 --> 00:15.100] Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Finished file number 2.\n",
|
"Trying to transcribe file named: readme🕐\n",
|
||||||
|
"Not a valid file, skipping.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"Trying to transcribe file named: transcriptions🕐\n",
|
||||||
"\n"
|
"Not a valid file, skipping.\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'Finished transcription, files can be found in sample_audio/transcriptions'"
|
"'Finished transcription, 2 files can be found in sample_audio//transcriptions'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 4,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"transcribe(path, file_type, model, language, verbose)"
|
"# Run the script\n",
|
||||||
|
"transcribe(path, glob_file, model, language, verbose)"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "0bc67265",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@@ -132,8 +134,9 @@
|
|||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.4"
|
"version": "3.10.4"
|
||||||
}
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 2
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,231 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "eba9e610",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "85cd2d12",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"Whisper(\n",
|
|
||||||
" (encoder): AudioEncoder(\n",
|
|
||||||
" (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
|
|
||||||
" (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
|
|
||||||
" (blocks): ModuleList(\n",
|
|
||||||
" (0-23): 24 x ResidualAttentionBlock(\n",
|
|
||||||
" (attn): MultiHeadAttention(\n",
|
|
||||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
|
||||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" (mlp): Sequential(\n",
|
|
||||||
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
|
||||||
" (1): GELU(approximate='none')\n",
|
|
||||||
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" )\n",
|
|
||||||
" (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (decoder): TextDecoder(\n",
|
|
||||||
" (token_embedding): Embedding(51865, 1024)\n",
|
|
||||||
" (blocks): ModuleList(\n",
|
|
||||||
" (0-23): 24 x ResidualAttentionBlock(\n",
|
|
||||||
" (attn): MultiHeadAttention(\n",
|
|
||||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
|
||||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" (cross_attn): MultiHeadAttention(\n",
|
|
||||||
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
|
|
||||||
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" (mlp): Sequential(\n",
|
|
||||||
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
|
||||||
" (1): GELU(approximate='none')\n",
|
|
||||||
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" )\n",
|
|
||||||
" )\n",
|
|
||||||
" (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
|
||||||
" )\n",
|
|
||||||
")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import whisper\n",
|
|
||||||
"#change to model size, bigger is more accurate but slower\n",
|
|
||||||
"whisper.load_model(\"medium\") #base, small, medium, large"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "0d2acd54",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#after it loads, you can disconnect from the internet and run the rest"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "a2cd4050",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from transcribe import transcribe"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 9,
|
|
||||||
"id": "24e1d24e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Help on function transcribe in module transcribe:\n",
|
|
||||||
"\n",
|
|
||||||
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
|
|
||||||
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"help(transcribe)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"id": "e52477fb",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"path='sample_audio/'#folder path\n",
|
|
||||||
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
|
|
||||||
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
|
|
||||||
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
|
|
||||||
"verbose = True # prints output while transcribing, False to deactivate"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "d66866af",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
|
|
||||||
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
|
|
||||||
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
|
|
||||||
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
|
|
||||||
"\n",
|
|
||||||
"There are 2 ogg files in path: sample_audio/\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"Loading model...\n",
|
|
||||||
"Transcribing file number number 1: Armstrong_Small_Step\n",
|
|
||||||
"Model and file loaded...\n",
|
|
||||||
"Starting transcription...\n",
|
|
||||||
"\n",
|
|
||||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
|
||||||
"Detected language: English\n",
|
|
||||||
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
|
|
||||||
"\n",
|
|
||||||
"Finished file number 1.\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
|
|
||||||
"Model and file loaded...\n",
|
|
||||||
"Starting transcription...\n",
|
|
||||||
"\n",
|
|
||||||
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
|
|
||||||
"Detected language: Swedish\n",
|
|
||||||
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
|
|
||||||
"\n",
|
|
||||||
"Finished file number 2.\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"'Finished transcription, files can be found in sample_audio/transcriptions'"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"transcribe(path, file_type, model, language, verbose)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "0bc67265",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.4"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
BIN
gui_jpeg.jpg
|
Before Width: | Height: | Size: 29 KiB |
|
Before Width: | Height: | Size: 135 KiB After Width: | Height: | Size: 135 KiB |
|
Before Width: | Height: | Size: 324 KiB After Width: | Height: | Size: 324 KiB |
BIN
images/gui-windows.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
images/icon.ico
Normal file
|
After Width: | Height: | Size: 1.8 KiB |
@@ -2,4 +2,4 @@
|
|||||||
echo Starting...
|
echo Starting...
|
||||||
call conda activate base
|
call conda activate base
|
||||||
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
|
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
|
||||||
call python GUI.py
|
call python app.py
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
Armstrong_Small_Step
|
Armstrong_Small_Step
|
||||||
In seconds:
|
[0:00:00 --> 0:00:07]: And they're still brought to land now.
|
||||||
[0.00 --> 7.00]: I'm going to step off the limb now.
|
[0:00:07 --> 0:00:18]: It's one small step for man.
|
||||||
[7.00 --> 18.00]: That's one small step for man.
|
[0:00:18 --> 0:00:23]: One by a fleet for man time.
|
||||||
[18.00 --> 24.00]: One giant leap for mankind.
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
Axel_Pettersson_röstinspelning
|
Axel_Pettersson_röstinspelning
|
||||||
In seconds:
|
[0:00:00 --> 0:00:06]: Hej, jag heter Raxel Patterson, jag får att se över UR 1976.
|
||||||
[0.00 --> 6.14]: Hej, jag heter Axel Pettersson. Jag följer bror 1976.
|
[0:00:06 --> 0:00:12.540000]: Jag har varit Wikipedia-périonsen 2018 och jag har översat röst-intro-
|
||||||
[6.40 --> 15.10]: Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
|
[0:00:12.540000 --> 0:00:15.540000]:-projektet till svenska.
|
||||||
90
src/_LocalTranscribe.py
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
from glob import glob
|
||||||
|
import whisper
|
||||||
|
from torch import cuda, Generator
|
||||||
|
import colorama
|
||||||
|
from colorama import Back,Fore
|
||||||
|
colorama.init(autoreset=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Get the path
|
||||||
|
def get_path(path):
    """
    List every entry located directly inside a folder.

    Args:
        path (str): Folder whose contents should be listed.

    Returns:
        list: Paths matched by ``<path>/*`` (files and sub-folders alike).
        The list is empty when nothing matches, e.g. for a missing folder.
    """
    # Local import: the module-level import only brings in the ``glob``
    # function itself, not the rest of the glob module.
    from glob import escape

    # Escape the folder part so that names containing glob metacharacters
    # ("[", "]", "*", "?") are taken literally; only the trailing "*" is
    # meant to act as a wildcard.
    glob_file = glob(escape(path) + '/*')
    return glob_file
|
||||||
|
|
||||||
|
# Main function
|
||||||
|
def transcribe(path, glob_file, model=None, language=None, verbose=False):
    """
    Transcribes audio files in a specified folder using OpenAI's Whisper model.

    Args:
        path (str): Path to the folder containing the audio files. The
            "transcriptions" output folder is created inside it.
        glob_file (list): List of file paths to try to transcribe.
        model (str, optional): Name of the Whisper model, passed unchanged to
            ``whisper.load_model``. NOTE(review): the default ``None`` is
            forwarded as-is; confirm callers always supply a model name.
        language (str, optional): Language forwarded to Whisper. Defaults to
            None, which enables automatic language detection.
        verbose (bool, optional): Forwarded to Whisper; if True the text is
            printed while it is transcribed. Defaults to False.

    Returns:
        str: A message indicating the result of the transcription process
        (number of transcribed files and output folder, or a hint that no
        file could be transcribed).

    Notes:
        - A file for which Whisper raises ``RuntimeError`` is reported as
          invalid and skipped; the error is not propagated.
        - Each transcript is saved as "<path>/transcriptions/<title>.txt",
          where <title> is the file name without its final extension.
    """
    # Seed a torch Generator on the GPU when CUDA is available, else on CPU.
    # NOTE(review): the generator is neither stored nor passed on, so this
    # probably does not influence Whisper - confirm the intent.
    if cuda.is_available():
        Generator('cuda').manual_seed(42)
    else:
        Generator().manual_seed(42)
    # Load model
    model = whisper.load_model(model)
    output_dir = '{}/transcriptions'.format(path)
    # Start main loop
    files_transcripted = []
    for file in glob_file:
        # splitext removes only the last extension, so "talk.part1.mp3" and
        # "talk.part2.mp3" no longer collapse into the same "talk.txt".
        title = os.path.splitext(os.path.basename(file))[0]
        print(Back.CYAN + '\nTrying to transcribe file named: {}\U0001f550'.format(title))
        try:
            result = model.transcribe(
                file,
                language=language,
                verbose=verbose
            )
        except RuntimeError:
            # Skip invalid files (non-media files, folders, ...)
            print(Fore.RED + 'Not a valid file, skipping.')
            continue
        files_transcripted.append(result)
        # Make folder if missing; exist_ok already covers an existing folder.
        os.makedirs(output_dir, exist_ok=True)
        # Save the title followed by one "[start --> end]:text" line per segment
        with open('{}/{}.txt'.format(output_dir, title), 'w', encoding='utf-8') as out:
            out.write(title)
            for segment in result['segments']:
                start = str(datetime.timedelta(seconds=segment['start']))
                end = str(datetime.timedelta(seconds=segment['end']))
                out.write('\n[{} --> {}]:{}'.format(start, end, segment['text']))
    # Check if any files were processed.
    if files_transcripted:
        output_text = 'Finished transcription, {} files can be found in {}/transcriptions'.format(len(files_transcripted), path)
    else:
        output_text = 'No files elligible for transcription, try adding audio or video files to this folder or choose another folder!'
    # Return output text
    return output_text
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
import whisper
|
|
||||||
import glob, os
|
|
||||||
#import torch #uncomment if using torch with cuda, below too
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
def transcribe(path, file_type, model=None, language=None, verbose=False):
    '''Transcribe every file in a folder whose name ends with file_type.

    Loads the requested whisper model, runs it on each matching file and
    writes one text file per input into "<path>/transcriptions". Returns a
    message naming that output folder.
    '''
    out_dir = '{}/transcriptions'.format(path)
    try:
        os.mkdir(out_dir)
    except FileExistsError:
        # Output folder is already there from an earlier run.
        pass

    matches = glob.glob(path+'/*{}'.format(file_type))

    #if torch.cuda.is_available():
    #    generator = torch.Generator('cuda').manual_seed(42)
    #else:
    #    generator = torch.Generator().manual_seed(42)

    # Summary of the settings in use
    print('Using {} model'.format(model))
    print('File type is {}'.format(file_type))
    print('Language is being detected automatically for each file')
    print('Verbosity is set to {}'.format(verbose))
    print('\nThere are {} {} files in path: {}\n\n'.format(len(matches), file_type, path))

    print('Loading model...')
    model = whisper.load_model(model)

    for number, audio_path in enumerate(matches, start=1):
        # Title is the file name up to its first dot
        title = os.path.basename(audio_path).split('.')[0]

        print('Transcribing file number number {}: {}'.format(number,title))
        print('Model and file loaded...\nStarting transcription...\n')
        result = model.transcribe(
            audio_path,
            language=language,
            verbose=verbose
            )

        # Format all segment lines before the output file is opened
        lines = []
        for segment in result['segments']:
            begin = str(datetime.timedelta(seconds=(segment['start'])))
            finish = str(datetime.timedelta(seconds=(segment['end'])))
            lines.append('\n[{} --> {}]:{}'.format(begin, finish, segment['text']))

        with open("{}/transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as out:
            out.write(title)
            out.write('\nIn seconds:')
            for line in lines:
                out.write(line)

        print('\nFinished file number {}.\n\n\n'.format(number))

    return 'Finished transcription, files can be found in {}/transcriptions'.format(path)
|
|
||||||