Compare commits

38 Commits

Author SHA1 Message Date
Kristofer Rolf Söderström b765ff6bc6 Update README.md 2023-06-28 14:11:51 +02:00
Kristofer Rolf Söderström 867b082589 Add files via upload 2023-04-26 09:17:33 +02:00
Kristofer Rolf Söderström b4017c6fee Update README.md 2023-04-26 09:17:09 +02:00
Kristofer Rolf Söderström 1ea5187e78 Merge pull request #1 from bjornekstrom/main
README.md formatting suggestions
2023-04-24 09:25:07 +02:00
Björn Ekström 0051ceb873 Update README.md 2023-04-21 15:11:03 +02:00
Björn Ekström 76be00552f Updated README and Mac screenshot 2023-04-21 15:09:46 +02:00
Björn Ekström a5dd5d4a03 Update README.md
Further formatting.
2023-04-21 14:23:14 +02:00
Björn Ekström 43bcffaf4c Update README.md
Some formatting suggestions.
2023-04-21 14:22:34 +02:00
Kristofer Rolf Söderström 4e1c709f43 Update transcribe.py
better time keeping
2023-04-20 20:13:54 +02:00
Kristofer Rolf Söderström dfe967bd58 Update run_Windows.bat 2023-04-20 19:35:51 +02:00
Kristofer Rolf Söderström 586289efe5 Update Mac_instructions.txt 2023-04-19 16:51:36 +02:00
Kristofer Rolf Söderström c5a5597eee Update README.md 2023-04-19 16:46:49 +02:00
Kristofer Rolf Söderström ce8c365fc4 Update and rename Mac_2_instructions.txt to Mac_instructions.txt 2023-04-17 20:28:52 +02:00
Kristofer Rolf Söderström e2afd34170 Delete run_Mac_2.command 2023-04-17 20:25:18 +02:00
Kristofer Rolf Söderström 6fa49e41d9 Delete run_Mac_1.sh 2023-04-17 20:24:50 +02:00
Kristofer Söderström 1da9adbf5e updated version number 2023-04-14 10:32:38 +02:00
Kristofer Söderström 2769ddf68b dedicated windows and mac scripts, fixed verbose checkbox 2023-04-14 10:31:26 +02:00
Kristofer Rolf Söderström 1128e44486 Update README.md 2023-04-14 09:09:52 +02:00
Kristofer Rolf Söderström eec20b48c4 Update README.md 2023-04-14 08:30:29 +02:00
Kristofer Rolf Söderström b569d41aa9 Update README.md 2023-04-14 08:28:24 +02:00
Kristofer Rolf Söderström 99a6625e0e Update README.md 2023-03-31 11:12:06 +02:00
Kristofer Rolf Söderström b09114625a Update README.md 2023-03-27 21:29:51 +02:00
Kristofer Rolf Söderström 785f2b8215 Update README.md 2023-03-27 21:28:34 +02:00
Kristofer Rolf Söderström 412ab97157 Update README.md 2023-03-27 21:26:34 +02:00
Kristofer Rolf Söderström a14196b055 Update README.md 2023-03-27 21:25:41 +02:00
Kristofer Rolf Söderström c319316a4d Add files via upload 2023-03-27 21:25:11 +02:00
Kristofer Rolf Söderström 26c6f84e72 Update README.md 2023-03-27 21:24:12 +02:00
Kristofer Rolf Söderström 8f76466f57 typos 2023-03-27 21:18:04 +02:00
Kristofer Rolf Söderström 1f684a848a Update README.md 2023-03-27 10:08:19 +02:00
Kristofer Rolf Söderström bf75df30a4 Update README.md 2023-03-27 10:05:58 +02:00
Kristofer Söderström f5a8b19b65 fixed bug 2023-03-27 09:57:28 +02:00
Kristofer Söderström 7bbfef44cb added GUI and batch file to run GUI 2023-03-27 09:25:56 +02:00
Kristofer Söderström acadd17007 some corrections 2023-03-23 15:14:03 +01:00
Kristofer Söderström 918c2e489e added example 2023-03-23 15:06:20 +01:00
Kristofer Rolf Söderström 4710c61e22 Update README.md 2023-03-23 14:58:23 +01:00
Kristofer Söderström eea7441e43 fixed some spacing 2023-03-22 22:02:41 +01:00
Kristofer Rolf Söderström 4701f94bc2 Update CITATION.cff 2023-03-22 21:56:05 +01:00
Kristofer Rolf Söderström eda1adca07 Update README.md 2023-03-22 16:05:29 +01:00
13 changed files with 428 additions and 34 deletions
+3 -3
View File
@@ -1,11 +1,11 @@
cff-version: 1.2.0 cff-version: 1.2.0
message: "If you find this implementation useful in your research, please cite it as below." message: "If you find this implementation useful in your research, and want to cite it, please do so as below."
authors: authors:
- family-names: "Söderström" - family-names: "Söderström"
given-names: "Kristofer Rolf" given-names: "Kristofer Rolf"
orcid: "https://orcid.org/0000-0002-5322-3350" orcid: "https://orcid.org/0000-0002-5322-3350"
title: "transcribe" title: "transcribe"
version: 1.0 version: 1.1.1
doi: None doi: 10.5281/zenodo.7760511
date-released: 2023-03-22 date-released: 2023-03-22
url: "https://github.com/soderstromkr/transcribe" url: "https://github.com/soderstromkr/transcribe"
+100
View File
@@ -0,0 +1,100 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from transcribe import transcribe
from ttkthemes import ThemedTk
import whisper
import numpy as np
import glob, os
class App:
    """Tkinter front end for the ``transcribe`` function.

    The window offers text entries for the audio folder, the file type and
    the Whisper model name, a "Verbose" checkbox, and buttons to start the
    transcription or quit. Language selection is currently disabled, so
    ``language=None`` is always passed to ``transcribe``.
    """

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root window)."""
        self.master = master
        master.title("Local Transcribe")

        # Style options: same font and padding for every ttk widget class used.
        style = ttk.Style()
        for style_name in ('TLabel', 'TEntry', 'TButton', 'TCheckbutton'):
            style.configure(style_name, font=('Arial', 10), padding=10)

        # Folder path row, with an extra "Browse" button to the right.
        path_frame, self.path_entry = self._add_entry_row(
            "Folder Path:", 'sample_audio/')
        browse_button = ttk.Button(path_frame, text="Browse", command=self.browse)
        browse_button.pack(side=tk.LEFT, padx=5)

        # File type row.
        _, self.file_type_entry = self._add_entry_row("File Type:", 'ogg')

        # Model row.
        _, self.model_entry = self._add_entry_row("Model:", 'small')

        # Language row is currently disabled; see transcribe() below.

        # Verbose checkbox.
        verbose_frame = ttk.Frame(master, padding=10)
        verbose_frame.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        verbose_checkbutton = ttk.Checkbutton(
            verbose_frame, text="Verbose", variable=self.verbose_var)
        verbose_checkbutton.pack(side=tk.LEFT, padx=5)

        # Action buttons.
        button_frame = ttk.Frame(master, padding=10)
        button_frame.pack(fill=tk.BOTH)
        transcribe_button = ttk.Button(
            button_frame, text="Transcribe Audio", command=self.transcribe)
        transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        quit_button = ttk.Button(button_frame, text="Quit", command=master.quit)
        quit_button.pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    def _add_entry_row(self, label_text, default_text):
        """Create one "label + entry" row and return ``(frame, entry)``.

        The frame is returned so callers can pack further widgets into the
        same row (the folder row adds a "Browse" button).
        """
        frame = ttk.Frame(self.master, padding=10)
        frame.pack(fill=tk.BOTH)
        label = ttk.Label(frame, text=label_text)
        label.pack(side=tk.LEFT, padx=5)
        entry = ttk.Entry(frame, width=50)
        # The entry is empty at this point, so index 0 is the insertion point.
        entry.insert(0, default_text)
        entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        return frame, entry

    def browse(self):
        """Let the user pick a folder and copy it into the path entry."""
        folder_path = filedialog.askdirectory()
        if not folder_path:
            # Dialog was cancelled: keep whatever path is already entered
            # instead of replacing it with an empty string.
            return
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def transcribe(self):
        """Run ``transcribe`` with the current form values and report the outcome.

        Shows the returned message in an info box on success, or the
        exception text in an error box if the call raises.
        """
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        language = None  # set to auto-detect (language entry is disabled)
        verbose = self.verbose_var.get()
        try:
            # Inside a method body this name resolves to the module-level
            # transcribe function, not to this method.
            result = transcribe(path, file_type, model=model,
                                language=language, verbose=verbose)
        except Exception as err:  # GUI boundary: surface the failure to the user
            messagebox.showerror("Transcription failed", str(err))
            return
        # Show the result in a message box
        messagebox.showinfo("Finished!", result)
if __name__ == "__main__":
    # A themed root window ("clearlooks") is used in place of a plain tk.Tk().
    root = ThemedTk(theme="clearlooks")
    app = App(root)
    root.mainloop()
+5
View File
@@ -0,0 +1,5 @@
### How to run on Mac
Unfortunately, I have not found a permanent solution for this; not being a Mac user has limited the ways I can test it. For now, these are the recommended steps for a beginner user:
1. Open a terminal and navigate to the root folder (transcribe-main if you downloaded the folder). You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
2. Run the following command:
python GUI.py
BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 135 KiB

+53 -8
View File
@@ -1,27 +1,72 @@
## transcribe ## Local Transcribe
Simple script that uses OpenAI's Whisper to transcribe audio files from your local folders.
Local Transcribe uses OpenAI's Whisper to transcribe audio files from your local folders, creating text files on disk.
## Note
This implementation and guide are mainly intended for researchers who are not familiar with programming and want a way to transcribe their files locally, without an internet connection, as is usually required within ethical data practices and frameworks. Two examples are shown: a normal workflow with an internet connection, and one in which the model is loaded first, via openai-whisper, so that the transcription can then be done without being connected to the internet. There is now also a GUI implementation; read below for more information.
### Instructions ### Instructions
#### Requirements #### Requirements
1. This script was made and tested in an Anaconda environment with python 3.10. I recommend this method if you're not familiar with python.
1. This script was made and tested in an Anaconda environment with Python 3.10. I recommend this method if you're not familiar with Python.
See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights. See [here](https://docs.anaconda.com/anaconda/install/index.html) for instructions. You might need administrator rights.
2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files." 2. Whisper requires some additional libraries. The [setup](https://github.com/openai/whisper#setup) page states: "The codebase also depends on a few Python packages, most notably HuggingFace Transformers for their fast tokenizer implementation and ffmpeg-python for reading audio files."
Users might not need to specifically install Transformers. However, a conda installation might be needed for ffmepg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following: Users might not need to specifically install Transformers. However, a conda installation might be needed for ffmpeg[^1], which takes care of setting up PATH variables. From the anaconda prompt, type or copy the following:
``` ```
conda install -c conda-forge ffmpeg-python conda install -c conda-forge ffmpeg-python
``` ```
3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via: 3. The main functionality comes from openai-whisper. See their [page](https://github.com/openai/whisper) for details. As of 2023-03-22 you can install via:
``` ```
pip install -U openai-whisper pip install -U openai-whisper
``` ```
4. There is an option to run a batch file, which launches a GUI built on TKinter and TTKthemes. If using these options, make sure they are installed in your Python build. You can install them via pip.
```
pip install tk
```
and
```
pip install ttkthemes
```
#### Using the script #### Using the script
This is a simple script with no installation. You can either clone the repository with
This is a simple script with no installation. You can download the zip folder and extract it to your preferred working folder.
![](Picture1.png)
Or by cloning the repository with:
``` ```
git clone https://github.com/soderstromkr/transcribe.git git clone https://github.com/soderstromkr/transcribe.git
``` ```
and use the example.ipynb template to use the script **OR (for beginners)** download the ```transcribe.py``` file into your work folder. Then you can either import it to another script or notebook for use. I recommend jupyter notebook for new users, see the example below. (Remember to have transcribe.py and example.ipynb in the same working folder).
### Example
See the [example](example.ipynb) implementation on jupyter notebook. #### Example with Jupyter Notebook
See [example](example.ipynb) for an implementation in Jupyter Notebook. There is also an example of a simple [workaround](example_no_internet.ipynb) for transcribing while offline.
#### Using the GUI
You can also run the GUI version from your terminal running ```python GUI.py``` or with the batch file called run_Windows.bat (for Windows users), just make sure to add your conda path to it. If you want to download a model first, and then go offline for transcription, I recommend running the model with the default sample folder, which will download the model locally.
The GUI should look like this:
![python GUI.py](gui_jpeg.jpg?raw=true)
or this, on a Mac, by running `python GUI.py` or `python3 GUI.py`:
![python GUI Mac.py](gui-mac.png)
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11. [^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
[![DOI](https://zenodo.org/badge/617404576.svg)](https://zenodo.org/badge/latestdoi/617404576)
+1 -1
View File
@@ -40,7 +40,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"path='sample_audio/'#folder path\n", "path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe files with the file type, 'ogg', 'WAV'\n", "file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n", "model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n", "language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate" "verbose = True # prints output while transcribing, False to deactivate"
+231
View File
@@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "eba9e610",
"metadata": {},
"source": [
"A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "85cd2d12",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Whisper(\n",
" (encoder): AudioEncoder(\n",
" (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
" (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (decoder): TextDecoder(\n",
" (token_embedding): Embedding(51865, 1024)\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (cross_attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import whisper\n",
"#change to model size, bigger is more accurate but slower\n",
"whisper.load_model(\"medium\") #base, small, medium, large"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0d2acd54",
"metadata": {},
"outputs": [],
"source": [
"#after it loads, you can disconnect from the internet and run the rest"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a2cd4050",
"metadata": {},
"outputs": [],
"source": [
"from transcribe import transcribe"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "24e1d24e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on function transcribe in module transcribe:\n",
"\n",
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
"\n"
]
}
],
"source": [
"help(transcribe)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e52477fb",
"metadata": {},
"outputs": [],
"source": [
"path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d66866af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
"\n",
"There are 2 ogg files in path: sample_audio/\n",
"\n",
"\n",
"Loading model...\n",
"Transcribing file number number 1: Armstrong_Small_Step\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: English\n",
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
"\n",
"Finished file number 1.\n",
"\n",
"\n",
"\n",
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: Swedish\n",
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
"\n",
"Finished file number 2.\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"'Finished transcription, files can be found in sample_audio/transcriptions'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transcribe(path, file_type, model, language, verbose)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bc67265",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 324 KiB

BIN
View File
Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

+5
View File
@@ -0,0 +1,5 @@
@echo off
REM Launcher for the Local Transcribe GUI: activates conda, then runs GUI.py.
echo Starting...
REM OPTION 1: activate the "base" conda environment (the conda command must be resolvable from this shell).
call conda activate base
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
REM Start the GUI; GUI.py is looked up relative to the current directory.
call python GUI.py
@@ -1,3 +1,5 @@
Armstrong_Small_Step Armstrong_Small_Step
In seconds: In seconds:
[0.00 --> 24.00]: That's one small step for man, one giant leap for mankind. [0.00 --> 7.00]: I'm going to step off the limb now.
[7.00 --> 18.00]: That's one small step for man.
[18.00 --> 24.00]: One giant leap for mankind.
@@ -1,3 +1,4 @@
Axel_Pettersson_röstinspelning Axel_Pettersson_röstinspelning
In seconds: In seconds:
[0.00 --> 16.00]: Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska. [0.00 --> 6.14]: Hej, jag heter Axel Pettersson. Jag följer bror 1976.
[6.40 --> 15.10]: Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
+20 -15
View File
@@ -1,27 +1,32 @@
import whisper import whisper
import glob, os import glob, os
#import torch #uncomment if using torch with cuda, below too
import datetime
def transcribe(path, file_type, model=None, language=None, verbose=True): def transcribe(path, file_type, model=None, language=None, verbose=False):
'''Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions''' '''Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions'''
try: try:
os.mkdir('{}transcriptions'.format(path)) os.mkdir('{}/transcriptions'.format(path))
except FileExistsError: except FileExistsError:
pass pass
glob_file = glob.glob(path+'/*{}'.format(file_type)) glob_file = glob.glob(path+'/*{}'.format(file_type))
path = path
print('Using {} model, you can change this by specifying model="medium" for example'.format(model)) #if torch.cuda.is_available():
print('Only looking for file type {}, you can change this by specifying file_type="mp3"'.format(file_type)) # generator = torch.Generator('cuda').manual_seed(42)
print('Expecting {} language, you can change this by specifying language="English". None will try to auto-detect'.format(language)) #else:
print('Verbosity is {}. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False'.format(verbose)) # generator = torch.Generator().manual_seed(42)
print('Using {} model'.format(model))
print('File type is {}'.format(file_type))
print('Language is being detected automatically for each file')
print('Verbosity is set to {}'.format(verbose))
print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path)) print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path))
print('Loading model...') print('Loading model...')
model = whisper.load_model(model) model = whisper.load_model(model)
for idx,file in enumerate(glob_file): for idx,file in enumerate(glob_file):
title = os.path.basename(file).split('.')[0] title = os.path.basename(file).split('.')[0]
@@ -30,22 +35,22 @@ def transcribe(path, file_type, model=None, language=None, verbose=True):
result = model.transcribe( result = model.transcribe(
file, file,
language=language, language=language,
verbose=True verbose=verbose
) )
start=[] start=[]
end=[] end=[]
text=[] text=[]
for i in range(len(result['segments'])): for i in range(len(result['segments'])):
start.append(result['segments'][i]['start']) start.append(str(datetime.timedelta(seconds=(result['segments'][i]['start']))))
end.append(result['segments'][i]['end']) end.append(str(datetime.timedelta(seconds=(result['segments'][i]['end']))))
text.append(result['segments'][i]['text']) text.append(result['segments'][i]['text'])
with open("{}transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file: with open("{}/transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
file.write(title) file.write(title)
file.write('\nIn seconds:') file.write('\nIn seconds:')
for i in range(len(result['segments'])): for i in range(len(result['segments'])):
file.writelines('\n[{:.2f} --> {:.2f}]:{}'.format(start[i], end[i], text[i])) file.writelines('\n[{} --> {}]:{}'.format(start[i], end[i], text[i]))
print('\nFinished file number {}.\n\n\n'.format(idx+1)) print('\nFinished file number {}.\n\n\n'.format(idx+1))
return 'Finished transcription, files can be found in {}transcriptions'.format(path) return 'Finished transcription, files can be found in {}/transcriptions'.format(path)