Compare commits

7 Commits

Author SHA1 Message Date
Kristofer Söderström 7bbfef44cb added GUI and batch file to run GUI 2023-03-27 09:25:56 +02:00
Kristofer Söderström acadd17007 some corrections 2023-03-23 15:14:03 +01:00
Kristofer Söderström 918c2e489e added example 2023-03-23 15:06:20 +01:00
Kristofer Rolf Söderström 4710c61e22 Update README.md 2023-03-23 14:58:23 +01:00
Kristofer Söderström eea7441e43 fixed some spacing 2023-03-22 22:02:41 +01:00
Kristofer Rolf Söderström 4701f94bc2 Update CITATION.cff 2023-03-22 21:56:05 +01:00
Kristofer Rolf Söderström eda1adca07 Update README.md 2023-03-22 16:05:29 +01:00
7 changed files with 350 additions and 12 deletions
+2 -2
View File
@@ -1,11 +1,11 @@
cff-version: 1.2.0
message: "If you use find this implementation useful in your research, please cite it as below."
message: "If you find this implementation useful in your research, and want to cite it, please do so as below."
authors:
- family-names: "Söderström"
given-names: "Kristofer Rolf"
orcid: "https://orcid.org/0000-0002-5322-3350"
title: "transcribe"
version: 1.0
doi: None
doi: 10.5281/zenodo.7760511
date-released: 2023-03-22
url: "https://github.com/soderstromkr/transcribe"
+100
View File
@@ -0,0 +1,100 @@
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from transcribe import transcribe
from ttkthemes import ThemedTk
import whisper
import numpy as np
import glob, os
class App:
    """Tkinter front-end for the local Whisper transcription script.

    Builds a small form (folder path, file type, model name, verbosity)
    and runs ``transcribe`` on demand, reporting the result in a dialog.
    """

    def __init__(self, master):
        """Build the widget tree inside *master* (a Tk root window)."""
        self.master = master
        master.title("Local Transcribe")

        # Folder Path
        path_frame = ttk.Frame(master, padding=10)
        path_frame.pack(fill=tk.BOTH)
        path_label = ttk.Label(path_frame, text="Folder Path:")
        path_label.pack(side=tk.LEFT, padx=5)
        self.path_entry = ttk.Entry(path_frame, width=50)
        # Index 0: insert the default at the start of the (empty) entry.
        self.path_entry.insert(0, 'sample_audio/')
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        browse_button = ttk.Button(path_frame, text="Browse", command=self.browse)
        browse_button.pack(side=tk.LEFT, padx=5)

        # File Type
        file_type_frame = ttk.Frame(master, padding=10)
        file_type_frame.pack(fill=tk.BOTH)
        file_type_label = ttk.Label(file_type_frame, text="File Type:")
        file_type_label.pack(side=tk.LEFT, padx=5)
        self.file_type_entry = ttk.Entry(file_type_frame, width=50)
        self.file_type_entry.insert(0, 'ogg')
        self.file_type_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Model
        model_frame = ttk.Frame(master, padding=10)
        model_frame.pack(fill=tk.BOTH)
        model_label = ttk.Label(model_frame, text="Model:")
        model_label.pack(side=tk.LEFT, padx=5)
        self.model_entry = ttk.Entry(model_frame, width=50)
        self.model_entry.insert(0, 'small')
        self.model_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # NOTE: a "Language" entry existed here but is currently disabled;
        # the language is forced to None (auto-detect) in transcribe() below.

        # Verbose
        verbose_frame = ttk.Frame(master, padding=10)
        verbose_frame.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        verbose_checkbutton = ttk.Checkbutton(verbose_frame, text="Verbose", variable=self.verbose_var)
        verbose_checkbutton.pack(side=tk.LEFT, padx=5)

        # Buttons
        button_frame = ttk.Frame(master, padding=10)
        button_frame.pack(fill=tk.BOTH)
        transcribe_button = ttk.Button(button_frame, text="Transcribe Audio", command=self.transcribe)
        transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        quit_button = ttk.Button(button_frame, text="Quit", command=master.quit)
        quit_button.pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    def browse(self):
        """Open a directory chooser and copy the selection into the path entry."""
        folder_path = filedialog.askdirectory()
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def transcribe(self):
        """Read the form values, run the transcription, and show the result."""
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        language = None  # auto-detect; the language entry is disabled for now
        verbose = self.verbose_var.get()
        # Call the transcribe function with the appropriate arguments.
        result = transcribe(path, file_type, model=model, language=language, verbose=verbose)
        # Use the explicitly imported `messagebox` rather than `tk.messagebox`,
        # which only resolves as a side effect of `from tkinter import messagebox`.
        messagebox.showinfo("Finished!", result)
if __name__ == "__main__":
    # Themed root window (instead of a plain tk.Tk) for nicer widget styling.
    window = ThemedTk(theme="clearlooks")
    window.geometry("300x200")
    application = App(window)
    window.mainloop()
+5 -1
View File
@@ -1,5 +1,7 @@
## transcribe
Simple script that uses OpenAI's Whisper to transcribe audio files from your local folders.
## Note
This implementation and guide is mostly made for researchers not familiar with programming who want a way to transcribe their files locally, without an internet connection, as is usually required within ethical data practices and frameworks. Two examples are shown: a normal workflow with an internet connection, and one in which the model is loaded first, via openai-whisper, after which the transcription can be done without being connected to the internet.
### Instructions
#### Requirements
@@ -22,6 +24,8 @@ git clone https://github.com/soderstromkr/transcribe.git
and use the example.ipynb template to use the script **OR (for beginners)** download the ```transcribe.py``` file into your work folder. Then you can either import it to another script or notebook for use. I recommend jupyter notebook for new users, see the example below. (Remember to have transcribe.py and example.ipynb in the same working folder).
### Example
See the [example](example.ipynb) implementation on jupyter notebook.
See [example](example.ipynb) for an implementation on jupyter notebook, also added an example for a simple [workaround](example_no_internet.ipynb) to transcribe while offline.
[^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
[![DOI](https://zenodo.org/badge/617404576.svg)](https://zenodo.org/badge/latestdoi/617404576)
+1 -1
View File
@@ -40,7 +40,7 @@
"outputs": [],
"source": [
"path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe files with the file type, 'ogg', 'WAV'\n",
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate"
+231
View File
@@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "eba9e610",
"metadata": {},
"source": [
"A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "85cd2d12",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Whisper(\n",
" (encoder): AudioEncoder(\n",
" (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
" (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (decoder): TextDecoder(\n",
" (token_embedding): Embedding(51865, 1024)\n",
" (blocks): ModuleList(\n",
" (0-23): 24 x ResidualAttentionBlock(\n",
" (attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (cross_attn): MultiHeadAttention(\n",
" (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
" (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
" (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
" )\n",
" (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): Sequential(\n",
" (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
" (1): GELU(approximate='none')\n",
" (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
" )\n",
" (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" )\n",
" (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import whisper\n",
"#change to model size, bigger is more accurate but slower\n",
"whisper.load_model(\"medium\") #base, small, medium, large"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0d2acd54",
"metadata": {},
"outputs": [],
"source": [
"#after it loads, you can disconnect from the internet and run the rest"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a2cd4050",
"metadata": {},
"outputs": [],
"source": [
"from transcribe import transcribe"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "24e1d24e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on function transcribe in module transcribe:\n",
"\n",
"transcribe(path, file_type, model=None, language=None, verbose=True)\n",
" Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
"\n"
]
}
],
"source": [
"help(transcribe)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e52477fb",
"metadata": {},
"outputs": [],
"source": [
"path='sample_audio/'#folder path\n",
"file_type='ogg' #check your file for file type, will only transcribe those files\n",
"model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
"language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
"verbose = True # prints output while transcribing, False to deactivate"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d66866af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using medium model, you can change this by specifying model=\"medium\" for example\n",
"Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
"Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
"Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
"\n",
"There are 2 ogg files in path: sample_audio/\n",
"\n",
"\n",
"Loading model...\n",
"Transcribing file number number 1: Armstrong_Small_Step\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: English\n",
"[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
"\n",
"Finished file number 1.\n",
"\n",
"\n",
"\n",
"Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
"Model and file loaded...\n",
"Starting transcription...\n",
"\n",
"Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
"Detected language: Swedish\n",
"[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
"\n",
"Finished file number 2.\n",
"\n",
"\n",
"\n"
]
},
{
"data": {
"text/plain": [
"'Finished transcription, files can be found in sample_audio/transcriptions'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transcribe(path, file_type, model, language, verbose)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bc67265",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
+5
View File
@@ -0,0 +1,5 @@
@echo off
REM Launch the transcription GUI inside the right conda environment.
REM cmd.exe does not treat single quotes as quoting characters, so the
REM placeholder paths must use double quotes (needed when they contain spaces).
call "PATH_TO_CONDA"
call "ACTIVATE_NEEDED_ENVS"
call python GUI.py
PAUSE
+6 -8
View File
@@ -5,23 +5,21 @@ def transcribe(path, file_type, model=None, language=None, verbose=True):
'''Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions'''
try:
os.mkdir('{}transcriptions'.format(path))
os.mkdir('{}/transcriptions'.format(path))
except FileExistsError:
pass
glob_file = glob.glob(path+'/*{}'.format(file_type))
path = path
print('Using {} model, you can change this by specifying model="medium" for example'.format(model))
print('Only looking for file type {}, you can change this by specifying file_type="mp3"'.format(file_type))
print('Expecting {} language, you can change this by specifying language="English". None will try to auto-detect'.format(language))
print('Verbosity is {}. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False'.format(verbose))
print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path))
print('Loading model...')
model = whisper.load_model(model)
for idx,file in enumerate(glob_file):
title = os.path.basename(file).split('.')[0]
@@ -40,7 +38,7 @@ def transcribe(path, file_type, model=None, language=None, verbose=True):
end.append(result['segments'][i]['end'])
text.append(result['segments'][i]['text'])
with open("{}transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
with open("{}/transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
file.write(title)
file.write('\nIn seconds:')
for i in range(len(result['segments'])):
@@ -48,4 +46,4 @@ def transcribe(path, file_type, model=None, language=None, verbose=True):
print('\nFinished file number {}.\n\n\n'.format(idx+1))
return 'Finished transcription, files can be found in {}transcriptions'.format(path)
return 'Finished transcription, files can be found in {}/transcriptions'.format(path)