Compare commits
30 Commits
| SHA1 |
|---|
| 4e1c709f43 |
| dfe967bd58 |
| 586289efe5 |
| c5a5597eee |
| ce8c365fc4 |
| e2afd34170 |
| 6fa49e41d9 |
| 1da9adbf5e |
| 2769ddf68b |
| 1128e44486 |
| eec20b48c4 |
| b569d41aa9 |
| 99a6625e0e |
| b09114625a |
| 785f2b8215 |
| 412ab97157 |
| a14196b055 |
| c319316a4d |
| 26c6f84e72 |
| 8f76466f57 |
| 1f684a848a |
| bf75df30a4 |
| f5a8b19b65 |
| 7bbfef44cb |
| acadd17007 |
| 918c2e489e |
| 4710c61e22 |
| eea7441e43 |
| 4701f94bc2 |
| eda1adca07 |
**CITATION.cff** (+3 -3)

@@ -1,11 +1,11 @@
 cff-version: 1.2.0
-message: "If you find this implementation useful in your research, please cite it as below."
+message: "If you find this implementation useful in your research, and want to cite it, please do so as below."
 authors:
 - family-names: "Söderström"
   given-names: "Kristofer Rolf"
   orcid: "https://orcid.org/0000-0002-5322-3350"
 title: "transcribe"
-version: 1.0
-doi: None
+version: 1.1.1
+doi: 10.5281/zenodo.7760511
 date-released: 2023-03-22
 url: "https://github.com/soderstromkr/transcribe"
**GUI.py** (new file, +100)

```python
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog
from tkinter import messagebox
from transcribe import transcribe
from ttkthemes import ThemedTk
import whisper
import numpy as np
import glob, os


class App:
    def __init__(self, master):
        self.master = master
        master.title("Local Transcribe")

        # style options
        style = ttk.Style()
        style.configure('TLabel', font=('Arial', 10), padding=10)
        style.configure('TEntry', font=('Arial', 10), padding=10)
        style.configure('TButton', font=('Arial', 10), padding=10)
        style.configure('TCheckbutton', font=('Arial', 10), padding=10)

        # Folder Path
        path_frame = ttk.Frame(master, padding=10)
        path_frame.pack(fill=tk.BOTH)
        path_label = ttk.Label(path_frame, text="Folder Path:")
        path_label.pack(side=tk.LEFT, padx=5)
        self.path_entry = ttk.Entry(path_frame, width=50)
        self.path_entry.insert(10, 'sample_audio/')
        self.path_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)
        browse_button = ttk.Button(path_frame, text="Browse", command=self.browse)
        browse_button.pack(side=tk.LEFT, padx=5)

        # File Type
        file_type_frame = ttk.Frame(master, padding=10)
        file_type_frame.pack(fill=tk.BOTH)
        file_type_label = ttk.Label(file_type_frame, text="File Type:")
        file_type_label.pack(side=tk.LEFT, padx=5)
        self.file_type_entry = ttk.Entry(file_type_frame, width=50)
        self.file_type_entry.insert(10, 'ogg')
        self.file_type_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Model
        model_frame = ttk.Frame(master, padding=10)
        model_frame.pack(fill=tk.BOTH)
        model_label = ttk.Label(model_frame, text="Model:")
        model_label.pack(side=tk.LEFT, padx=5)
        self.model_entry = ttk.Entry(model_frame, width=50)
        self.model_entry.insert(10, 'small')
        self.model_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Language (currently disabled)
        #language_frame = ttk.Frame(master, padding=10)
        #language_frame.pack(fill=tk.BOTH)
        #language_label = ttk.Label(language_frame, text="Language:")
        #language_label.pack(side=tk.LEFT, padx=5)
        #self.language_entry = ttk.Entry(language_frame, width=50)
        #self.language_entry.insert(10, np.nan)
        #self.language_entry.pack(side=tk.LEFT, fill=tk.X, expand=True)

        # Verbose
        verbose_frame = ttk.Frame(master, padding=10)
        verbose_frame.pack(fill=tk.BOTH)
        self.verbose_var = tk.BooleanVar()
        verbose_checkbutton = ttk.Checkbutton(verbose_frame, text="Verbose", variable=self.verbose_var)
        verbose_checkbutton.pack(side=tk.LEFT, padx=5)

        # Buttons
        button_frame = ttk.Frame(master, padding=10)
        button_frame.pack(fill=tk.BOTH)
        transcribe_button = ttk.Button(button_frame, text="Transcribe Audio", command=self.transcribe)
        transcribe_button.pack(side=tk.LEFT, padx=5, pady=10, fill=tk.X, expand=True)
        quit_button = ttk.Button(button_frame, text="Quit", command=master.quit)
        quit_button.pack(side=tk.RIGHT, padx=5, pady=10, fill=tk.X, expand=True)

    def browse(self):
        folder_path = filedialog.askdirectory()
        self.path_entry.delete(0, tk.END)
        self.path_entry.insert(0, folder_path)

    def transcribe(self):
        path = self.path_entry.get()
        file_type = self.file_type_entry.get()
        model = self.model_entry.get()
        #language = self.language_entry.get()
        language = None # set to auto-detect
        verbose = self.verbose_var.get()

        # Call the transcribe function with the appropriate arguments
        result = transcribe(path, file_type, model=model, language=language, verbose=verbose)

        # Show the result in a message box
        tk.messagebox.showinfo("Finished!", result)


if __name__ == "__main__":
    # root = tk.Tk()
    root = ThemedTk(theme="clearlooks")
    app = App(root)
    root.mainloop()
```
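For orientation, the Transcribe button above reduces to a single call into transcribe.py; a minimal headless sketch using the GUI's pre-filled defaults (sample_audio/, ogg, small), assuming transcribe.py is importable from the working directory:

```python
# Headless sketch of what the GUI's "Transcribe Audio" button does,
# using the default entry values; transcribe.py must be importable.
from transcribe import transcribe

result = transcribe('sample_audio/', 'ogg', model='small', language=None, verbose=False)
print(result)  # the GUI shows this string in a message box instead
```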
**How to run on Mac** (new text file, +5)

```
### How to run on Mac
Unfortunately, I have not found a permanent solution for this; not being a Mac user has limited the ways I can test it. For now, these are the recommended steps for a beginner:
1. Open a terminal and navigate to the root folder (transcribe-main if you downloaded the folder). You can also right-click (or equivalent) on the root folder to open a Terminal within the folder.
2. Run the following command:
python GUI.py
```
**README.md**

@@ -1,5 +1,7 @@
 ## transcribe
 Simple script that uses OpenAI's Whisper to transcribe audio files from your local folders.
+## Note
+This implementation and guide are mostly made for researchers unfamiliar with programming who want a way to transcribe their files locally, without an internet connection, as is often required by ethical data practices and frameworks. Two examples are shown: a normal workflow with an internet connection, and one in which the model is loaded first, via openai-whisper, so that transcription can then be done offline. There is now also a GUI implementation; read below for more information.
+
 ### Instructions
 #### Requirements

@@ -14,14 +16,28 @@ Users might not need to specifically install Transformers. However, a conda insta…
 ```
 pip install -U openai-whisper
 ```
+4. There is an option to run a batch file that launches a GUI built on Tkinter and ttkthemes. If you use it, make sure both are installed in your Python build; you can install them via pip.
+```
+pip install tk
+```
+and
+```
+pip install ttkthemes
+```
 #### Using the script
 This is a simple script with no installation. You can either clone the repository with
 ```
 git clone https://github.com/soderstromkr/transcribe.git
 ```
-and use the example.ipynb template to use the script **OR (for beginners)** download the ```transcribe.py``` file into your work folder. Then you can either import it to another script or notebook for use. I recommend jupyter notebook for new users, see the example below. (Remember to have transcribe.py and example.ipynb in the same working folder).
+and use the example.ipynb template to use the script.
+**OR** download the ```transcribe.py``` file into your work folder. Then you can import it into another script or notebook. I recommend Jupyter Notebook for new users; see the example below. (Remember to keep transcribe.py and example.ipynb in the same working folder.)
+#### Example with jupyter notebook
+See [example](example.ipynb) for an implementation in a Jupyter notebook; there is also an example of a simple [workaround](example_no_internet.ipynb) to transcribe while offline.
+#### Using the GUI
+You can also run the GUI version from your terminal with ```python GUI.py```, or with the batch file run_Windows.bat (for Windows users; Mac users should read the text file for instructions); just make sure to add your conda path to it. If you want to download a model first and then go offline for transcription, I recommend running the model once on the default sample folder, which will download it locally. The GUI should look like this:
+
+![image](images/GUI.png)
-### Example
-See the [example](example.ipynb) implementation on jupyter notebook.
 
 [^1]: Advanced users can use ```pip install ffmpeg-python``` but be ready to deal with some [PATH issues](https://stackoverflow.com/questions/65836756/python-ffmpeg-wont-accept-path-why), which I encountered in Windows 11.
+
+[![DOI](https://zenodo.org/badge/617404576.svg)](https://zenodo.org/badge/latestdoi/617404576)
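The Note added to the README above describes an offline workflow: load (and thereby download) the model once while connected, then transcribe with no connection. A minimal sketch of that flow, assuming openai-whisper is installed and the repo's transcribe.py is on the import path:

```python
# Sketch of the offline workflow described in the README's new Note.
# Assumes openai-whisper is installed and transcribe.py is importable.
import whisper
from transcribe import transcribe

whisper.load_model("medium")  # while online: downloads weights to the local cache

# ...disconnect from the internet; subsequent loads use the cache...
transcribe('sample_audio/', 'ogg', model='medium', language=None, verbose=True)
```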
**example.ipynb** (+1 -1)

@@ -40,7 +40,7 @@
    "outputs": [],
    "source": [
     "path='sample_audio/'#folder path\n",
-    "file_type='ogg' #check your file for file type, will only transcribe files with the file type, 'ogg', 'WAV'\n",
+    "file_type='ogg' #check your file for file type, will only transcribe those files\n",
     "model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
     "language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
     "verbose = True # prints output while transcribing, False to deactivate"
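For context, the parameter cell changed in this hunk drives a single call to the transcribe function elsewhere in the notebook; a sketch in keyword form (the notebook itself may pass the arguments positionally):

```python
# The notebook's parameter cell feeds one call to transcribe();
# keyword form shown here for clarity.
from transcribe import transcribe

path = 'sample_audio/'  # folder path
file_type = 'ogg'       # only files of this type are transcribed
model = 'medium'        # 'small', 'medium', 'large' (speed vs. accuracy)
language = None         # None tries to auto-detect
verbose = True          # print text while transcribing

transcribe(path, file_type, model=model, language=language, verbose=verbose)
```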
**example_no_internet.ipynb** (new file, +231)

```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "eba9e610",
   "metadata": {},
   "source": [
    "A simple way to avoid being connected while transcribing is to first load the model version you want to use. See [here](https://github.com/openai/whisper/blob/main/README.md#available-models-and-languages) for more info."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "85cd2d12",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Whisper(\n",
       "  (encoder): AudioEncoder(\n",
       "    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
       "    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))\n",
       "    (blocks): ModuleList(\n",
       "      (0-23): 24 x ResidualAttentionBlock(\n",
       "        (attn): MultiHeadAttention(\n",
       "          (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "          (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        )\n",
       "        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "        (mlp): Sequential(\n",
       "          (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "          (1): GELU(approximate='none')\n",
       "          (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "        )\n",
       "        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "      )\n",
       "    )\n",
       "    (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       "  (decoder): TextDecoder(\n",
       "    (token_embedding): Embedding(51865, 1024)\n",
       "    (blocks): ModuleList(\n",
       "      (0-23): 24 x ResidualAttentionBlock(\n",
       "        (attn): MultiHeadAttention(\n",
       "          (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "          (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        )\n",
       "        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "        (cross_attn): MultiHeadAttention(\n",
       "          (query): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (key): Linear(in_features=1024, out_features=1024, bias=False)\n",
       "          (value): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "          (out): Linear(in_features=1024, out_features=1024, bias=True)\n",
       "        )\n",
       "        (cross_attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "        (mlp): Sequential(\n",
       "          (0): Linear(in_features=1024, out_features=4096, bias=True)\n",
       "          (1): GELU(approximate='none')\n",
       "          (2): Linear(in_features=4096, out_features=1024, bias=True)\n",
       "        )\n",
       "        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "      )\n",
       "    )\n",
       "    (ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
       "  )\n",
       ")"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import whisper\n",
    "#change to model size, bigger is more accurate but slower\n",
    "whisper.load_model(\"medium\") #base, small, medium, large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0d2acd54",
   "metadata": {},
   "outputs": [],
   "source": [
    "#after it loads, you can disconnect from the internet and run the rest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a2cd4050",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transcribe import transcribe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "24e1d24e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on function transcribe in module transcribe:\n",
      "\n",
      "transcribe(path, file_type, model=None, language=None, verbose=True)\n",
      "    Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(transcribe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e52477fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "path='sample_audio/'#folder path\n",
    "file_type='ogg' #check your file for file type, will only transcribe those files\n",
    "model='medium' #'small', 'medium', 'large' (tradeoff between speed and accuracy)\n",
    "language= None #tries to auto-detect, other options include 'English', 'Spanish', etc...\n",
    "verbose = True # prints output while transcribing, False to deactivate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d66866af",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using medium model, you can change this by specifying model=\"medium\" for example\n",
      "Only looking for file type ogg, you can change this by specifying file_type=\"mp3\"\n",
      "Expecting None language, you can change this by specifying language=\"English\". None will try to auto-detect\n",
      "Verbosity is True. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False\n",
      "\n",
      "There are 2 ogg files in path: sample_audio/\n",
      "\n",
      "\n",
      "Loading model...\n",
      "Transcribing file number number 1: Armstrong_Small_Step\n",
      "Model and file loaded...\n",
      "Starting transcription...\n",
      "\n",
      "Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
      "Detected language: English\n",
      "[00:00.000 --> 00:24.000] That's one small step for man, one giant leap for mankind.\n",
      "\n",
      "Finished file number 1.\n",
      "\n",
      "\n",
      "\n",
      "Transcribing file number number 2: Axel_Pettersson_röstinspelning\n",
      "Model and file loaded...\n",
      "Starting transcription...\n",
      "\n",
      "Detecting language using up to the first 30 seconds. Use `--language` to specify the language\n",
      "Detected language: Swedish\n",
      "[00:00.000 --> 00:16.000] Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.\n",
      "\n",
      "Finished file number 2.\n",
      "\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'Finished transcription, files can be found in sample_audio/transcriptions'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transcribe(path, file_type, model, language, verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bc67265",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
```
**images/GUI.png** (new binary file, 29 KiB; not shown)
**run_Windows.bat** (new file, +5)

```bat
@echo off
echo Starting...
call conda activate base
REM OPTION 2 : (KEEP TEXT WITHIN QUOTES AND CHANGE USERNAME) "C:/Users/user/Anaconda3/condabin/activate.bat"
call python GUI.py
```
**sample_audio/transcriptions/Armstrong_Small_Step.txt**

@@ -1,3 +1,5 @@
 Armstrong_Small_Step
 In seconds:
-[0.00 --> 24.00]: That's one small step for man, one giant leap for mankind.
+[0.00 --> 7.00]: I'm going to step off the limb now.
+[7.00 --> 18.00]: That's one small step for man.
+[18.00 --> 24.00]: One giant leap for mankind.
**sample_audio/transcriptions/Axel_Pettersson_röstinspelning.txt**

@@ -1,3 +1,4 @@
 Axel_Pettersson_röstinspelning
 In seconds:
-[0.00 --> 16.00]: Hej, jag heter Axel Pettersson, jag föddes i Örebro 1976. Jag har varit Wikipedia sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
+[0.00 --> 6.14]: Hej, jag heter Axel Pettersson. Jag följer bror 1976.
+[6.40 --> 15.10]: Jag har varit vikerpedjan sen 2008 och jag har översatt röstintroduktionsprojektet till svenska.
**transcribe.py** (+20 -15)

@@ -1,27 +1,32 @@
 import whisper
 import glob, os
+#import torch #uncomment if using torch with cuda, below too
+import datetime
+
-def transcribe(path, file_type, model=None, language=None, verbose=True):
+def transcribe(path, file_type, model=None, language=None, verbose=False):
     '''Implementation of OpenAI's whisper model. Downloads model, transcribes audio files in a folder and returns the text files with transcriptions'''
 
     try:
-        os.mkdir('{}transcriptions'.format(path))
+        os.mkdir('{}/transcriptions'.format(path))
     except FileExistsError:
         pass
 
     glob_file = glob.glob(path+'/*{}'.format(file_type))
-    path = path
 
-    print('Using {} model, you can change this by specifying model="medium" for example'.format(model))
-    print('Only looking for file type {}, you can change this by specifying file_type="mp3"'.format(file_type))
-    print('Expecting {} language, you can change this by specifying language="English". None will try to auto-detect'.format(language))
-    print('Verbosity is {}. If TRUE it will print out the text as it is transcribed, you can turn this off by setting verbose=False'.format(verbose))
+    #if torch.cuda.is_available():
+    #    generator = torch.Generator('cuda').manual_seed(42)
+    #else:
+    #    generator = torch.Generator().manual_seed(42)
+
+    print('Using {} model'.format(model))
+    print('File type is {}'.format(file_type))
+    print('Language is being detected automatically for each file')
+    print('Verbosity is set to {}'.format(verbose))
     print('\nThere are {} {} files in path: {}\n\n'.format(len(glob_file), file_type, path))
 
     print('Loading model...')
     model = whisper.load_model(model)
 
     for idx,file in enumerate(glob_file):
         title = os.path.basename(file).split('.')[0]

@@ -30,22 +35,22 @@ def transcribe(path, file_type, model=None, language=None, verbose=True):
         result = model.transcribe(
             file,
             language=language,
-            verbose=True
+            verbose=verbose
         )
         start=[]
         end=[]
         text=[]
         for i in range(len(result['segments'])):
-            start.append(result['segments'][i]['start'])
-            end.append(result['segments'][i]['end'])
+            start.append(str(datetime.timedelta(seconds=(result['segments'][i]['start']))))
+            end.append(str(datetime.timedelta(seconds=(result['segments'][i]['end']))))
             text.append(result['segments'][i]['text'])
 
-        with open("{}transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
+        with open("{}/transcriptions/{}.txt".format(path,title), 'w', encoding='utf-8') as file:
             file.write(title)
             file.write('\nIn seconds:')
             for i in range(len(result['segments'])):
-                file.writelines('\n[{:.2f} --> {:.2f}]:{}'.format(start[i], end[i], text[i]))
+                file.writelines('\n[{} --> {}]:{}'.format(start[i], end[i], text[i]))
 
         print('\nFinished file number {}.\n\n\n'.format(idx+1))
 
-    return 'Finished transcription, files can be found in {}transcriptions'.format(path)
+    return 'Finished transcription, files can be found in {}/transcriptions'.format(path)
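The timestamp change in the second hunk swaps raw float seconds for datetime.timedelta strings in the written files; a quick sketch of the difference in formatting:

```python
# Old style formatted raw float seconds; the new style renders
# datetime.timedelta strings (hours:minutes:seconds).
import datetime

start = 24.0
print('[{:.2f}]'.format(start))                          # old: [24.00]
print('[{}]'.format(datetime.timedelta(seconds=start)))  # new: [0:00:24]
```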