From de5777208b52c5af8bf4a6438490c166efb252e9 Mon Sep 17 00:00:00 2001
From: Navodplayer1
Date: Mon, 22 Jan 2024 21:23:50 +0530
Subject: [PATCH] added faster-whisper

---
 README.md                        |  75 +++++--
 examples/preprocess.py           |   4 +-
 examples/transcribe.py           |   7 +-
 library.md                       |  62 ++++--
 metrics.txt                      |  67 ++++++
 requirements.txt                 |   4 +-
 setup.py                         |   6 +-
 setup_instruction.md             |   2 +-
 speechlib/convert_to_wav.py      |  22 ++
 speechlib/core_analysis.py       |  26 ++-
 speechlib/mp3_to_wav.py          |  13 --
 speechlib/speaker_recognition.py |  14 ++
 speechlib/speechlib.py           | 345 ++++++++++++++++++++-----------
 speechlib/transcribe.py          |  44 ++--
 speechlib/wav_segmenter.py       |   4 +-
 speechlib/whisper_large.py       |  31 ---
 speechlib/whisper_medium.py      |  31 ---
 speechlib/whisper_tiny.py        |  31 ---
 speechlib/write_log_file.py      |   9 +-
 19 files changed, 497 insertions(+), 300 deletions(-)
 create mode 100644 metrics.txt
 create mode 100644 speechlib/convert_to_wav.py
 delete mode 100644 speechlib/mp3_to_wav.py
 delete mode 100644 speechlib/whisper_large.py
 delete mode 100644 speechlib/whisper_medium.py
 delete mode 100644 speechlib/whisper_tiny.py

diff --git a/README.md b/README.md
index 2e26fee..ba9efd5 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,31 @@

-installation:
+### Requirements
+
+* Python 3.8 or greater
+
+### GPU execution
+
+GPU execution needs CUDA 11.
+
+GPU execution requires the following NVIDIA libraries to be installed:
+
+* [cuBLAS for CUDA 11](https://developer.nvidia.com/cublas)
+* [cuDNN 8 for CUDA 11](https://developer.nvidia.com/cudnn)
+
+There are multiple ways to install these libraries. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
+
+### Google Colab:
+
+On Google Colab, run this to install the CUDA dependencies:
+```
+!apt install libcublas11
+```
+
+You can see this example [notebook]()
+
+### installation:
 ```
 pip install speechlib
 ```
@@ -30,18 +54,20 @@ This library contains following audio preprocessing functions:
 
 3. re-encode the wav file to have 16-bit PCM encoding
 
-Transcriptor method takes 5 arguments.
+Transcriptor method takes 6 arguments.
 
 1. file to transcribe
 
 2. log_folder to store transcription
 
-3. language used for transcribing
+3. language used for transcribing (given as a language code, e.g. "en")
 
-4. model size ("tiny", "medium", or "large")
+4. model size ("tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3")
 
 5. voices_folder (contains speaker voice samples for speaker recognition)
 
+6. quantization: determines whether to use int8 quantization. Quantization may speed up the process but can lower the accuracy.
+
 voices_folder should contain subfolders named with speaker names. Each subfolder belongs to a speaker and it can contain many voice samples. This will be used for speaker recognition to identify the speaker.
 
 if voices_folder is not provided then speaker tags will be arbitrary.
@@ -55,13 +81,14 @@ transcript will also indicate the timeframe in seconds where each speaker speaks
 ```
 from speechlib import Transcriptor
 
-file = "obama1.wav"
+file = "obama_zach.wav"
 voices_folder = "voices"
-language = "english"
+language = "en"
 log_folder = "logs"
 modelSize = "medium"
+quantization = False # setting this to 'True' may speed up the process but can lower the accuracy
 
-transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder)
+transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder, quantization)
 
 res = transcriptor.transcribe()
 
@@ -70,20 +97,35 @@ res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"
 
 start: starting time of speech in seconds
 end: ending time of speech in seconds
-text: transcribed text for speech during start and end 
+text: transcribed text for speech during start and end
 speaker: speaker of the text
 
 voices_folder structure:
+```
+voices_folder
+|---> person1
+|       |---> sample1.wav
+|       |---> sample2.wav
+|       ...
+|
+|---> person2
+|       |---> sample1.wav
+|       |---> sample2.wav
+|       ...
+|--> ...
+``` -![voices_folder structure](voices_folder_structure1.png) - -Generated transcript: +supported language codes: -![Transcript](transcript.png) +``` +"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy", "id", "is","it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb", "ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn","mr", "ms", "mt", "my", "ne", "nl", "nn", "no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk","sl", "sn", "so", "sq", "sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz","vi", "yi", "yo", "zh", "yue" +``` -supported languages: +supported language names: -['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch', 'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish', 'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech', 'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu', 'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam', 'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian', 'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian', 'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian', 'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi', 'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans', 'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati', 'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole', 'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish', 'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar', 'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese', 'burmese', 'valencian', 'flemish', 'haitian', 'letzeburgesch', 'pushto', 'panjabi', 'moldavian', 'moldovan', 'sinhalese', 'castilian'] +``` +"Afrikaans", "Amharic", "Arabic", "Assamese", "Azerbaijani", "Bashkir", "Belarusian", "Bulgarian", "Bengali","Tibetan", "Breton", "Bosnian", "Catalan", "Czech", "Welsh", "Danish", "German", "Greek", "English", "Spanish","Estonian", "Basque", "Persian", "Finnish", "Faroese", "French", "Galician", "Gujarati", "Hausa", "Hawaiian","Hebrew", "Hindi", "Croatian", "Haitian", "Hungarian", "Armenian", "Indonesian", "Icelandic", "Italian", "Japanese","Javanese", "Georgian", "Kazakh", "Khmer", "Kannada", "Korean", "Latin", "Luxembourgish", "Lingala", "Lao","Lithuanian", "Latvian", "Malagasy", "Maori", "Macedonian", "Malayalam", "Mongolian", "Marathi", "Malay", "Maltese","Burmese", "Nepali", "Dutch", "Norwegian Nynorsk", "Norwegian", "Occitan", "Punjabi", "Polish", "Pashto","Portuguese", "Romanian", "Russian", "Sanskrit", "Sindhi", "Sinhalese", "Slovak", "Slovenian", "Shona", "Somali","Albanian", "Serbian", "Sundanese", "Swedish", "Swahili", "Tamil", "Telugu", "Tajik", "Thai", "Turkmen", "Tagalog","Turkish", "Tatar", "Ukrainian", "Urdu", "Uzbek", "Vietnamese", "Yiddish", "Yoruba", "Chinese", "Cantonese", +``` ### Audio preprocessing example: @@ -93,9 +135,7 @@ from speechlib import PreProcessor file = "obama1.mp3" # convert mp3 to wav -PreProcessor.mp3_to_wav(file) - -wav_file = "obama1.wav" +wav_file = PreProcessor.convert_to_wav(file) # convert wav file from stereo to mono PreProcessor.convert_to_mono(wav_file) @@ -108,5 +148,4 @@ This library uses following huggingface models: #### 
https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb #### https://huggingface.co/Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2 -#### https://huggingface.co/openai/whisper-medium #### https://huggingface.co/pyannote/speaker-diarization \ No newline at end of file diff --git a/examples/preprocess.py b/examples/preprocess.py index 60f1b02..0cffbc8 100644 --- a/examples/preprocess.py +++ b/examples/preprocess.py @@ -3,9 +3,7 @@ file = "obama1.mp3" # convert mp3 to wav -PreProcessor.mp3_to_wav(file) - -wav_file = "obama1.wav" +wav_file = PreProcessor.convert_to_wav(file) # convert wav file from stereo to mono PreProcessor.convert_to_mono(wav_file) diff --git a/examples/transcribe.py b/examples/transcribe.py index cddfc17..4b0bc0d 100644 --- a/examples/transcribe.py +++ b/examples/transcribe.py @@ -2,12 +2,11 @@ file = "obama_zach.wav" voices_folder = "voices" -language = "english" +language = "en" log_folder = "logs" modelSize = "medium" +quantization = False # setting this 'True' may speed up the process but lower the accuracy -transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder) +transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder, quantization) res = transcriptor.transcribe() - -print("res", res) \ No newline at end of file diff --git a/library.md b/library.md index cc3579c..d7d2604 100644 --- a/library.md +++ b/library.md @@ -1,8 +1,32 @@ -installation: +### Requirements + +* Python 3.8 or greater + +### GPU execution + +GPU execution needs CUDA 11. + +GPU execution requires the following NVIDIA libraries to be installed: + +* [cuBLAS for CUDA 11](https://developer.nvidia.com/cublas) +* [cuDNN 8 for CUDA 11](https://developer.nvidia.com/cudnn) + +There are multiple ways to install these libraries. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below. + +### Google Colab: + +on google colab run this to install CUDA dependencies: +``` +!apt install libcublas11 +``` + +You can see this example [notebook]() + +### installation: ``` pip install speechlib ``` - + This library does speaker diarization, speaker recognition, and transcription on a single wav file to provide a transcript with actual speaker names. This library will also return an array containing result information. ⚙ This library contains following audio preprocessing functions: @@ -13,18 +37,20 @@ This library contains following audio preprocessing functions: 3. re-encode the wav file to have 16-bit PCM encoding -Transcriptor method takes 5 arguments. +Transcriptor method takes 6 arguments. 1. file to transcribe 2. log_folder to store transcription -3. language used for transcribing +3. language used for transcribing (language code is used) -4. model size ("tiny", "medium", or "large") +4. model size ("tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3") 5. voices_folder (contains speaker voice samples for speaker recognition) +6. quantization: this determine whether to use int8 quantization or not. Quantization may speed up the process but lower the accuracy. + voices_folder should contain subfolders named with speaker names. Each subfolder belongs to a speaker and it can contain many voice samples. This will be used for speaker recognition to identify the speaker. if voices_folder is not provided then speaker tags will be arbitrary. 
@@ -38,13 +64,14 @@ transcript will also indicate the timeframe in seconds where each speaker speaks ``` from speechlib import Transcriptor -file = "obama1.wav" +file = "obama_zach.wav" voices_folder = "voices" -language = "english" +language = "en" log_folder = "logs" modelSize = "medium" +quantization = False # setting this 'True' may speed up the process but lower the accuracy -transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder) +transcriptor = Transcriptor(file, log_folder, language, modelSize, voices_folder, quantization) res = transcriptor.transcribe() @@ -53,7 +80,7 @@ res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker" start: starting time of speech in seconds end: ending time of speech in seconds -text: transcribed text for speech during start and end +text: transcribed text for speech during start and end speaker: speaker of the text voices_folder structure: @@ -71,9 +98,17 @@ voices_folder |--> ... ``` -supported languages: +supported language codes: + +``` +"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy", "id", "is","it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb", "ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn","mr", "ms", "mt", "my", "ne", "nl", "nn", "no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk","sl", "sn", "so", "sq", "sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz","vi", "yi", "yo", "zh", "yue" +``` + +supported language names: -['english', 'chinese', 'german', 'spanish', 'russian', 'korean', 'french', 'japanese', 'portuguese', 'turkish', 'polish', 'catalan', 'dutch', 'arabic', 'swedish', 'italian', 'indonesian', 'hindi', 'finnish', 'vietnamese', 'hebrew', 'ukrainian', 'greek', 'malay', 'czech', 'romanian', 'danish', 'hungarian', 'tamil', 'norwegian', 'thai', 'urdu', 'croatian', 'bulgarian', 'lithuanian', 'latin', 'maori', 'malayalam', 'welsh', 'slovak', 'telugu', 'persian', 'latvian', 'bengali', 'serbian', 'azerbaijani', 'slovenian', 'kannada', 'estonian', 'macedonian', 'breton', 'basque', 'icelandic', 'armenian', 'nepali', 'mongolian', 'bosnian', 'kazakh', 'albanian', 'swahili', 'galician', 'marathi', 'punjabi', 'sinhala', 'khmer', 'shona', 'yoruba', 'somali', 'afrikaans', 'occitan', 'georgian', 'belarusian', 'tajik', 'sindhi', 'gujarati', 'amharic', 'yiddish', 'lao', 'uzbek', 'faroese', 'haitian creole', 'pashto', 'turkmen', 'nynorsk', 'maltese', 'sanskrit', 'luxembourgish', 'myanmar', 'tibetan', 'tagalog', 'malagasy', 'assamese', 'tatar', 'hawaiian', 'lingala', 'hausa', 'bashkir', 'javanese', 'sundanese', 'burmese', 'valencian', 'flemish', 'haitian', 'letzeburgesch', 'pushto', 'panjabi', 'moldavian', 'moldovan', 'sinhalese', 'castilian'] +``` +"Afrikaans", "Amharic", "Arabic", "Assamese", "Azerbaijani", "Bashkir", "Belarusian", "Bulgarian", "Bengali","Tibetan", "Breton", "Bosnian", "Catalan", "Czech", "Welsh", "Danish", "German", "Greek", "English", "Spanish","Estonian", "Basque", "Persian", "Finnish", "Faroese", "French", "Galician", "Gujarati", "Hausa", "Hawaiian","Hebrew", "Hindi", "Croatian", "Haitian", "Hungarian", "Armenian", "Indonesian", "Icelandic", "Italian", "Japanese","Javanese", "Georgian", "Kazakh", "Khmer", "Kannada", "Korean", "Latin", "Luxembourgish", "Lingala", "Lao","Lithuanian", "Latvian", "Malagasy", "Maori", "Macedonian", "Malayalam", 
"Mongolian", "Marathi", "Malay", "Maltese","Burmese", "Nepali", "Dutch", "Norwegian Nynorsk", "Norwegian", "Occitan", "Punjabi", "Polish", "Pashto","Portuguese", "Romanian", "Russian", "Sanskrit", "Sindhi", "Sinhalese", "Slovak", "Slovenian", "Shona", "Somali","Albanian", "Serbian", "Sundanese", "Swedish", "Swahili", "Tamil", "Telugu", "Tajik", "Thai", "Turkmen", "Tagalog","Turkish", "Tatar", "Ukrainian", "Urdu", "Uzbek", "Vietnamese", "Yiddish", "Yoruba", "Chinese", "Cantonese", +``` ### Audio preprocessing example: @@ -83,9 +118,7 @@ from speechlib import PreProcessor file = "obama1.mp3" # convert mp3 to wav -PreProcessor.mp3_to_wav(file) - -wav_file = "obama1.wav" +wav_file = PreProcessor.convert_to_wav(file) # convert wav file from stereo to mono PreProcessor.convert_to_mono(wav_file) @@ -98,5 +131,4 @@ This library uses following huggingface models: #### https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb #### https://huggingface.co/Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2 -#### https://huggingface.co/openai/whisper-medium #### https://huggingface.co/pyannote/speaker-diarization \ No newline at end of file diff --git a/metrics.txt b/metrics.txt new file mode 100644 index 0000000..ff0977a --- /dev/null +++ b/metrics.txt @@ -0,0 +1,67 @@ +These metrics are from Google Colab tests. +These metrics do not take into account model download times. +These metrics are done without quantization enabled. +(quantization will make this even faster) + +metrics for faster-whisper "tiny" model: + on cpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: + speaker recognition time: + transcription time: + + on gpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: 24s + speaker recognition time: 10s + transcription time: 64s + + +metrics for faster-whisper "small" model: + on cpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: + speaker recognition time: + transcription time: + + on gpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: 24s + speaker recognition time: 10s + transcription time: 95s + + +metrics for faster-whisper "medium" model: + on cpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: + speaker recognition time: + transcription time: + + on gpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: 24s + speaker recognition time: 10s + transcription time: 193s + + +metrics for faster-whisper "large" model: + on cpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: + speaker recognition time: + transcription time: + + on gpu: + audio name: obama_zach.wav + duration: 6 min 36 s + diarization time: 24s + speaker recognition time: 10s + transcription time: 343s \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 695c2b9..ef661ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ torch torchaudio pydub pyannote.audio -speechbrain \ No newline at end of file +speechbrain +accelerate +faster-whisper \ No newline at end of file diff --git a/setup.py b/setup.py index b6e3dce..f631a03 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="speechlib", - version="1.0.7", + version="1.0.10", description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. 
This library also contain audio preprocessor functions.", packages=find_packages(), long_description=long_description, @@ -19,6 +19,6 @@ "Programming Language :: Python :: 3.10", "Operating System :: OS Independent", ], - install_requires=["transformers", "torch", "torchaudio", "pydub", "pyannote.audio", "speechbrain", "accelerate"], - python_requires=">=3.7", + install_requires=["transformers", "torch", "torchaudio", "pydub", "pyannote.audio", "speechbrain", "accelerate", "faster-whisper"], + python_requires=">=3.8", ) \ No newline at end of file diff --git a/setup_instruction.md b/setup_instruction.md index 6e27e4b..94ad9b6 100644 --- a/setup_instruction.md +++ b/setup_instruction.md @@ -9,7 +9,7 @@ for publishing: pip install twine for install locally for testing: - pip install dist/speechlib-1.0.6-py3-none-any.whl + pip install dist/speechlib-1.0.10-py3-none-any.whl finally run: twine upload dist/* diff --git a/speechlib/convert_to_wav.py b/speechlib/convert_to_wav.py new file mode 100644 index 0000000..7b7aecd --- /dev/null +++ b/speechlib/convert_to_wav.py @@ -0,0 +1,22 @@ +from pydub import AudioSegment +import os + +def convert_to_wav(input_file): + # Load the MP3 file using pydub + # Check if the file is already in WAV format + if input_file.lower().endswith(".wav"): + print(f"{input_file} is already in WAV format.") + return input_file + + audio = AudioSegment.from_file(input_file) + + # Create the output WAV file path + wav_path = os.path.splitext(input_file)[0] + ".wav" + + # Export the audio to WAV + audio.export(wav_path, format="wav") + + print(f"{input_file} has been converted to WAV format.") + + return wav_path + diff --git a/speechlib/core_analysis.py b/speechlib/core_analysis.py index a0fd4d3..f9db0a3 100644 --- a/speechlib/core_analysis.py +++ b/speechlib/core_analysis.py @@ -1,4 +1,5 @@ from pyannote.audio import Pipeline +import time from .hf_access import (ACCESS_TOKEN) from .wav_segmenter import (wav_file_segmentation) import torch, torchaudio @@ -8,13 +9,17 @@ from .re_encode import (re_encode) from .convert_to_mono import (convert_to_mono) +from .convert_to_wav import (convert_to_wav) # by default use google speech-to-text API # if False, then use whisper finetuned version for sinhala -def core_analysis(file_name, voices_folder, log_folder, language, modelSize): +def core_analysis(file_name, voices_folder, log_folder, language, modelSize, quantization=False): # <-------------------PreProcessing file--------------------------> + # check if file is in wav format, if not convert to wav + file_name = convert_to_wav(file_name) + # convert file to mono convert_to_mono(file_name) @@ -36,7 +41,12 @@ def core_analysis(file_name, voices_folder, log_folder, language, modelSize): pipeline.to(device) waveform, sample_rate = torchaudio.load(file_name) + start_time = int(time.time()) + print("running diarization...") diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate}, min_speakers=0, max_speakers=10) + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"diarization done. 
Time taken: {elapsed_time} seconds.") speakers = {} @@ -62,12 +72,17 @@ def core_analysis(file_name, voices_folder, log_folder, language, modelSize): if voices_folder != None: identified = [] + start_time = int(time.time()) + print("running speaker recognition...") for spk_tag, spk_segments in speakers.items(): spk_name = speaker_recognition(file_name, voices_folder, spk_segments, identified) spk = spk_name identified.append(spk) speaker_map[spk_tag] = spk - + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"speaker recognition done. Time taken: {elapsed_time} seconds.") + keys_to_remove = [] merged = [] @@ -92,10 +107,15 @@ def core_analysis(file_name, voices_folder, log_folder, language, modelSize): del speaker_map[key] # transcribing the texts differently according to speaker + start_time = int(time.time()) + print("running transcription...") for spk_tag, spk_segments in speakers.items(): spk = speaker_map[spk_tag] - segment_out = wav_file_segmentation(file_name, spk_segments, language, modelSize) + segment_out = wav_file_segmentation(file_name, spk_segments, language, modelSize, quantization) speakers[spk_tag] = segment_out + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"transcription done. Time taken: {elapsed_time} seconds.") common_segments = [] diff --git a/speechlib/mp3_to_wav.py b/speechlib/mp3_to_wav.py deleted file mode 100644 index 494f3ec..0000000 --- a/speechlib/mp3_to_wav.py +++ /dev/null @@ -1,13 +0,0 @@ -from pydub import AudioSegment -import os - -def mp3_to_wav(input_file): - # Load the MP3 file using pydub - audio = AudioSegment.from_mp3(input_file) - - # Create the output WAV file path - wav_path = os.path.splitext(input_file)[0] + ".wav" - - # Export the audio to WAV - audio.export(wav_path, format="wav") - diff --git a/speechlib/speaker_recognition.py b/speechlib/speaker_recognition.py index 96fbabf..7c78ebb 100644 --- a/speechlib/speaker_recognition.py +++ b/speechlib/speaker_recognition.py @@ -25,6 +25,14 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards): i = 0 + ''' + iterate over segments and check speaker for increased accuracy. 
+ assign speaker name to arbitrary speaker tag 'SPEAKER_XX' + ''' + + limit = 60 + duration = 0 + for segment in segments: start = segment[0] * 1000 # start time in miliseconds end = segment[1] * 1000 # end time in miliseconds @@ -62,6 +70,12 @@ def speaker_recognition(file_name, voices_folder, segments, wildcards): # Delete the WAV file after processing os.remove(file) + + current_pred = max(Id_count, key=Id_count.get) + + duration += (end - start) + if duration >= limit and current_pred != "unknown": + break most_common_Id = max(Id_count, key=Id_count.get) return most_common_Id diff --git a/speechlib/speechlib.py b/speechlib/speechlib.py index c9aa52c..9718dec 100644 --- a/speechlib/speechlib.py +++ b/speechlib/speechlib.py @@ -1,147 +1,241 @@ from .core_analysis import (core_analysis) from .re_encode import (re_encode) from .convert_to_mono import (convert_to_mono) -from .mp3_to_wav import (mp3_to_wav) +from .convert_to_wav import (convert_to_wav) class Transcriptor: - '''transcribe a wav file - - arguments: - file: name of wav file with extension ex: file.wav + def __init__(self, file, log_folder, language, modelSize, voices_folder=None, quantization=False): + '''transcribe a wav file + + arguments: - log_folder: name of folder where transcript will be stored + file: name of wav file with extension ex: file.wav - language: language of wav file + log_folder: name of folder where transcript will be stored - modelSize: tiny, medium, large (bigger model is more accurate but slow!!) + language: language of wav file - voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. This will be used for speaker recognition + modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!) - see documentation: https://github.com/Navodplayer1/speechlib - ''' + voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. 
This will be used for speaker recognition - def __init__(self, file, log_folder, language, modelSize, voices_folder=None): - ''' + quantization: whether to use int8 quantization or not (default=False) + + see documentation: https://github.com/Navodplayer1/speechlib + + supported languages: - ['english', - 'chinese', - 'german', - 'spanish', - 'russian', - 'korean', - 'french', ' - japanese', - 'portuguese', - 'turkish', - 'polish', - 'catalan', - 'dutch', - 'arabic', - 'swedish', - 'italian', - 'indonesian', - 'hindi', - 'finnish', - 'vietnamese', - 'hebrew', - 'ukrainian', - 'greek', - 'malay', - 'czech', - 'romanian', - 'danish', - 'hungarian', - 'tamil', - 'norwegian', - 'thai', - 'urdu', - 'croatian', - 'bulgarian', - 'lithuanian', - 'latin', - 'maori', - 'malayalam', - 'welsh', - 'slovak', - 'telugu', - 'persian', - 'latvian', - 'bengali', - 'serbian', - 'azerbaijani', - 'slovenian', - 'kannada', - 'estonian', - 'macedonian', - 'breton', - 'basque', - 'icelandic', - 'armenian', - 'nepali', - 'mongolian', - 'bosnian', - 'kazakh', - 'albanian', - 'swahili', - 'galician', - 'marathi', - 'punjabi', - 'sinhala', - 'khmer', - 'shona', - 'yoruba', - 'somali', - 'afrikaans', - 'occitan', - 'georgian', - 'belarusian', - 'tajik', - 'sindhi', - 'gujarati', - 'amharic', - 'yiddish', - 'lao', - 'uzbek', - 'faroese', - 'haitian creole', - 'pashto', - 'turkmen', - 'nynorsk', - 'maltese', - 'sanskrit', - 'luxembourgish', - 'myanmar', - 'tibetan', - 'tagalog', - 'malagasy', - 'assamese', - 'tatar', - 'hawaiian', - 'lingala', - 'hausa', - 'bashkir', - 'javanese', - 'sundanese', - 'burmese', - 'valencian', - 'flemish', - 'haitian', - 'letzeburgesch', - 'pushto', - 'panjabi', - 'moldavian', - 'moldovan', - 'castilian'] + #### Afrikaans + "af", + #### Amharic + "am", + #### Arabic + "ar", + #### Assamese + "as", + #### Azerbaijani + "az", + #### Bashkir + "ba", + #### Belarusian + "be", + #### Bulgarian + "bg", + #### Bengali + "bn", + #### Tibetan + "bo", + #### Breton + "br", + #### Bosnian + "bs", + #### Catalan + "ca", + #### Czech + "cs", + #### Welsh + "cy", + #### Danish + "da", + #### German + "de", + #### Greek + "el", + #### English + "en", + #### Spanish + "es", + #### Estonian + "et", + #### Basque + "eu", + #### Persian + "fa", + #### Finnish + "fi", + #### Faroese + "fo", + #### French + "fr", + #### Galician + "gl", + #### Gujarati + "gu", + #### Hausa + "ha", + #### Hawaiian + "haw", + #### Hebrew + "he", + #### Hindi + "hi", + #### Croatian + "hr", + #### Haitian + "ht", + #### Hungarian + "hu", + #### Armenian + "hy", + #### Indonesian + "id", + #### Icelandic + "is", + #### Italian + "it", + #### Japanese + "ja", + #### Javanese + "jw", + #### Georgian + "ka", + #### Kazakh + "kk", + #### Khmer + "km", + #### Kannada + "kn", + #### Korean + "ko", + #### Latin + "la", + #### Luxembourgish + "lb", + #### Lingala + "ln", + #### Lao + "lo", + #### Lithuanian + "lt", + #### Latvian + "lv", + #### Malagasy + "mg", + #### Maori + "mi", + #### Macedonian + "mk", + #### Malayalam + "ml", + #### Mongolian + "mn", + #### Marathi + "mr", + #### Malay + "ms", + #### Maltese + "mt", + #### Burmese + "my", + #### Nepali + "ne", + #### Dutch + "nl", + #### Norwegian Nynorsk + "nn", + #### Norwegian + "no", + #### Occitan + "oc", + #### Punjabi + "pa", + #### Polish + "pl", + #### Pashto + "ps", + #### Portuguese + "pt", + #### Romanian + "ro", + #### Russian + "ru", + #### Sanskrit + "sa", + #### Sindhi + "sd", + #### Sinhalese + "si", + #### Slovak + "sk", + #### Slovenian + "sl", + #### Shona + 
"sn", + #### Somali + "so", + #### Albanian + "sq", + #### Serbian + "sr", + #### Sundanese + "su", + #### Swedish + "sv", + #### Swahili + "sw", + #### Tamil + "ta", + #### Telugu + "te", + #### Tajik + "tg", + #### Thai + "th", + #### Turkmen + "tk", + #### Tagalog + "tl", + #### Turkish + "tr", + #### Tatar + "tt", + #### Ukrainian + "uk", + #### Urdu + "ur", + #### Uzbek + "uz", + #### Vietnamese + "vi", + #### Yiddish + "yi", + #### Yoruba + "yo", + #### Chinese + "zh", + #### Cantonese + "yue", ''' self.file = file self.voices_folder = voices_folder self.language = language self.log_folder = log_folder self.modelSize = modelSize + self.quantization = quantization def transcribe(self): - res = core_analysis(self.file, self.voices_folder, self.log_folder, self.language, self.modelSize) + res = core_analysis(self.file, self.voices_folder, self.log_folder, self.language, self.modelSize, self.quantization) return res class PreProcessor: @@ -164,5 +258,6 @@ def re_encode(file): def convert_to_mono(file): convert_to_mono(file) - def mp3_to_wav(file): - mp3_to_wav(file) + def convert_to_wav(file): + path = convert_to_wav(file) + return path diff --git a/speechlib/transcribe.py b/speechlib/transcribe.py index 3ce4495..3cc4c74 100644 --- a/speechlib/transcribe.py +++ b/speechlib/transcribe.py @@ -1,22 +1,34 @@ +import torch from .whisper_sinhala import (whisper_sinhala) -from .whisper_medium import (whisper_medium) -from .whisper_large import (whisper_large) -from .whisper_tiny import (whisper_tiny) +from faster_whisper import WhisperModel -def transcribe(file, language, modelSize): - - if language == "sinhala" or language == "Sinhala": +def transcribe(file, language, model_size, quantization): + res = "" + if language == "si" or language == "Si": res = whisper_sinhala(file) return res - elif modelSize == "medium": - res = whisper_medium(file, language) - return res - elif modelSize == "large": - res = whisper_large(file, language) - return res - elif modelSize == "tiny": - res = whisper_tiny(file, language) - return res + elif model_size == "tiny" or model_size == "small" or model_size == "medium" or model_size == "large" or model_size == "large-v1" or model_size == "large-v2" or model_size == "large-v3": + + if torch.cuda.is_available(): + if quantization: + model = WhisperModel(model_size, device="cuda", compute_type="int8_float16") + else: + model = WhisperModel(model_size, device="cuda", compute_type="float16") + else: + if quantization: + model = WhisperModel(model_size, device="cpu", compute_type="int8") + else: + model = WhisperModel(model_size, device="cpu", compute_type="float32") + + if language in model.supported_languages: + segments, info = model.transcribe(file, language=language, beam_size=5) + + for segment in segments: + res += segment.text + " " + + return res + else: + Exception("Language code not supported.\nThese are the supported languages:\n", model.supported_languages) else: - raise Exception("only tiny, medium, large models are available. 
If you use Sinhala language, use tiny model") + raise Exception("only 'tiny', 'small', 'medium', 'large', 'large-v1', 'large-v2', 'large-v3' models are available.") diff --git a/speechlib/wav_segmenter.py b/speechlib/wav_segmenter.py index 6d137d9..b314602 100644 --- a/speechlib/wav_segmenter.py +++ b/speechlib/wav_segmenter.py @@ -3,7 +3,7 @@ from .transcribe import (transcribe) # segment according to speaker -def wav_file_segmentation(file_name, segments, language, modelSize): +def wav_file_segmentation(file_name, segments, language, modelSize, quantization): # Load the WAV file audio = AudioSegment.from_file(file_name, format="wav") trans = "" @@ -27,7 +27,7 @@ def wav_file_segmentation(file_name, segments, language, modelSize): clip.export(file, format="wav") try: - trans = transcribe(file, language, modelSize) + trans = transcribe(file, language, modelSize, quantization) # return -> [[start time, end time, transcript], [start time, end time, transcript], ..] texts.append([segment[0], segment[1], trans]) diff --git a/speechlib/whisper_large.py b/speechlib/whisper_large.py deleted file mode 100644 index c970aa0..0000000 --- a/speechlib/whisper_large.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline - -def whisper_large(file, language): - device = "cuda:0" if torch.cuda.is_available() else "cpu" - torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 - - model_id = "openai/whisper-large-v3" - - model = AutoModelForSpeechSeq2Seq.from_pretrained( - model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True - ) - model.to(device) - - processor = AutoProcessor.from_pretrained(model_id) - - pipe = pipeline( - "automatic-speech-recognition", - model=model, - tokenizer=processor.tokenizer, - feature_extractor=processor.feature_extractor, - max_new_tokens=128, - chunk_length_s=30, - batch_size=16, - return_timestamps=True, - torch_dtype=torch_dtype, - device=device, - ) - result = pipe(file, generate_kwargs={"language": language}) - - return result["text"] diff --git a/speechlib/whisper_medium.py b/speechlib/whisper_medium.py deleted file mode 100644 index e5da698..0000000 --- a/speechlib/whisper_medium.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -import torchaudio - -def whisper_medium(file, language): - device = "cuda:0" if torch.cuda.is_available() else "cpu" - # load model and processor - processor = WhisperProcessor.from_pretrained("openai/whisper-medium") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium") - - model.to(device) - model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe") - - # Load the WAV file - waveform, sample_rate = torchaudio.load(file) - resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) - - if sample_rate != 16000: - wav4trans = resampler(waveform) - else: - wav4trans = waveform - - input_features = processor(wav4trans.squeeze(0), sampling_rate=16000, return_tensors="pt").input_features - - # generate token ids - predicted_ids = model.generate(input_features) - # decode token ids to text - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - - return transcription[0] - diff --git a/speechlib/whisper_tiny.py b/speechlib/whisper_tiny.py deleted file mode 100644 index 3ff5dc4..0000000 --- a/speechlib/whisper_tiny.py +++ /dev/null @@ -1,31 
+0,0 @@ -import torch -from transformers import WhisperProcessor, WhisperForConditionalGeneration -import torchaudio - -def whisper_tiny(file, language): - device = "cuda:0" if torch.cuda.is_available() else "cpu" - # load model and processor - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - - model.to(device) - model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe") - - # Load the WAV file - waveform, sample_rate = torchaudio.load(file) - resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) - - if sample_rate != 16000: - wav4trans = resampler(waveform) - else: - wav4trans = waveform - - input_features = processor(wav4trans.squeeze(0), sampling_rate=16000, return_tensors="pt").input_features - - # generate token ids - predicted_ids = model.generate(input_features) - # decode token ids to text - transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) - - return transcription[0] - diff --git a/speechlib/write_log_file.py b/speechlib/write_log_file.py index 3ca40ec..dad4268 100644 --- a/speechlib/write_log_file.py +++ b/speechlib/write_log_file.py @@ -3,14 +3,17 @@ def write_log_file(common_segments, log_folder): + if not os.path.exists(log_folder): + os.makedirs(log_folder) + file_name = "output" current_datetime = datetime.now().strftime("%Y-%m-%d") #---------------------log file part------------------------- - log_file = file_name + "_" + current_datetime + ".txt" - - lf=open(os.path.join(log_folder, log_file),"wb") + log_file = log_folder + "/" + file_name + "_" + current_datetime + ".txt" + + lf=open(log_file,"wb") entry = ""
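
The new `quantization` flag maps directly onto faster-whisper compute types in `speechlib/transcribe.py`. Below is a condensed sketch of that selection logic for reference; it uses the same `WhisperModel` calls as the patch, minus the model-size and language validation, and the audio path is just a placeholder:

```python
import torch
from faster_whisper import WhisperModel

def load_whisper(model_size: str, quantization: bool) -> WhisperModel:
    # GPU: float16 by default, int8_float16 when quantization is requested.
    # CPU: float32 by default, int8 when quantization is requested.
    if torch.cuda.is_available():
        return WhisperModel(model_size, device="cuda",
                            compute_type="int8_float16" if quantization else "float16")
    return WhisperModel(model_size, device="cpu",
                        compute_type="int8" if quantization else "float32")

model = load_whisper("medium", quantization=False)
segments, info = model.transcribe("obama_zach.wav", language="en", beam_size=5)
text = " ".join(segment.text.strip() for segment in segments)
print(text)
```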
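
As a reference for the API after this patch (the `convert_to_wav` helper, language codes, and the `quantization` flag), here is a minimal end-to-end usage sketch. The file and folder names are placeholders reused from the examples above, and the result loop assumes the `res --> [["start", "end", "text", "speaker"], ...]` shape documented in the README:

```python
from speechlib import PreProcessor, Transcriptor

# Preprocess: convert_to_wav() returns the input path unchanged if it is already a .wav file
wav_file = PreProcessor.convert_to_wav("obama1.mp3")
PreProcessor.convert_to_mono(wav_file)
PreProcessor.re_encode(wav_file)

transcriptor = Transcriptor(
    file=wav_file,
    log_folder="logs",        # transcript log is written here (the folder is created if missing)
    language="en",            # a language code from the supported list
    modelSize="medium",       # "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"
    voices_folder="voices",   # omit to get arbitrary speaker tags
    quantization=False,       # True enables int8 quantization: faster, possibly less accurate
)

res = transcriptor.transcribe()

# Each entry is [start, end, text, speaker] with times in seconds
for start, end, text, speaker in res:
    print(f"[{start}s - {end}s] {speaker}: {text}")
```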