diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5957730 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +venv +build +dist +speechlib.egg-info +.env \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2300d7a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Navod Peiris + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..31da908 --- /dev/null +++ b/README.md @@ -0,0 +1,427 @@ +


+
+### Run your IDE as administrator
+
+You will get the following error if your IDE does not have administrator permission:
+
+**OSError: [WinError 1314] A required privilege is not held by the client**
+
+### Requirements
+
+* Python 3.8 or greater
+
+### GPU execution
+
+GPU execution needs CUDA 11.
+
+GPU execution requires the following NVIDIA libraries to be installed:
+
+* [cuBLAS for CUDA 11](https://developer.nvidia.com/cublas)
+* [cuDNN 8 for CUDA 11](https://developer.nvidia.com/cudnn)
+
+There are multiple ways to install these libraries. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
+
+### Google Colab:
+
+On Google Colab, run this to install the CUDA dependencies:
+```
+!apt install libcublas11
+```
+
+You can see this example [notebook](https://colab.research.google.com/drive/1lpoWrHl5443LSnTG3vJQfTcg9oFiCQSz?usp=sharing)
+
+### Installation:
+```
+pip install speechlib
+```
+
+This library does speaker diarization, speaker recognition, and transcription on a single wav file to provide a transcript with actual speaker names. This library will also return an array containing result information. ⚙
+
+This library contains the following audio preprocessing functions:
+
+1. convert other audio formats to wav
+
+2. convert stereo wav file to mono
+
+3. re-encode the wav file to have 16-bit PCM encoding
+
+The Transcriptor takes 7 arguments.
+
+1. file to transcribe
+
+2. log_folder to store transcription
+
+3. language used for transcribing (language code is used)
+
+4. model size ("tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3")
+
+5. ACCESS_TOKEN (huggingface access token with permission to access pyannote/speaker-diarization@2.1)
+
+6. voices_folder (contains speaker voice samples for speaker recognition)
+
+7. quantization: this determines whether to use int8 quantization or not. Quantization may speed up the process but lower the accuracy.
+
+voices_folder should contain subfolders named with speaker names. Each subfolder belongs to a speaker and can contain many voice samples. This will be used for speaker recognition to identify the speaker.
+
+If voices_folder is not provided, speaker tags will be arbitrary.
+
+log_folder is where the final transcript is stored as a text file.
+
+The transcript will also indicate the timeframe in seconds where each speaker speaks.
+
+### Transcription example:
+
+```
+from speechlib import Transcriptor
+
+file = "obama_zach.wav"
+voices_folder = "voices"
+language = "en"
+log_folder = "logs"
+modelSize = "medium"
+quantization = False # setting this 'True' may speed up the process but lower the accuracy
+ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+
+transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
+
+# use faster-whisper (simply faster)
+res = transcriptor.faster_whisper()
+
+res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
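+
+# a minimal sketch of consuming the result (each entry is [start, end, text, speaker]):
+for start, end, text, speaker in res:
+    print(f"{speaker} ({start} : {end}) : {text}")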
+```
+
+#### If you don't want speaker names, keep voices_folder as an empty string ""
+
+start: starting time of speech in seconds
+end: ending time of speech in seconds
+text: transcribed text for speech during start and end
+speaker: speaker of the text
+
+#### voices_folder structure:
+
+![voices_folder_structure](voices_folder_structure1.png)
+
+#### Transcription:
+
+![transcription](transcript.png)
+
+supported language codes:
+
+```
+"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb", "ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "nn", "no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq", "sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi", "yi", "yo", "zh", "yue"
+```
+
+supported language names:
+
+```
+"Afrikaans", "Amharic", "Arabic", "Assamese", "Azerbaijani", "Bashkir", "Belarusian", "Bulgarian", "Bengali", "Tibetan", "Breton", "Bosnian", "Catalan", "Czech", "Welsh", "Danish", "German", "Greek", "English", "Spanish", "Estonian", "Basque", "Persian", "Finnish", "Faroese", "French", "Galician", "Gujarati", "Hausa", "Hawaiian", "Hebrew", "Hindi", "Croatian", "Haitian", "Hungarian", "Armenian", "Indonesian", "Icelandic", "Italian", "Japanese", "Javanese", "Georgian", "Kazakh", "Khmer", "Kannada", "Korean", "Latin", "Luxembourgish", "Lingala", "Lao", "Lithuanian", "Latvian", "Malagasy", "Maori", "Macedonian", "Malayalam", "Mongolian", "Marathi", "Malay", "Maltese", "Burmese", "Nepali", "Dutch", "Norwegian Nynorsk", "Norwegian", "Occitan", "Punjabi", "Polish", "Pashto", "Portuguese", "Romanian", "Russian", "Sanskrit", "Sindhi", "Sinhalese", "Slovak", "Slovenian", "Shona", "Somali", "Albanian", "Serbian", "Sundanese", "Swedish", "Swahili", "Tamil", "Telugu", "Tajik", "Thai", "Turkmen", "Tagalog", "Turkish", "Tatar", "Ukrainian", "Urdu", "Uzbek", "Vietnamese", "Yiddish", "Yoruba", "Chinese", "Cantonese"
+```
+
+### Audio preprocessing example:
+
+```
+from speechlib import PreProcessor
+
+file = "obama1.mp3"
+
+# initialize
+prep = PreProcessor()
+
+# convert mp3 to wav
+wav_file = prep.convert_to_wav(file)
+
+# convert wav file from stereo to mono
+prep.convert_to_mono(wav_file)
+
+# re-encode wav file to have 16-bit PCM encoding
+prep.re_encode(wav_file)
+```
+
+### Performance
+```
+These metrics are from Google Colab tests.
+These metrics do not take into account model download times.
+These metrics are done without quantization enabled.
+(quantization will make this even faster)
+
+metrics for faster-whisper "tiny" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 64s
+
+
+metrics for faster-whisper "small" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 95s
+
+
+metrics for faster-whisper "medium" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 193s
+
+
+metrics for faster-whisper "large" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 343s
+```
+
+#### Why we don't use pyannote/speaker-diarization-3.1, speechbrain >= 1.0.0, faster-whisper >= 1.0.0:
+
+Because the older versions give more accurate transcriptions. This was tested; compare the two transcripts below.
+
+with pyannote/speaker-diarization-3.1, speechbrain >= 1.0.0, faster-whisper >= 1.0.0:
+
+```
+zach (5.2 : 7.7) : Sorry, I had a canceled a few times, but...
+zach (9.0 : 12.0) : My mouse pad broke as we had to get my great answer.
+obama (12.0 : 22.2) : diabetes shoes and it's no problem. I mean, I have to say when I heard that people actually watch this show as I was actually pretty surprised.
+zach (23.2 : 27.7) : Hi, welcome to another edition of Between Two Furns. I'm your host, Zach Gallethonakis.
+zach (28.0 : 28.1) : You
+zach (28.5 : 31.8) : And my guest today is Barack Obama.
+zach (32.7 : 36.5) : President Barack Obama, good to be with you, Zack. First question.
+zach (36.9 : 39.2) : in 2013, part in the Turkey.
+zach (39.4 : 41.7) : What do you have planned for 2014?
+obama (42.7 : 49.9) : We'll probably pardon another turkey. We do that every Thanksgiving. Was that depressing to you seeing a one turkey?
+obama (50.3 : 53.0) : going to take an out of circulation, turkey you couldn't eat.
+zach (56.4 : 65.8) : So how does this work? Do you send a Ambassador Rodman to North Korea on your behalf? I'd read somewhere that you'd be sitting in Hulk Hogan, a Syria, or is that more of a job for Tony Harding?
+obama (66.4 : 68.0) : Exactly, he's not our ambassador.
+zach (68.5 : 68.5) :
+obama (68.5 : 69.2) : What should we do with that?
+zach (69.2 : 69.2) :
+obama (69.2 : 69.3) : Thank you very much for watching this video and I'll see you in the next video.
+zach (69.3 : 69.3) :
+obama (69.3 : 69.3) :
+zach (69.3 : 69.9) : and we'll take care.
+obama (71.5 : 72.0) : too.
+zach (72.0 : 72.0) :
+obama (72.0 : 72.1) : .
+zach (72.1 : 72.2) :
+obama (72.2 : 72.2) :
+zach (73.0 : 73.7) : while we move on.
+zach (75.5 : 78.2) : I have to know, what is it like to be the last black president?
+obama (78.7 : 79.4) : Seriously?
+obama (79.9 : 83.0) : What's it like for this to be the last time you ever talk to a prey?
+zach (82.8 : 87.2) : It must kind of stink though that you can't run three times.
+obama (86.8 : 89.2) : You know, actually I think it's a good idea.
+obama (89.4 : 89.9) : .
+obama (90.3 : 94.0) : If I ran a third time, it'd be sort of like doing a third hangover movie.
+obama (94.4 : 95.8) : didn't really work out very well did.
+obama (97.4 : 99.8) : Now I have to say that I've seen this show before and...
+obama (100.4 : 100.8) : of…
+obama (101.1 : 109.4) : Some of the episodes are probably a little bit better than this, you know, for example, the one with Bradley Cooper, that was a great brand.
She kind of carried that movie, didn't it? +zach (108.3 : 108.6) : . +zach (112.1 : 113.6) : Which, which film are you thinking of? +obama (114.1 : 118.7) : Those hangover movies, basically he carried them. +zach (118.7 : 120.0) : Yeah, everybody loves Bradley. +zach (120.3 : 121.0) : Good for him. +obama (121.3 : 122.1) : Good luck in guy. +zach (122.1 : 124.0) : being like that and how that's easy. +zach (124.3 : 128.1) : Paul handsomeets he'd be short fat and smell like Doritos and try to make it Hollywood. +zach (130.6 : 136.0) : Is it going to be hard in two years when you're no longer president and people will stop letting you win a basketball? +zach (136.3 : 138.5) : How does it feel having a 3 inch vertical? +zach (138.7 : 140.0) : It's a three inch horizontal. +zach (140.8 : 141.2) : So. +zach (144.0 : 148.5) : Where are you planning on building a presidential library in Hawaii or your home country of Kenya? Because... +zach (149.7 : 152.0) : I mean both places seem like they would be... +obama (152.0 : 153.4) : Exactly, that's a ridiculous question. +zach (153.4 : 153.5) : You +obama (153.5 : 153.5) : +zach (153.5 : 156.0) : Well, you know, I mean, not to bring up the birth certificate thing that you... +obama (156.0 : 160.1) : I really never did really produce you. Where's your verse? Why don't you show it to us right now? I don't know. +zach (157.5 : 158.0) : Where's your- +zach (160.0 : 162.4) : I don't want to show anybody my birth certificate because it's embarrassing. +obama (162.4 : 163.3) : What's embarrassing about it? +zach (163.3 : 163.3) : +zach (163.4 : 166.7) : My weight on it, it says that I was born seven pounds. +zach (167.1 : 168.1) : 800 ounces. +zach (170.0 : 171.9) : You know what I'll do if I were president, Mr. President. +zach (172.4 : 175.0) : I would make same-sex divorce illegal. +zach (175.4 : 176.7) : then see how bad they want it. +zach (177.7 : 178.4) : I think that's... +obama (179.1 : 179.1) : +zach (179.1 : 179.3) : What? +obama (179.3 : 179.5) : You +zach (179.5 : 180.0) : President. +obama (180.0 : 180.1) : You +zach (180.1 : 180.1) : +obama (180.1 : 180.1) : +obama (180.6 : 180.7) : +zach (180.7 : 181.0) : I'm not too +obama (181.0 : 181.4) : good thing. +zach (182.0 : 188.5) : You said if you had a son, you would not let him play football. What makes you think that he would want to play football? What if he was a nerd like you? +obama (189.1 : 191.1) : Do you think a woman like Michelle would marry a nerd? +obama (192.5 : 195.3) : I want you to ask her whether she thinks I'm a nerd. +obama (195.6 : 196.1) : Good night. +obama (196.4 : 197.9) : No, I'm not going to let her near you. +zach (199.6 : 205.8) : So do you go to any websites that are dot coms or dot nets or do you mainly just stick with dot gobs? +obama (206.7 : 210.5) : No, actually we go to dot-govs, have you heard of health care dot-gov? +zach (213.0 : 215.3) : Okay, let's get this out of the way. What did you come here to plug? +obama (216.0 : 221.5) : Well, first of all, I think it's fair to say that I wouldn't be with you here today if I didn't have something to plug. +obama (221.7 : 223.4) : Have you heard of the Affordable Care Act? +zach (223.6 : 228.1) : Oh, yeah, I heard about that. That's the thing that doesn't work. Why would you get the guy that created the zone? +zach (228.4 : 230.2) : to make your website. +obama (229.9 : 232.1) : healthcare.gov works great now. +obama (232.6 : 234.7) : and millions of Americans have already gotten. 
+obama (235.3 : 238.9) : health insurance plans and what we want is for people to know. +obama (239.4 : 250.6) : that you can get a formal health care. And most young Americans right now, they're not covered. And the truth is that they can get coverage all for what it cost you. +obama (251.0 : 253.0) : to pay yourself on them. +obama (253.1 : 257.5) : This is what they mean by drones. The point is that a lot of young people. +obama (257.6 : 258.9) : They think they're invincible. +obama (259.3 : 263.5) : Did you say it invisible? Because no, it's not like that's... +zach (262.5 : 263.1) : No, no. +zach (263.5 : 263.5) : +obama (263.5 : 263.6) : +zach (263.6 : 263.7) : up. +obama (263.7 : 266.0) : not invisible, invincible. +obama (266.6 : 269.2) : meaning that they don't think they can get hurt. +zach (269.2 : 272.1) : I'm just saying that nobody could be invisible if you had said invisible. +obama (272.1 : 272.9) : I understand that. +obama (274.2 : 279.0) : If they get that health insurance, it can really make a big difference, and they've got until March 31st to sign up. +zach (279.0 : 279.5) : I don't have a computer. +obama (279.5 : 279.6) : +zach (279.6 : 280.5) : So who had us? +obama (280.5 : 282.2) : Well then you can call 1-800. +obama (282.6 : 283.5) : 318. +obama (283.6 : 285.0) : 2596. +zach (285.0 : 289.8) : I don't have a phone, I'm off the grid, I don't want you people looking at my text if you know what I mean. +obama (290.8 : 300.9) : First of all, is that nobody's interested in your taxes. But second of all, you can do it in person. And the law means that insurers can't discriminate against you. If you've got to pre-existing condition anymore. +zach (300.9 : 301.6) : Yeah, but what about? +obama (303.1 : 303.1) : +zach (303.1 : 304.0) : What about this though? +obama (304.0 : 304.0) : +zach (304.0 : 304.1) : +obama (308.8 : 312.8) : That's the discussion, how long have you had it? I'll just four months. +zach (314.1 : 314.4) : Really? +obama (314.1 : 314.3) : So +obama (314.4 : 314.4) : +zach (314.4 : 315.6) : Spider-Bytes. +obama (316.3 : 317.5) : I got attacked by spiders. +obama (318.1 : 322.4) : Zack you you need to get that checked right away you need to get on health care.gov because +obama (323.6 : 325.6) : That's one of the most disgusting things I've ever seen. +zach (325.8 : 327.0) : is your plug finally over? +zach (327.4 : 328.1) : of +zach (330.0 : 333.0) : I suppose so. So which country were you rooting for and the winner Olympic? +obama (333.0 : 334.7) : x. Seriously? +zach (334.7 : 334.8) : You +obama (336.9 : 339.0) : I'm the president of the United States, what do you think, Zach? +zach (342.4 : 347.4) : I want to thank President Obama for being on the show. I'm going to press this. I don't touch that, please. +zach (354.6 : 359.6) : Thanks for the interview and thanks for letting me shoot my show here all these years. +obama (360.4 : 362.8) : You've been shooting these, these showers. +obama (363.4 : 364.5) : here in the diplomatic room. +obama (365.8 : 367.1) : Who gave you permission to do that? +obama (368.2 : 368.5) : Thank you very much. +obama (369.2 : 369.9) : Seriously? +obama (370.2 : 371.3) : Who gave him clearance? +unknown (377.7 : 378.5) : What's the spider bite? +obama (379.7 : 380.4) : That's the other hand. +unknown (380.7 : 381.8) : It's not all, it's everywhere. 
+ +``` + +with pyannote/speaker-diarization@2.1, speechbrain==0.5.16, faster-whisper==0.10.1: + +``` +zach (1.0 : 14.0) : Sorry, I had to cancel a few times, but my mouse pad broke, and I had to get my great +obama (7.1 : 7.2) : You +obama (14.0 : 22.3) : It's no problem. I mean, I have to say when I heard that people actually watch this show as I was actually pretty surprised. +zach (23.2 : 27.6) : Hi, welcome to another edition of Between Two Furns. I'm your host, Zach Gallethonakis. +zach (28.5 : 31.7) : And my guest today is Barack Obama. +zach (32.7 : 42.1) : President Barack Obama, good to be with you, Zack. First question. In 2013, he pardoned the Turkey. What do you have planned for 2014? +obama (41.5 : 41.6) : Thank you. +obama (42.6 : 53.0) : we'll probably pardon another turkey. We do that every Thanksgiving. Was that depressing to you seeing a one turkey taken out of circulation turkey you couldn't eat? +zach (56.4 : 70.0) : So how does this work? Do you send a ambassador Rodman to North Korea on your behalf? I'd read somewhere that you'd be sitting Hulk Hogan to Syria, or is that more of a job or Tonya Harding? Exactly, he's not our ambassador. What should we do about North IKEA? +zach (71.5 : 72.2) : Good. +obama (72.2 : 73.0) : Thank you. +zach (73.0 : 73.8) : We will move on. +zach (75.5 : 79.0) : I have to know, what is it like to be the last black president? +obama (78.7 : 83.1) : Seriously, what's it like for this to be the last time you ever talk to a president? +zach (82.8 : 87.1) : It must kind of stink though that you can't run three times. +obama (87.0 : 95.9) : Actually, I think it's a good idea, if I ran a third time it'd be sort of like doing a third hangover movie, it didn't really work out very well. +obama (97.4 : 99.8) : Now I have to say that I've seen this show before and +obama (100.4 : 109.5) : Some of the episodes are probably a little bit better than this, you know, for example, the one with Bradley Cooper, that was a great brand. She kind of carried that movie, didn't he? +zach (108.2 : 108.7) : as you kind of care. +zach (112.1 : 113.7) : which film are you seeing us? +obama (114.1 : 119.0) : Those hangover movies, basically he carried them. +zach (118.6 : 121.4) : Yeah, everybody loves Bradley. Good for him. +obama (121.4 : 122.1) : Good luck, guys. +zach (122.1 : 128.1) : being like that in Hollywood. That's easy. Paul Hansome, that'd be short fat and smell like Doritos and try to make it Hollywood. +zach (130.6 : 141.5) : Is it going to be hard in two years when you're no longer president and people will stop letting you win a basketball? How does a field have in a three inch vertical? It's a three inch horizontal, so. +zach (144.1 : 148.5) : Where are you planning on building a presidential library in Hawaii or your home country of Kenya? Because +zach (149.7 : 158.0) : I mean, both places seem like they would be- Exactly, that's a ridiculous question. Well, you know, I mean, not to bring up the first thing that you really never did really produce your real- Where's your- +obama (157.5 : 160.0) : Where's your verse to why don't you show it to us right now? +zach (160.0 : 168.1) : I don't want to show anybody my birth certificate because it's embarrassing. What's embarrassing about it? My weight on it, it says that I was born seven pounds, 800 ounces. +obama (162.9 : 163.3) : Thank you very much. +zach (170.0 : 176.7) : You know what I would do if I were president, Mr. President? I would make same-sex divorce illegal. Then see how bad they want it. 
+obama (177.7 : 181.0) : I think that's why you're not President. And that's a good thing. +zach (181.0 : 189.3) : You said if you had a son, you would not let him play football. What makes you think that he would want to play football? What if he was a nerd like you? Do you think he would want to play football? +obama (189.3 : 191.1) : I think a woman like Michelle would marry a nerd. +obama (192.5 : 197.9) : Why don't you ask her whether she thinks I'm a nerd? Could I? No, I'm not going to let her near you. +zach (199.6 : 205.8) : So, do you go to any websites that are dot coms or dot nets or do you mainly just stick with dot gobs? +zach (206.8 : 206.8) : +obama (206.8 : 210.6) : No, actually we go to dot gobs, if you heard of health care.gov. +zach (211.3 : 211.6) : Thank you. +zach (213.0 : 216.0) : OK, let's get this out of the way. What did you come here to plug? +obama (216.0 : 223.0) : Well, first of all, I think it's fair to say that I wouldn't be with you here today if I didn't have something to plug. Have you heard of the Affordable Care +zach (222.8 : 230.4) : correct. Oh, yeah, I heard about that. That's the thing that doesn't work. Why would you get the guy that created the Zoom to make your website? Healthcare. +obama (229.9 : 234.7) : healthcare.gov works great now and millions of Americans have already gotten +obama (235.3 : 253.1) : health insurance plans, and what we want is for people to know that you can get a affordable health care, and most young Americans right now, they're not covered. And the truth is that they can get coverage all for what it cost you to pay yourself on them. +zach (253.1 : 254.0) : This is what they mean, but +obama (254.0 : 259.3) : drones, the point is that a lot of young people, they think they're invincible. +zach (259.3 : 264.5) : Did you say it invisible? No, no, that's a problem. No, no, that's a problem. +obama (262.4 : 269.8) : No, no, that's a problem. Not invisible, invincible. Meaning that they don't think they can get hurt. I'm just kidding. +zach (269.4 : 272.8) : that nobody could be invisible if you had said invisible. I understand that. +obama (272.1 : 273.0) : I understand that. +obama (274.2 : 285.3) : If they get that health insurance, it can really make a big difference. And they've got till March 31st to sign up. I don't have a computer. So how does, well, then you can call 1-800-318-2596. +zach (285.2 : 289.9) : I don't have a phone, I'm off the grid, I don't want you people looking at my text if you know what I mean. +obama (290.8 : 300.8) : First of all, there's nobody's interest in your taxes, but second of all, you can do it in person. And the law means that insurers can't discriminate against you if you've got to pre-existing condition. +zach (300.8 : 301.7) : Yeah, but what about? +zach (303.1 : 304.1) : What about this though? +obama (308.8 : 312.9) : That's the disgusting, how long have you had it? I'll just four months. +zach (312.5 : 312.8) : +zach (314.1 : 316.5) : Really? Spider-Bytes. +obama (314.1 : 314.5) : Really? +obama (316.5 : 322.5) : I tacked my spiders. Zack, you need to get that checked right away. You need to get on healthcare.gov because +obama (323.6 : 328.1) : That's one of the most disgusting things I've ever seen. Is your plug finally over? +obama (330.0 : 330.9) : I, I suppose so. +zach (330.9 : 334.7) : So which country were you rooting for in the winter Olympics? Seriously? +obama (334.7 : 334.9) : +obama (336.9 : 339.1) : I'm the president of the United States. What do you think, Zach? 
+zach (342.3 : 347.3) : I want to thank President Obama for being on the show. I'm going to press this. I don't touch that, please.
+zach (354.6 : 359.6) : Thanks for the interview and thanks for letting me shoot my show here all these years.
+obama (360.5 : 364.7) : You've been shooting these shows here in the diplomatic room.
+obama (365.9 : 367.2) : Who gave you permission to do that?
+obama (368.2 : 368.5) : Thank you very much.
+obama (369.2 : 371.3) : Seriously? Who gave him clearance?
+zach (377.7 : 378.6) : What's the spider bite?
+obama (379.7 : 380.6) : That's the other hand.
+zach (380.6 : 383.0) : It's not all, it's everywhere.
+
+```
+
+This library uses the following huggingface models:
+
+#### https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
+#### https://huggingface.co/Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2
+#### https://huggingface.co/pyannote/speaker-diarization
\ No newline at end of file
diff --git a/examples/.gitignore b/examples/.gitignore
new file mode 100644
index 0000000..b54121b
--- /dev/null
+++ b/examples/.gitignore
@@ -0,0 +1,7 @@
+example1.wav
+temp
+segments
+pretrained_models
+audio_cache
+__pycache__
+logs
diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..26b2ef6
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,3 @@
+##### Run transcribe.py for transcribing an audio file
+
+##### Run preprocess.py for preprocessing an audio file
\ No newline at end of file
diff --git a/examples/obama1.mp3 b/examples/obama1.mp3
new file mode 100644
index 0000000..d7f3268
Binary files /dev/null and b/examples/obama1.mp3 differ
diff --git a/examples/obama1.wav b/examples/obama1.wav
new file mode 100644
index 0000000..0d7b764
Binary files /dev/null and b/examples/obama1.wav differ
diff --git a/examples/obama_zach.wav b/examples/obama_zach.wav
new file mode 100644
index 0000000..4718c59
Binary files /dev/null and b/examples/obama_zach.wav differ
diff --git a/examples/preprocess.py b/examples/preprocess.py
new file mode 100644
index 0000000..f124bd2
--- /dev/null
+++ b/examples/preprocess.py
@@ -0,0 +1,13 @@
+from speechlib import PreProcessor
+
+file = "obama1.mp3"
+# initialize
+prep = PreProcessor()
+# convert mp3 to wav
+wav_file = prep.convert_to_wav(file)
+
+# convert wav file from stereo to mono
+prep.convert_to_mono(wav_file)
+
+# re-encode wav file to have 16-bit PCM encoding
+prep.re_encode(wav_file)
\ No newline at end of file
diff --git a/examples/transcribe.py b/examples/transcribe.py
new file mode 100644
index 0000000..f6a0af0
--- /dev/null
+++ b/examples/transcribe.py
@@ -0,0 +1,18 @@
+from speechlib import Transcriptor
+
+file = "obama1.wav" # your audio file
+voices_folder = "voices" # voices folder containing voice samples for recognition
+language = "en" # language code
+log_folder = "logs" # log folder for storing transcripts
+modelSize = "tiny" # size of model to be used [tiny, small, medium, large-v1, large-v2, large-v3]
+quantization = False # setting this 'True' may speed up the process but lower the accuracy
+ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+
+# quantization only works on faster-whisper
+transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
+
+# use normal whisper
+res = transcriptor.whisper()
+
+# use faster-whisper (simply faster)
+res = transcriptor.faster_whisper()
diff --git a/examples/voices/obama/obama1.wav b/examples/voices/obama/obama1.wav
new file mode
100644
index 0000000..3732744
Binary files /dev/null and b/examples/voices/obama/obama1.wav differ
diff --git a/examples/voices/obama/obama2.wav b/examples/voices/obama/obama2.wav
new file mode 100644
index 0000000..d29b752
Binary files /dev/null and b/examples/voices/obama/obama2.wav differ
diff --git a/examples/voices/zach/zach1.wav b/examples/voices/zach/zach1.wav
new file mode 100644
index 0000000..83423a1
Binary files /dev/null and b/examples/voices/zach/zach1.wav differ
diff --git a/examples/voices/zach/zach2.wav b/examples/voices/zach/zach2.wav
new file mode 100644
index 0000000..e7dda8d
Binary files /dev/null and b/examples/voices/zach/zach2.wav differ
diff --git a/library.md b/library.md
new file mode 100644
index 0000000..5ef1c7d
--- /dev/null
+++ b/library.md
@@ -0,0 +1,178 @@
+### Requirements
+
+* Python 3.8 or greater
+
+### GPU execution
+
+GPU execution needs CUDA 11.
+
+GPU execution requires the following NVIDIA libraries to be installed:
+
+* [cuBLAS for CUDA 11](https://developer.nvidia.com/cublas)
+* [cuDNN 8 for CUDA 11](https://developer.nvidia.com/cudnn)
+
+There are multiple ways to install these libraries. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
+
+### Google Colab:
+
+On Google Colab, run this to install the CUDA dependencies:
+```
+!apt install libcublas11
+```
+
+You can see this example [notebook](https://colab.research.google.com/drive/1lpoWrHl5443LSnTG3vJQfTcg9oFiCQSz?usp=sharing)
+
+### Installation:
+```
+pip install speechlib
+```
+
+This library does speaker diarization, speaker recognition, and transcription on a single wav file to provide a transcript with actual speaker names. This library will also return an array containing result information. ⚙
+
+This library contains the following audio preprocessing functions:
+
+1. convert mp3 to wav
+
+2. convert stereo wav file to mono
+
+3. re-encode the wav file to have 16-bit PCM encoding
+
+The Transcriptor takes 7 arguments.
+
+1. file to transcribe
+
+2. log_folder to store transcription
+
+3. language used for transcribing (language code is used)
+
+4. model size ("tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3")
+
+5. ACCESS_TOKEN (huggingface access token with permission to access pyannote/speaker-diarization@2.1)
+
+6. voices_folder (contains speaker voice samples for speaker recognition)
+
+7. quantization: this determines whether to use int8 quantization or not. Quantization may speed up the process but lower the accuracy.
+
+voices_folder should contain subfolders named with speaker names. Each subfolder belongs to a speaker and can contain many voice samples. This will be used for speaker recognition to identify the speaker.
+
+If voices_folder is not provided, speaker tags will be arbitrary.
+
+log_folder is where the final transcript is stored as a text file.
+
+The transcript will also indicate the timeframe in seconds where each speaker speaks.
+
+### Transcription example:
+
+```
+from speechlib import Transcriptor
+
+file = "obama_zach.wav"
+voices_folder = "voices"
+language = "en"
+log_folder = "logs"
+modelSize = "medium"
+quantization = False # setting this 'True' may speed up the process but lower the accuracy
+ACCESS_TOKEN = "your huggingface access token" # get permission to access pyannote/speaker-diarization@2.1 on huggingface
+
+transcriptor = Transcriptor(file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder, quantization)
+
+# use faster-whisper (simply faster)
+res = transcriptor.faster_whisper()
+
+res --> [["start", "end", "text", "speaker"], ["start", "end", "text", "speaker"]...]
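+
+# a minimal sketch of consuming the result (each entry is [start, end, text, speaker]):
+for start, end, text, speaker in res:
+    print(f"{speaker} ({start} : {end}) : {text}")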
+```
+
+start: starting time of speech in seconds
+end: ending time of speech in seconds
+text: transcribed text for speech during start and end
+speaker: speaker of the text
+
+voices_folder structure:
+```
+voices_folder
+|---> person1
+|     |---> sample1.wav
+|     |---> sample2.wav
+|     ...
+|
+|---> person2
+|     |---> sample1.wav
+|     |---> sample2.wav
+|     ...
+|--> ...
+```
+
+supported language codes:
+
+```
+"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi", "hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb", "ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "nn", "no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq", "sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi", "yi", "yo", "zh", "yue"
+```
+
+supported language names:
+
+```
+"Afrikaans", "Amharic", "Arabic", "Assamese", "Azerbaijani", "Bashkir", "Belarusian", "Bulgarian", "Bengali", "Tibetan", "Breton", "Bosnian", "Catalan", "Czech", "Welsh", "Danish", "German", "Greek", "English", "Spanish", "Estonian", "Basque", "Persian", "Finnish", "Faroese", "French", "Galician", "Gujarati", "Hausa", "Hawaiian", "Hebrew", "Hindi", "Croatian", "Haitian", "Hungarian", "Armenian", "Indonesian", "Icelandic", "Italian", "Japanese", "Javanese", "Georgian", "Kazakh", "Khmer", "Kannada", "Korean", "Latin", "Luxembourgish", "Lingala", "Lao", "Lithuanian", "Latvian", "Malagasy", "Maori", "Macedonian", "Malayalam", "Mongolian", "Marathi", "Malay", "Maltese", "Burmese", "Nepali", "Dutch", "Norwegian Nynorsk", "Norwegian", "Occitan", "Punjabi", "Polish", "Pashto", "Portuguese", "Romanian", "Russian", "Sanskrit", "Sindhi", "Sinhalese", "Slovak", "Slovenian", "Shona", "Somali", "Albanian", "Serbian", "Sundanese", "Swedish", "Swahili", "Tamil", "Telugu", "Tajik", "Thai", "Turkmen", "Tagalog", "Turkish", "Tatar", "Ukrainian", "Urdu", "Uzbek", "Vietnamese", "Yiddish", "Yoruba", "Chinese", "Cantonese"
+```
+
+### Audio preprocessing example:
+
+```
+from speechlib import PreProcessor
+
+file = "obama1.mp3"
+
+# initialize
+prep = PreProcessor()
+
+# convert mp3 to wav
+wav_file = prep.convert_to_wav(file)
+
+# convert wav file from stereo to mono
+prep.convert_to_mono(wav_file)
+
+# re-encode wav file to have 16-bit PCM encoding
+prep.re_encode(wav_file)
+```
+
+### Performance
+```
+These metrics are from Google Colab tests.
+These metrics do not take into account model download times.
+These metrics are done without quantization enabled.
+(quantization will make this even faster)
+
+metrics for faster-whisper "tiny" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 64s
+
+
+metrics for faster-whisper "small" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 95s
+
+
+metrics for faster-whisper "medium" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 193s
+
+
+metrics for faster-whisper "large" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 343s
+```
+
+
+This library uses the following huggingface models:
+
+#### https://huggingface.co/speechbrain/spkrec-ecapa-voxceleb
+#### https://huggingface.co/Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2
+#### https://huggingface.co/pyannote/speaker-diarization
\ No newline at end of file
diff --git a/metrics.txt b/metrics.txt
new file mode 100644
index 0000000..3eb8509
--- /dev/null
+++ b/metrics.txt
@@ -0,0 +1,39 @@
+These metrics are from Google Colab tests.
+These metrics do not take into account model download times.
+These metrics are done without quantization enabled.
+(quantization will make this even faster)
+
+metrics for faster-whisper "tiny" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 64s
+
+
+metrics for faster-whisper "small" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 95s
+
+
+metrics for faster-whisper "medium" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 193s
+
+
+metrics for faster-whisper "large" model:
+    on gpu:
+        audio name: obama_zach.wav
+        duration: 6 min 36 s
+        diarization time: 24s
+        speaker recognition time: 10s
+        transcription time: 343s
\ No newline at end of file
diff --git a/pyannote-audio_LICENSE b/pyannote-audio_LICENSE
new file mode 100644
index 0000000..9712e8a
--- /dev/null
+++ b/pyannote-audio_LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2020 CNRS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ef661ee
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+transformers
+torch
+torchaudio
+pydub
+pyannote.audio
+speechbrain
+accelerate
+faster-whisper
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..9b08241
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,24 @@
+from setuptools import find_packages, setup
+
+with open("library.md", "r") as f:
+    long_description = f.read()
+
+setup(
+    name="speechlib",
+    version="1.1.0",
+    description="speechlib is a library that can do speaker diarization, transcription and speaker recognition on an audio file to create transcripts with actual speaker names. This library also contains audio preprocessor functions.",
+    packages=find_packages(),
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/NavodPeiris/speechlib",
+    author="Navod Peiris",
+    author_email="navodpeiris1234@gmail.com",
+    license="MIT",
+    classifiers=[
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3.10",
+        "Operating System :: OS Independent",
+    ],
+    install_requires=["transformers==4.36.2", "torch==2.1.2", "torchaudio==2.1.2", "pydub==0.25.1", "pyannote.audio==3.1.1", "speechbrain==0.5.16", "accelerate==0.26.1", "faster-whisper==0.10.1", "openai-whisper==20231117"],
+    python_requires=">=3.8",
+)
\ No newline at end of file
diff --git a/setup_instruction.md b/setup_instruction.md
new file mode 100644
index 0000000..fbf0bee
--- /dev/null
+++ b/setup_instruction.md
@@ -0,0 +1,19 @@
+for building the package:
+  pip install setuptools
+  pip install wheel
+
+on root:
+  python setup.py sdist bdist_wheel
+
+for publishing:
+  pip install twine
+
+to install locally for testing:
+  pip install dist/speechlib-1.1.0-py3-none-any.whl
+
+finally run:
+  twine upload dist/*
+
+  fill as follows:
+  username: __token__
+  password: {your token value}
\ No newline at end of file
diff --git a/speechbrain_LICENSE b/speechbrain_LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/speechbrain_LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/speechlib.png b/speechlib.png
new file mode 100644
index 0000000..6698605
Binary files /dev/null and b/speechlib.png differ
diff --git a/speechlib/__init__.py b/speechlib/__init__.py
new file mode 100644
index 0000000..4e20dea
--- /dev/null
+++ b/speechlib/__init__.py
@@ -0,0 +1,4 @@
+from .speechlib import(
+    Transcriptor,
+    PreProcessor
+)
diff --git a/speechlib/convert_to_mono.py b/speechlib/convert_to_mono.py
new file mode 100644
index 0000000..4d6b02b
--- /dev/null
+++ b/speechlib/convert_to_mono.py
@@ -0,0 +1,31 @@
+import wave
+import numpy as np
+
+def convert_to_mono(input_wav):
+    # Open the input WAV file
+    with wave.open(input_wav, 'rb') as input_file:
+        # Get the parameters of the input file
+        params = input_file.getparams()
+
+        # Check if the file is stereo
+        if params.nchannels > 1:
+            # Read the audio data
+            frames = input_file.readframes(-1)
+            audio_data = np.frombuffer(frames, dtype=np.int16)
+
+            # Take the average of the channels to convert to mono
+            mono_audio_data = np.mean(audio_data.reshape(-1, params.nchannels), axis=1)
+
+            # Create a new WAV file for mono audio
+            with wave.open(input_wav, 'wb') as output_file:
+                # Set the parameters for the output file
+                output_file.setparams((1, params.sampwidth, params.framerate, len(mono_audio_data), params.comptype, params.compname))
+
+                # Write the mono audio data to the output file
+                output_file.writeframes(np.int16(mono_audio_data))
+
+            print(f'{input_wav} converted to mono')
+        else:
+            print(f'{input_wav} is already a mono audio file.')
+
+
diff --git a/speechlib/convert_to_wav.py b/speechlib/convert_to_wav.py
new file mode 100644
index 0000000..7b7aecd
--- /dev/null
+++ b/speechlib/convert_to_wav.py
@@ -0,0 +1,22 @@
+from pydub import AudioSegment
+import os
+
+def convert_to_wav(input_file):
+    # Check if the file is already in WAV format
+    if input_file.lower().endswith(".wav"):
+        print(f"{input_file} is already in WAV format.")
+        return input_file
+
+    # Load the input audio file using pydub
+    audio = AudioSegment.from_file(input_file)
+
+    # Create the output WAV file path
+    wav_path = os.path.splitext(input_file)[0] + ".wav"
+
+    # Export the audio to WAV
+    audio.export(wav_path, format="wav")
+
+    print(f"{input_file} has been converted to WAV format.")
+
+    return wav_path
+
diff --git a/speechlib/core_analysis.py b/speechlib/core_analysis.py
new file mode 100644
index 0000000..afbbaea
--- /dev/null
+++ b/speechlib/core_analysis.py
@@ -0,0 +1,139 @@
+import os
+from pyannote.audio import Pipeline
+import time
+from .wav_segmenter import (wav_file_segmentation)
+import torch, torchaudio
+
+from .speaker_recognition import (speaker_recognition)
+from .write_log_file import (write_log_file)
+
+from .re_encode import (re_encode)
+from .convert_to_mono import (convert_to_mono)
+from .convert_to_wav import (convert_to_wav)
+
+# whisper_type selects the transcription backend: "whisper" or "faster-whisper"
+# for sinhala ("si"), a finetuned whisper model is used regardless of whisper_type
+def core_analysis(file_name, voices_folder, log_folder, language, modelSize, ACCESS_TOKEN, whisper_type, quantization=False):
+
+    # <-------------------PreProcessing file-------------------------->
+
+    # check if file is in wav format, if not convert to wav
+    file_name = convert_to_wav(file_name)
+
+    # convert file to mono
+    convert_to_mono(file_name)
+
+    # re-encode file to 16-bit PCM encoding
+    re_encode(file_name)
+
+    # <--------------------running analysis--------------------------->
+
+    speaker_tags = []
+
+    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
+
use_auth_token=ACCESS_TOKEN) + + if torch.cuda.is_available(): + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' + else: + device = torch.device("cpu") + + pipeline.to(device) + waveform, sample_rate = torchaudio.load(file_name) + + start_time = int(time.time()) + print("running diarization...") + diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate}, min_speakers=0, max_speakers=10) + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"diarization done. Time taken: {elapsed_time} seconds.") + + speakers = {} + + common = [] + + # create a dictionary of SPEAKER_XX to real name mappings + speaker_map = {} + + for turn, _, speaker in diarization.itertracks(yield_label=True): + + start = round(turn.start, 1) + end = round(turn.end, 1) + common.append([start, end, speaker]) + + # find different speakers + if speaker not in speaker_tags: + speaker_tags.append(speaker) + speaker_map[speaker] = speaker + speakers[speaker] = [] + + speakers[speaker].append([start, end, speaker]) + + if voices_folder != None and voices_folder != "": + identified = [] + + start_time = int(time.time()) + print("running speaker recognition...") + for spk_tag, spk_segments in speakers.items(): + spk_name = speaker_recognition(file_name, voices_folder, spk_segments, identified) + spk = spk_name + identified.append(spk) + speaker_map[spk_tag] = spk + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"speaker recognition done. Time taken: {elapsed_time} seconds.") + + keys_to_remove = [] + merged = [] + + # merging same speakers + for spk_tag1, spk_segments1 in speakers.items(): + for spk_tag2, spk_segments2 in speakers.items(): + if spk_tag1 not in merged and spk_tag2 not in merged and spk_tag1 != spk_tag2 and speaker_map[spk_tag1] == speaker_map[spk_tag2]: + for segment in spk_segments2: + speakers[spk_tag1].append(segment) + + merged.append(spk_tag1) + merged.append(spk_tag2) + keys_to_remove.append(spk_tag2) + + # fixing the speaker names in common + for segment in common: + speaker = segment[2] + segment[2] = speaker_map[speaker] + + for key in keys_to_remove: + del speakers[key] + del speaker_map[key] + + # transcribing the texts differently according to speaker + start_time = int(time.time()) + print("running transcription...") + for spk_tag, spk_segments in speakers.items(): + spk = speaker_map[spk_tag] + segment_out = wav_file_segmentation(file_name, spk_segments, language, modelSize, whisper_type, quantization) + speakers[spk_tag] = segment_out + end_time = int(time.time()) + elapsed_time = int(end_time - start_time) + print(f"transcription done. 
Time taken: {elapsed_time} seconds.") + + common_segments = [] + + for item in common: + speaker = item[2] + start = item[0] + end = item[1] + + for spk_tag, spk_segments in speakers.items(): + if speaker == speaker_map[spk_tag]: + for segment in spk_segments: + if start == segment[0] and end == segment[1]: + common_segments.append([start, end, segment[2], speaker]) + + # writing log file + write_log_file(common_segments, log_folder, file_name, language) + + return common_segments diff --git a/speechlib/re_encode.py b/speechlib/re_encode.py new file mode 100644 index 0000000..f34dc4a --- /dev/null +++ b/speechlib/re_encode.py @@ -0,0 +1,37 @@ +import wave +import struct + +def re_encode(file_name): + + with wave.open(file_name, 'rb') as original_file: + + # Get the original audio parameters + params = original_file.getparams() + + # Check if the sample width is already 16-bit + if params.sampwidth == 2: + + print("The file already has 16-bit samples.") + + elif params.sampwidth == 1: + + # Open a new WAV file with 16-bit samples + file_name = file_name + '_16bit.wav' + + with wave.open(file_name, 'wb') as new_file: + # Set the new audio parameters + new_file.setparams(params) + new_file.setsampwidth(2) + new_file.setnchannels(1) + + # Read and convert each sample + for _ in range(params.nframes): + sample = original_file.readframes(1) + sample_value = struct.unpack("= max_score: + max_score = score + speakerId = speaker.split(".")[0] + if speakerId not in wildcards: # speaker_00 cannot be speaker_01 + person = speakerId + except: + pass + + Id_count[person] += 1 + + # Delete the WAV file after processing + os.remove(file) + + current_pred = max(Id_count, key=Id_count.get) + + duration += (end - start) + if duration >= limit and current_pred != "unknown": + break + + most_common_Id = max(Id_count, key=Id_count.get) + return most_common_Id + diff --git a/speechlib/speechlib.py b/speechlib/speechlib.py new file mode 100644 index 0000000..a52b0e4 --- /dev/null +++ b/speechlib/speechlib.py @@ -0,0 +1,268 @@ +from .core_analysis import (core_analysis) +from .re_encode import (re_encode) +from .convert_to_mono import (convert_to_mono) +from .convert_to_wav import (convert_to_wav) + +class Transcriptor: + + def __init__(self, file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder=None, quantization=False): + '''transcribe a wav file + + arguments: + + file: name of wav file with extension ex: file.wav + + log_folder: name of folder where transcript will be stored + + language: language of wav file + + modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (bigger model is more accurate but slow!!) + + voices_folder: folder containing subfolders named after each speaker with speaker voice samples in them. 
diff --git a/speechlib/speechlib.py b/speechlib/speechlib.py
new file mode 100644
index 0000000..a52b0e4
--- /dev/null
+++ b/speechlib/speechlib.py
@@ -0,0 +1,268 @@
+from .core_analysis import (core_analysis)
+from .re_encode import (re_encode)
+from .convert_to_mono import (convert_to_mono)
+from .convert_to_wav import (convert_to_wav)
+
+class Transcriptor:
+
+    def __init__(self, file, log_folder, language, modelSize, ACCESS_TOKEN, voices_folder=None, quantization=False):
+        '''transcribe a wav file
+
+        arguments:
+
+        file: name of the wav file, with extension (ex: file.wav)
+
+        log_folder: name of the folder where the transcript will be stored
+
+        language: language of the wav file
+
+        modelSize: tiny, small, medium, large, large-v1, large-v2, large-v3 (a bigger model is more accurate but slower)
+
+        ACCESS_TOKEN: huggingface access token, used to load the pyannote diarization pipeline
+
+        voices_folder: folder containing subfolders named after each speaker, with voice samples of that speaker in them. This will be used for speaker recognition
+
+        quantization: whether to use int8 quantization or not (default=False)
+
+        see documentation: https://github.com/Navodplayer1/speechlib
+
+        supported languages:
+
+        "af",   # Afrikaans
+        "am",   # Amharic
+        "ar",   # Arabic
+        "as",   # Assamese
+        "az",   # Azerbaijani
+        "ba",   # Bashkir
+        "be",   # Belarusian
+        "bg",   # Bulgarian
+        "bn",   # Bengali
+        "bo",   # Tibetan
+        "br",   # Breton
+        "bs",   # Bosnian
+        "ca",   # Catalan
+        "cs",   # Czech
+        "cy",   # Welsh
+        "da",   # Danish
+        "de",   # German
+        "el",   # Greek
+        "en",   # English
+        "es",   # Spanish
+        "et",   # Estonian
+        "eu",   # Basque
+        "fa",   # Persian
+        "fi",   # Finnish
+        "fo",   # Faroese
+        "fr",   # French
+        "gl",   # Galician
+        "gu",   # Gujarati
+        "ha",   # Hausa
+        "haw",  # Hawaiian
+        "he",   # Hebrew
+        "hi",   # Hindi
+        "hr",   # Croatian
+        "ht",   # Haitian
+        "hu",   # Hungarian
+        "hy",   # Armenian
+        "id",   # Indonesian
+        "is",   # Icelandic
+        "it",   # Italian
+        "ja",   # Japanese
+        "jw",   # Javanese
+        "ka",   # Georgian
+        "kk",   # Kazakh
+        "km",   # Khmer
+        "kn",   # Kannada
+        "ko",   # Korean
+        "la",   # Latin
+        "lb",   # Luxembourgish
+        "ln",   # Lingala
+        "lo",   # Lao
+        "lt",   # Lithuanian
+        "lv",   # Latvian
+        "mg",   # Malagasy
+        "mi",   # Maori
+        "mk",   # Macedonian
+        "ml",   # Malayalam
+        "mn",   # Mongolian
+        "mr",   # Marathi
+        "ms",   # Malay
+        "mt",   # Maltese
+        "my",   # Burmese
+        "ne",   # Nepali
+        "nl",   # Dutch
+        "nn",   # Norwegian Nynorsk
+        "no",   # Norwegian
+        "oc",   # Occitan
+        "pa",   # Punjabi
+        "pl",   # Polish
+        "ps",   # Pashto
+        "pt",   # Portuguese
+        "ro",   # Romanian
+        "ru",   # Russian
+        "sa",   # Sanskrit
+        "sd",   # Sindhi
+        "si",   # Sinhalese
+        "sk",   # Slovak
+        "sl",   # Slovenian
+        "sn",   # Shona
+        "so",   # Somali
+        "sq",   # Albanian
+        "sr",   # Serbian
+        "su",   # Sundanese
+        "sv",   # Swedish
+        "sw",   # Swahili
+        "ta",   # Tamil
+        "te",   # Telugu
+        "tg",   # Tajik
+        "th",   # Thai
+        "tk",   # Turkmen
+        "tl",   # Tagalog
+        "tr",   # Turkish
+        "tt",   # Tatar
+        "uk",   # Ukrainian
+        "ur",   # Urdu
+        "uz",   # Uzbek
+        "vi",   # Vietnamese
+        "yi",   # Yiddish
+        "yo",   # Yoruba
+        "zh",   # Chinese
+        "yue",  # Cantonese
+        '''
+        self.file = file
+        self.voices_folder = voices_folder
+        self.language = language
+        self.log_folder = log_folder
+        self.modelSize = modelSize
+        self.quantization = quantization
+        self.ACCESS_TOKEN = ACCESS_TOKEN
+
+    def whisper(self):
+        res = core_analysis(self.file, self.voices_folder, self.log_folder, self.language, self.modelSize, self.ACCESS_TOKEN, "whisper", self.quantization)
+        return res
+
+    def faster_whisper(self):
+        res = core_analysis(self.file, self.voices_folder, self.log_folder, self.language, self.modelSize, self.ACCESS_TOKEN, "faster-whisper", self.quantization)
+        return res
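A minimal usage sketch for the Transcriptor class above; the file names are illustrative and `"your_huggingface_token"` stands in for a real huggingface access token:

```
from speechlib import Transcriptor

# positional order matches __init__: file, log_folder, language, modelSize, ACCESS_TOKEN
transcriptor = Transcriptor("obama_zach.wav", "logs", "en", "medium",
                            "your_huggingface_token", voices_folder="voices")

# returns [[start, end, text, speaker], ...]; whisper() works the same way
res = transcriptor.faster_whisper()
```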
+class PreProcessor:
+    '''
+    class for preprocessing audio files.
+
+    methods:
+
+    re_encode(file) -> re-encode file to 16-bit PCM encoding
+
+    convert_to_mono(file) -> convert file from stereo to mono
+
+    convert_to_wav(file) -> convert an audio file (ex: mp3) to wav format and return the new file path
+    '''
+
+    def re_encode(self, file):
+        re_encode(file)
+
+    def convert_to_mono(self, file):
+        convert_to_mono(file)
+
+    def convert_to_wav(self, file):
+        path = convert_to_wav(file)
+        return path
diff --git a/speechlib/transcribe.py b/speechlib/transcribe.py
new file mode 100644
index 0000000..0df235b
--- /dev/null
+++ b/speechlib/transcribe.py
@@ -0,0 +1,44 @@
+import torch
+from .whisper_sinhala import (whisper_sinhala)
+from faster_whisper import WhisperModel
+import whisper
+
+def transcribe(file, language, model_size, whisper_type, quantization):
+    res = ""
+    if language in ["si", "Si"]:
+        res = whisper_sinhala(file)
+        return res
+    elif model_size in ["base", "tiny", "small", "medium", "large", "large-v1", "large-v2", "large-v3"]:
+        if whisper_type == "faster-whisper":
+            if torch.cuda.is_available():
+                if quantization:
+                    model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+                else:
+                    model = WhisperModel(model_size, device="cuda", compute_type="float16")
+            else:
+                if quantization:
+                    model = WhisperModel(model_size, device="cpu", compute_type="int8")
+                else:
+                    model = WhisperModel(model_size, device="cpu", compute_type="float32")
+
+            if language in model.supported_languages:
+                segments, info = model.transcribe(file, language=language, beam_size=5)
+
+                for segment in segments:
+                    res += segment.text + " "
+
+                return res
+            else:
+                raise Exception("Language code not supported.\nThese are the supported languages:\n", model.supported_languages)
+        else:
+            try:
+                model = whisper.load_model(model_size)
+                result = model.transcribe(file, language=language)
+                res = result["text"]
+
+                return res
+            except Exception as err:
+                print("an error occurred while transcribing: ", err)
+    else:
+        raise Exception("only 'base', 'tiny', 'small', 'medium', 'large', 'large-v1', 'large-v2', 'large-v3' models are available.")
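The nested device/quantization branches in `transcribe` all reduce to one compute-type choice. This hedged helper (the name `pick_whisper_model` is hypothetical, not part of the diff) expresses the same selection more compactly:

```
import torch
from faster_whisper import WhisperModel

def pick_whisper_model(model_size, quantization):
    # same matrix as transcribe(): int8 variants when quantization is on,
    # float16 on GPU and float32 on CPU otherwise
    if torch.cuda.is_available():
        compute_type = "int8_float16" if quantization else "float16"
        return WhisperModel(model_size, device="cuda", compute_type=compute_type)
    compute_type = "int8" if quantization else "float32"
    return WhisperModel(model_size, device="cpu", compute_type=compute_type)
```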
diff --git a/speechlib/wav_segmenter.py b/speechlib/wav_segmenter.py
new file mode 100644
index 0000000..ae4197c
--- /dev/null
+++ b/speechlib/wav_segmenter.py
@@ -0,0 +1,39 @@
+import os
+from pydub import AudioSegment
+from .transcribe import (transcribe)
+
+# segment according to speaker
+def wav_file_segmentation(file_name, segments, language, modelSize, whisper_type, quantization):
+    # Load the WAV file
+    audio = AudioSegment.from_file(file_name, format="wav")
+    trans = ""
+
+    texts = []
+
+    folder_name = "segments"
+
+    if not os.path.exists(folder_name):
+        os.makedirs(folder_name)
+
+    i = 0
+
+    for segment in segments:
+        start = segment[0] * 1000  # start time in milliseconds
+        end = segment[1] * 1000    # end time in milliseconds
+        clip = audio[start:end]
+        i = i + 1
+        file = folder_name + "/" + "segment" + str(i) + ".wav"
+        clip.export(file, format="wav")
+
+        try:
+            trans = transcribe(file, language, modelSize, whisper_type, quantization)
+
+            # appends -> [start time, end time, transcript]
+            texts.append([segment[0], segment[1], trans])
+        except Exception:
+            pass
+
+        # Delete the WAV file after processing
+        os.remove(file)
+
+    return texts
diff --git a/speechlib/whisper_sinhala.py b/speechlib/whisper_sinhala.py
new file mode 100644
index 0000000..d4a3d92
--- /dev/null
+++ b/speechlib/whisper_sinhala.py
@@ -0,0 +1,8 @@
+from transformers import pipeline
+
+def whisper_sinhala(file):
+    pipe = pipeline("automatic-speech-recognition", model="Ransaka/whisper-tiny-sinhala-20k-8k-steps-v2")
+    res = pipe(file)
+    return res["text"]
diff --git a/speechlib/write_log_file.py b/speechlib/write_log_file.py
new file mode 100644
index 0000000..bf8d9e1
--- /dev/null
+++ b/speechlib/write_log_file.py
@@ -0,0 +1,32 @@
+import os
+from datetime import datetime
+
+def write_log_file(common_segments, log_folder, file_name, language):
+
+    if not os.path.exists(log_folder):
+        os.makedirs(log_folder)
+
+    # ---------------------log file part-------------------------
+
+    current_time = datetime.now().strftime('%H%M%S')
+
+    file_name = os.path.splitext(os.path.basename(file_name))[0]
+
+    log_file = log_folder + "/" + file_name + "_" + current_time + "_" + language + ".txt"
+
+    entry = ""
+
+    for segment in common_segments:
+        start = segment[0]
+        end = segment[1]
+        text = segment[2]
+        speaker = segment[3]
+
+        entry += f"{speaker} ({start} : {end}) : {text}\n"
+
+    with open(log_file, "wb") as lf:
+        lf.write(entry.encode('utf-8'))
+
+    # -------------------------log file end-------------------------
diff --git a/transcript.png b/transcript.png
new file mode 100644
index 0000000..21e7b0b
Binary files /dev/null and b/transcript.png differ
diff --git a/voices_folder_structure1.png b/voices_folder_structure1.png
new file mode 100644
index 0000000..2e8ad16
Binary files /dev/null and b/voices_folder_structure1.png differ
diff --git a/whisper_LICENSE b/whisper_LICENSE
new file mode 100644
index 0000000..d255525
--- /dev/null
+++ b/whisper_LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 OpenAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
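Putting the pieces of this diff together, a typical end-to-end call might look as follows. This is a sketch: it assumes `PreProcessor` is exported from the package alongside `Transcriptor`, that `interview.mp3` is an illustrative input file, and that `convert_to_mono`/`re_encode` operate on the file in place, as their docstrings suggest:

```
from speechlib import PreProcessor, Transcriptor

prep = PreProcessor()
wav_file = prep.convert_to_wav("interview.mp3")  # convert to wav, returns the new path
prep.convert_to_mono(wav_file)                   # stereo -> mono
prep.re_encode(wav_file)                         # re-encode to 16-bit PCM if the file is 8-bit

transcriptor = Transcriptor(wav_file, "logs", "en", "medium",
                            "your_huggingface_token", voices_folder="voices")
res = transcriptor.faster_whisper()              # [[start, end, text, speaker], ...]
```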