# install dependencies:
# sudo apt install uvicorn ffmpeg
# pip3 install "fastapi[all]"
# pip3 install uvloop
# pip3 install openai-whisper ffmpeg-python numpy scipy
# launch the app:
# python3 -m uvicorn WhisperServer:app --reload --port 11437
# browse: http://127.0.0.1:11437
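# example request once running (the payload shape matches the TranslateItem model
# defined below; the sample values are placeholders):
#   curl -X POST http://127.0.0.1:11437/translate \
#        -H "Content-Type: application/json" \
#        -d '{"data": [0.0, 0.01, -0.02], "sampleRate": 44100}'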
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from typing import List
import numpy as np
import ffmpeg
import scipy.signal as sps
import whisper
print("Loading Whisper Model...")
#model = whisper.load_model("tiny")
model = whisper.load_model("base") #max size supported by Samsung ChromeBook4 with 4GB of RAM
#model = whisper.load_model("small")
#model = whisper.load_model("medium")
#model = whisper.load_model("large") # too large for my PC
app = FastAPI()
class TranslateItem(BaseModel):
    data: List[float]
    sampleRate: int
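# A hypothetical client sketch for building this payload from a mono WAV file
# (assumes the third-party `requests` and `soundfile` packages and a local
# "sample.wav"; none of these are part of this server):
#   import requests
#   import soundfile as sf
#   wave, rate = sf.read("sample.wav", dtype="float32")  # float samples in [-1, 1]
#   resp = requests.post("http://127.0.0.1:11437/translate",
#                        json={"data": wave.tolist(), "sampleRate": rate})
#   print(resp.json()["text"])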
# hard-coded audio hyperparameters
SAMPLE_RATE = 16000
def load_audio(file: str, sr: int = SAMPLE_RATE):
"""
Open an audio file and read as mono waveform, resampling as necessary
Parameters
----------
file: str
The audio file to open
sr: int
The sample rate to resample the audio if necessary
Returns
-------
A NumPy array containing the audio waveform, in float32 dtype.
"""
try:
# This launches a subprocess to decode audio while down-mixing and resampling as necessary.
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
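# Example usage of load_audio (a sketch; the file name comes from the test comments
# further below and is assumed to exist locally). Note this helper is only exercised
# by the commented-out debug path, not by the live /translate endpoint:
#   audio = load_audio("Test_1Sec.wav")   # float32 mono waveform at 16 kHz
#   print(len(audio), audio.dtype)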
@app.post('/translate')
async def api_translate(item: TranslateItem):
    print("SAMPLE_RATE", SAMPLE_RATE, "sampleRate", item.sampleRate, "data", len(item.data))
    if len(item.data) == 0:
        return {"text": "Data is empty!"}
    # debugging alternatives, kept for reference: transcribe a local test file directly,
    # or load it through whisper.load_audio / load_audio / scipy.io.wavfile instead of
    # using the request body:
    #   results = model.transcribe("Test_MP3.mp3", language="en")
    #   audio = whisper.load_audio("Test_WAV.wav")
    #   audio = load_audio("Test_1Sec.wav")
    #   from scipy.io import wavfile
    #   rateWave, rawWave = wavfile.read("Test_WAV.wav")
    rawWave = item.data
    # resample the incoming audio from the client's rate to Whisper's expected 16 kHz
    number_of_samples = round(len(rawWave) * float(SAMPLE_RATE) / float(item.sampleRate))
    resampledWave = sps.resample(rawWave, number_of_samples)
    # item.data is expected to hold float samples already scaled to [-1, 1]; the
    # commented variant is for clients that send raw 16-bit integer samples instead:
    #   audio = np.array(resampledWave).astype(np.float32) / 32768.0
    audio = np.array(resampledWave).astype(np.float32)
    # pad or trim to the fixed 30-second window Whisper operates on
    audio = whisper.pad_or_trim(audio)
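    # Worked example of the resampling arithmetic above (values are illustrative):
    # a 1-second clip recorded at 44100 Hz arrives as 44100 samples, so
    # number_of_samples = round(44100 * 16000 / 44100) = 16000, i.e. the same
    # 1 second of audio at Whisper's 16 kHz.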
    # lower-level alternative kept for reference: build the log-Mel spectrogram, detect
    # the spoken language, and decode explicitly instead of calling model.transcribe():
    #   mel = whisper.log_mel_spectrogram(audio).to(model.device)
    #   _, probs = model.detect_language(mel)
    #   print(f"Detected language: {max(probs, key=probs.get)}")
    #   options = whisper.DecodingOptions()
    #   result = whisper.decode(model, mel, options)
    #   print(result.text)
    # note: despite the route name, this transcribes English audio; it does not translate
    results = model.transcribe(audio, language="en")
    print("Transcription: Done!", "Text", results["text"])
    return {"text": results["text"]}
print("Whisper Server loaded!")