Commit 6b2ca0a

testing
tim-roethig-db committed May 18, 2024
1 parent 7bfb69b commit 6b2ca0a
Showing 3 changed files with 13 additions and 8 deletions.
amondin/diarize_speakers.py (4 changes: 2 additions & 2 deletions)
@@ -7,17 +7,17 @@
 
 
 def diarize_speakers(
-    file_path: str, hf_token: str, num_speakers: int = None, tolerance: float = 1.0
+    file_path: str, hf_token: str, device: str, num_speakers: int, tolerance: float = 1.0
 ) -> list[dict]:
     """
     Detect speakers in audio.wav file and label the segments of each speaker accordingly
+    :param device: Device to run the model on
     :param file_path:
     :param hf_token: HF token since the pyannote model needs authentication
     :param num_speakers: Set to None to self detect the number of speakers
     :param tolerance:
     :return:
     """
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
     pipeline = Pipeline.from_pretrained(
         "pyannote/speaker-diarization-3.1",
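With the hardcoded CUDA check gone, the caller of diarize_speakers now chooses the device. A minimal usage sketch under the new signature; the import path, file name, and token value are assumptions for illustration, not part of the commit:

    import torch

    from amondin.diarize_speakers import diarize_speakers

    # Pick the device once at the call site (this mirrors the logic
    # the commit removed from inside the function).
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    segments = diarize_speakers(
        "meeting.wav",      # placeholder input file
        hf_token="hf_...",  # placeholder token for the gated pyannote model
        device=device,
        num_speakers=None,  # per the docstring, None self-detects the count
    )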
amondin/main.py (13 changes: 9 additions & 4 deletions)
@@ -9,11 +9,12 @@
 
 
 def transcribe(
-    input_file_path: str, output_file_path: str, hf_token: str, language: str = "german", num_speakers: int = None,
-    s2t_model: str = "openai/whisper-tiny"
+    input_file_path: str, output_file_path: str, hf_token: str, device: str = "cpu",
+    language: str = "german", num_speakers: int = None, s2t_model: str = "openai/whisper-tiny"
 ):
     """
     Transcribe a given audio.wav file.
+    :param device: Device to run the model on [cpu, cuda or cuda:x]
     :param output_file_path:
     :param input_file_path:
     :param hf_token:
@@ -23,17 +24,21 @@ def transcribe(
     :param s2t_model:
     :return:
     """
+
+    print(f"Running on {device}.")
+
     print("Diarizing speakers...")
     diarized_speakers = diarize_speakers(
         input_file_path,
         hf_token=hf_token,
         num_speakers=num_speakers,
+        device=device
     )
 
-    print("Transcripting audio...")
+    print("Transcribing audio...")
     transcript = []
     for i, speaker_section in enumerate(diarized_speakers):
-        print(f"Transcripting part {i+1} of {len(diarized_speakers)}")
+        print(f"Transcribing part {i+1} of {len(diarized_speakers)}")
         text = speech2text(
             speaker_section["audio"],
             model=s2t_model,
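Since transcribe now defaults to device="cpu", GPU execution becomes opt-in at the top level. A hedged example call; the module path amondin.main, the file names, and the token are assumptions:

    from amondin.main import transcribe

    transcribe(
        input_file_path="interview.wav",
        output_file_path="interview.txt",
        hf_token="hf_...",
        device="cuda:0",  # omit to stay on the CPU default
        num_speakers=2,   # or None to let the diarizer estimate it
    )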
amondin/speech2text.py (4 changes: 2 additions & 2 deletions)
@@ -6,15 +6,15 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 
 
-def speech2text(audio: dict, model: str = "openai/whisper-tiny", language: str = "german") -> str:
+def speech2text(audio: dict, device: str, model: str = "openai/whisper-tiny", language: str = "german") -> str:
     """
     Translate audio to text
+    :param device: Device to run the model on [cpu, cuda or cuda:x]
     :param audio: dictionary containing audio as numpy array of shape (n,) and the sampling rate
     :param model:
     :param language:
     :return:
     """
-    device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
     # load model from huggingface
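One consequence worth noting: speech2text still derives torch_dtype from torch.cuda.is_available() rather than from the device argument, so calling it with device="cpu" on a CUDA-capable machine would pair a CPU model with float16. A small sketch, not part of the commit, of keying the dtype off the requested device instead; the helper name is hypothetical:

    import torch

    def dtype_for(device: str) -> torch.dtype:
        # Use half precision only when the model will actually run on CUDA.
        return torch.float16 if device.startswith("cuda") else torch.float32

    torch_dtype = dtype_for("cpu")  # float32 even if a GPU is present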
