
adding xlsx as output file type
tim-roethig-db committed May 20, 2024
1 parent f86b6c2 commit 4a7e360
Showing 2 changed files with 4 additions and 16 deletions.
amondin/main.py: 0 additions & 12 deletions

@@ -33,19 +33,7 @@ def transcribe(
"""

print(f"Running on {device}...")
"""
if not input_file_path.endswith(".wav"):
print(f"Converting {input_file_path} to .wav...")
# get filename
file_name = Path(input_file_path).stem
# convert input file to .wav and store it to disk
convert_audio_to_wav(input_file_path, f"{file_name}.wav")
# proceed with newly created .wav file
input_file_path = f"{file_name}.wav"
print(f"Created {input_file_path}")
"""
waveform, sample_rate = torchaudio.load(input_file_path)

audio = {"waveform": waveform, "sample_rate": sample_rate}

print("Segmenting speakers...")
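The deleted conversion step appears redundant because torchaudio.load can decode common compressed formats (mp3, flac, ogg) directly when a suitable backend such as ffmpeg or sox is installed. A minimal sketch of the surviving load path, assuming a non-wav input; the file name is a placeholder:

import torchaudio

# Decode the input directly; no intermediate .wav file is written.
# Format support depends on the installed torchaudio backend.
waveform, sample_rate = torchaudio.load("recording.mp3")  # placeholder file name

# The in-memory representation the rest of the pipeline now consumes.
audio = {"waveform": waveform, "sample_rate": sample_rate}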
amondin/segment_speakers.py: 4 additions & 4 deletions

@@ -7,7 +7,7 @@


 def segment_speakers(
-    file_path: str,
+    audio: dict,
     hf_token: str,
     device: str,
     num_speakers: int,
@@ -16,7 +16,7 @@ def segment_speakers(
"""
Detect speakers in audio.wav file and label the segments of each speaker accordingly
:param device: Device to run the model on
:param file_path:
:param audio:
:param hf_token: HF token since the pyannote model needs authentication
:param num_speakers: Set to None to self detect the number of speakers
:param tolerance:
@@ -32,7 +32,7 @@ def segment_speakers(
     pipeline.to(torch.device(device))

     # inference on the whole file
-    annotation = pipeline(file_path, num_speakers=num_speakers)
+    annotation = pipeline(audio, num_speakers=num_speakers)

     # merge passages from same speaker if occurring in less than tolerance after each other
     annotation = annotation.support(tolerance)
@@ -44,7 +44,7 @@ def segment_speakers(
     speaker_segments = []
     for segment in segments:
         # get audio passages as numpy array
-        waveform, sample_rate = Audio().crop(file_path, segment)
+        waveform, sample_rate = Audio().crop(audio, segment)
         waveform = torch.squeeze(waveform)
         waveform = waveform.numpy()
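For context, a minimal end-to-end sketch of how pyannote.audio consumes the same in-memory dict, both for diarization and for cropping labelled passages. The checkpoint name, token placeholder, file name, and speaker count are assumptions; the diff does not show which pipeline the repo actually loads:

import torch
import torchaudio
from pyannote.audio import Audio, Pipeline

waveform, sample_rate = torchaudio.load("meeting.wav")  # placeholder file name
audio = {"waveform": waveform, "sample_rate": sample_rate}

# pyannote pipelines accept {"waveform": Tensor(channel, time), "sample_rate": int}
# in place of a file path, which is what this commit switches to.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",  # example checkpoint, not confirmed by the diff
    use_auth_token="<HF_TOKEN>",
)
pipeline.to(torch.device("cpu"))
annotation = pipeline(audio, num_speakers=2)

# Audio.crop takes the same dict, so passages are cut without re-reading the file.
for segment, _, speaker in annotation.itertracks(yield_label=True):
    excerpt, sr = Audio().crop(audio, segment)
    print(speaker, f"{segment.start:.1f}s-{segment.end:.1f}s", excerpt.shape)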
