adding xlsx as output file type

tim-roethig-db · May 20, 2024 · 154ffe9
1 parent 4a7e360
commit 154ffe9
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 29 deletions.
diff --git a/amondin/__init__.py b/amondin/__init__.py
@@ -3,5 +3,5 @@
 """
 from .segment_speakers import segment_speakers
 from .speech2text import speech2text
-from .tools import get_secret, convert_audio_to_wav
+from .tools import get_secret
 from .main import transcribe
diff --git a/amondin/main.py b/amondin/main.py
@@ -2,11 +2,10 @@
 Main module of transcription tool
 """
 
-from pathlib import Path
 import pandas as pd
 import torchaudio
 
-from amondin.tools import convert_audio_to_wav
+from amondin.tools import get_secret
 from amondin.segment_speakers import segment_speakers
 from amondin.speech2text import speech2text
 
@@ -33,8 +32,13 @@ def transcribe(
     """
 
     print(f"Running on {device}...")
+
+    print(f"Loading {input_file_path}...")
     waveform, sample_rate = torchaudio.load(input_file_path)
-    audio = {"waveform": waveform, "sample_rate": sample_rate}
+    audio = {
+        "waveform": waveform,
+        "sample_rate": sample_rate
+    }
 
     print("Segmenting speakers...")
     speaker_segments = segment_speakers(
@@ -70,3 +74,14 @@ def transcribe(
         transcript.to_excel(output_file_path, index=False)
     else:
         raise TypeError("Only .csv and .xlsx are valid file types.")
+
+
+if __name__ == "__main__":
+    transcribe(
+        "../data/sample.wav", "../data/sample.xlsx",
+        hf_token=get_secret("../secrets.yaml", "hf-token"),
+        s2t_model="openai/whisper-tiny",
+        device="cpu",
+        language="german",
+        num_speakers=2
+    )
diff --git a/amondin/speech2text.py b/amondin/speech2text.py
@@ -40,7 +40,6 @@ def speech2text(
         max_new_tokens=128,
         chunk_length_s=30,
         batch_size=16,
-        return_timestamps=True,
         torch_dtype=torch_dtype,
         device=device,
     )

diff --git a/amondin/tools.py b/amondin/tools.py
@@ -3,9 +3,6 @@
 """
 
 import yaml
-import ffmpeg
-import librosa
-import soundfile
 
 
 def get_secret(path2yaml: str, key: str):
@@ -19,21 +16,3 @@ def get_secret(path2yaml: str, key: str):
         secrets = yaml.safe_load(file)
 
     return secrets[key]
-
-
-def convert_audio_to_wav(input_path: str, output_path: str):
-    """
-    Convert a given input audio file to .wav needed for AI pipelines
-    :param input_path:
-    :param output_path:
-    :return:
-    """
-    ffmpeg.input(input_path).output(
-        output_path,
-        format="wav",
-    ).run(
-        overwrite_output=True
-    )
-
-    y, s = librosa.load(output_path, sr=16000)
-    soundfile.write(output_path, y, s)
diff --git a/setup.py b/setup.py
@@ -12,11 +12,9 @@
         "pyannote.audio",
         "pyannote.core",
         "pyyaml",
-        "ffmpeg-python",
         "pandas",
-        "librosa",
-        "soundfile",
         "numpy",
         "torch",
+        "torchaudio",
     ]
 )