adding post processing

tim-roethig-db · May 25, 2024 · 1bf907e · 1bf907e
1 parent b2dcf26
commit 1bf907e
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 7 deletions.
diff --git a/amondin/main.py b/amondin/main.py
@@ -8,7 +8,7 @@
 from amondin.tools import get_secret
 from amondin.segment_speakers import segment_speakers
 from amondin.speech2text import speech2text
-from amondin.post_processing import merge_rows_consecutive_speaker
+from amondin.post_processing import merge_rows_consecutive_speaker, format_time_stamp
 
 
 def transcribe(
@@ -73,6 +73,8 @@ def transcribe(
 
     transcript = merge_rows_consecutive_speaker(transcript)
 
+    transcript = format_time_stamp(transcript)
+
     # save transcript
     print(transcript.to_markdown(index=False))
     if output_file_path.endswith(".csv"):

diff --git a/amondin/post_processing.py b/amondin/post_processing.py
@@ -1,11 +1,18 @@
+from datetime import timedelta
 import pandas as pd
 
 
+def _seconds_to_time_stamp(seconds: float) -> str:
+    """Format an offset in seconds as a ``MM:SS:mmm`` time stamp.
+
+    The value is rounded to the nearest millisecond before it is split, so
+    binary floating point noise (``1.001`` is stored as ``1.00099...``) can
+    no longer truncate the last digit, and a carry propagates correctly into
+    the seconds and minutes fields.
+
+    Args:
+        seconds: Offset in seconds.
+
+    Returns:
+        Zero-padded ``minutes:seconds:milliseconds`` string. Minutes are not
+        wrapped into hours, so the field grows past two digits for offsets
+        of 100 minutes or more.
+    """
+    # work on an integer millisecond count to stay out of float arithmetic
+    total_milliseconds = int(round(float(seconds) * 1000))
+
+    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
+    minutes, whole_seconds = divmod(total_seconds, 60)
+
+    return f"{minutes:02}:{whole_seconds:02}:{milliseconds:03}"
+
+
 def merge_rows_consecutive_speaker(transcript: pd.DataFrame) -> pd.DataFrame:
     transcript['speaker_group'] = (transcript['speaker'] != transcript['speaker'].shift()).cumsum()
 
-    print(transcript.to_markdown())
-
     transcript = transcript.groupby(['speaker_group', 'speaker']).agg({
         'start': "min",
         "end": "max",
@@ -14,11 +21,21 @@ def merge_rows_consecutive_speaker(transcript: pd.DataFrame) -> pd.DataFrame:
 
     transcript = transcript.drop(columns='speaker_group')
 
-    print(transcript.to_markdown())
-
     return transcript
 
 
+def format_time_stamp(transcript: pd.DataFrame) -> pd.DataFrame:
+    """Collapse the ``start``/``end`` columns into one ``time_stamp`` column.
+
+    Args:
+        transcript: Frame with ``speaker``, ``start``, ``end`` and ``text``
+            columns; ``start`` and ``end`` are passed to
+            ``_seconds_to_time_stamp`` and therefore hold offsets in seconds.
+            The frame itself is not modified.
+
+    Returns:
+        New frame with the columns ``speaker``, ``time_stamp`` and ``text``,
+        where ``time_stamp`` reads ``"<start> -> <end>"``.
+    """
+    # building a plain list leaves the caller's frame untouched and also
+    # works for an empty transcript, where a row-wise apply would not
+    # return a Series
+    time_stamps = [
+        f"{_seconds_to_time_stamp(start)} -> {_seconds_to_time_stamp(end)}"
+        for start, end in zip(transcript['start'], transcript['end'])
+    ]
+
+    formatted = transcript[['speaker', 'text']].copy()
+    formatted.insert(1, 'time_stamp', time_stamps)
+
+    return formatted
+
+
 if __name__ == "__main__":
     test_transcript = pd.read_excel("../data/test_transcript.xlsx")
-    merge_rows_consecutive_speaker(transcript=test_transcript)
+    print(format_time_stamp(transcript=test_transcript).to_markdown())
diff --git a/amondin/segment_speakers.py b/amondin/segment_speakers.py
@@ -52,7 +52,7 @@ def segment_speakers(
     # store all passages in a list of dicts
     speaker_segments = []
     for segment in segments:
-        if segment.duration > 0.1:
+        if segment.duration > tolerance:
             # get audio passages as numpy array
             waveform, sample_rate = Audio().crop(audio, segment, mode="pad")
             waveform = torch.squeeze(waveform)