Skip to content

Commit

Permalink
adding post processing
Browse files Browse the repository at this point in the history
  • Loading branch information
tim-roethig-db committed May 25, 2024
1 parent b2dcf26 commit 1bf907e
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 7 deletions.
4 changes: 3 additions & 1 deletion amondin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from amondin.tools import get_secret
from amondin.segment_speakers import segment_speakers
from amondin.speech2text import speech2text
from amondin.post_processing import merge_rows_consecutive_speaker
from amondin.post_processing import merge_rows_consecutive_speaker, format_time_stamp


def transcribe(
Expand Down Expand Up @@ -73,6 +73,8 @@ def transcribe(

transcript = merge_rows_consecutive_speaker(transcript)

transcript = format_time_stamp(transcript)

# save transcript
print(transcript.to_markdown(index=False))
if output_file_path.endswith(".csv"):
Expand Down
27 changes: 22 additions & 5 deletions amondin/post_processing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from datetime import timedelta
import pandas as pd


def _seconds_to_time_stamp(seconds: float) -> str:
    """Format a duration given in seconds as an ``MM:SS:mmm`` string.

    The value is rounded to the nearest millisecond first and then split
    with integer arithmetic. Subtracting the integer part from a float and
    truncating the remainder loses a millisecond for values such as 1.001
    (which is stored as 1.000999...), so that approach is avoided.

    Minutes are not wrapped into hours: durations of one hour or more simply
    yield a minutes field greater than 59.

    Args:
        seconds: Offset in seconds (may be fractional).

    Returns:
        The zero-padded time stamp, e.g. ``"01:01:500"`` for ``61.5``.
    """
    # Work in whole milliseconds so no float remainder has to be truncated.
    total_milliseconds = int(round(seconds * 1000))

    minutes, remainder = divmod(total_milliseconds, 60_000)
    whole_seconds, milliseconds = divmod(remainder, 1000)

    return f"{minutes:02}:{whole_seconds:02}:{milliseconds:03}"


def merge_rows_consecutive_speaker(transcript: pd.DataFrame) -> pd.DataFrame:
transcript['speaker_group'] = (transcript['speaker'] != transcript['speaker'].shift()).cumsum()

print(transcript.to_markdown())

transcript = transcript.groupby(['speaker_group', 'speaker']).agg({
'start': "min",
"end": "max",
Expand All @@ -14,11 +21,21 @@ def merge_rows_consecutive_speaker(transcript: pd.DataFrame) -> pd.DataFrame:

transcript = transcript.drop(columns='speaker_group')

print(transcript.to_markdown())

return transcript


def format_time_stamp(transcript: pd.DataFrame) -> pd.DataFrame:
    """Replace the numeric start/end columns by one readable time stamp column.

    The input frame is left untouched; a new frame is returned. Rewriting the
    ``start``/``end`` columns of the caller's frame in place would turn them
    into strings and break any later numeric use of the same frame.

    Args:
        transcript: Frame with the columns ``speaker``, ``start``, ``end``
            and ``text``; ``start`` and ``end`` are offsets in seconds.

    Returns:
        A new frame with the columns ``speaker``, ``time_stamp`` and ``text``
        (in that order), where ``time_stamp`` reads ``"<start> -> <end>"``.
    """
    # Build the strings from the original numeric columns without mutating
    # them; a plain comprehension also copes with an empty frame.
    time_stamps = [
        f"{_seconds_to_time_stamp(start)} -> {_seconds_to_time_stamp(end)}"
        for start, end in zip(transcript['start'], transcript['end'])
    ]

    formatted = transcript[['speaker', "text"]].copy()
    # Insert between speaker and text to keep the established column order.
    formatted.insert(1, "time_stamp", time_stamps)

    return formatted


if __name__ == "__main__":
test_transcript = pd.read_excel("../data/test_transcript.xlsx")
merge_rows_consecutive_speaker(transcript=test_transcript)
print(format_time_stamp(transcript=test_transcript).to_markdown())
2 changes: 1 addition & 1 deletion amondin/segment_speakers.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def segment_speakers(
# store all passages in a list of dicts
speaker_segments = []
for segment in segments:
if segment.duration > 0.1:
if segment.duration > tolerance:
# get audio passages as numpy array
waveform, sample_rate = Audio().crop(audio, segment, mode="pad")
waveform = torch.squeeze(waveform)
Expand Down

0 comments on commit 1bf907e

Please sign in to comment.