Skip to content

Commit

Permalink
substudy: Add more transcription prompt opts
Browse files Browse the repository at this point in the history
You can now use:

--example-text (sample text vaguely related to the audio, used as a transcription hint)
--expected-text (the text you expect the transcription to produce; good for songs)

Or you can pass neither, in which case the transcriber will do its best.

This PR also adds test cases for the OpenAI-based features.

The "poem" in fixtures was written by ChatGPT-4 and was voiced by their TTS.
  • Loading branch information
emk committed Apr 4, 2024
1 parent 3ee84df commit 66d0709
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 34 deletions.
22 changes: 22 additions & 0 deletions python-experiments/tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

def text_to_speech(input_text, output_file):
    """Synthesize `input_text` as speech and save the audio to `output_file`.

    Uses the OpenAI text-to-speech API with the "tts-1" model and the
    "shimmer" voice. The API key is read from the environment, which is
    populated from `.env` by the `load_dotenv()` call at import time.

    Args:
        input_text: The text to be spoken.
        output_file: Path where the audio is written (an MP3, per the
            CLI usage message below).
    """
    client = OpenAI()

    response = client.audio.speech.create(
        model="tts-1",
        voice="shimmer",
        input=input_text,
    )

    # Stream the returned audio bytes straight to disk.
    response.stream_to_file(output_file)

if __name__ == "__main__":
    import sys

    # Expect exactly two arguments: the text to speak and the output path.
    if len(sys.argv) != 3:
        print("Usage: python tts.py <input_text> <output_mp3_file>")
        sys.exit(1)
    text_to_speech(sys.argv[1], sys.argv[2])
2 changes: 1 addition & 1 deletion substudy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ repository = "https://github.com/emk/substudy"
readme = "README.md"
keywords = ["text"]

exclude = ["fixtures/empty.mp4"]
exclude = ["/fixtures"]

[dependencies]
anyhow = "1.0.80"
Expand Down
Binary file added substudy/fixtures/poem.es.mp3
Binary file not shown.
7 changes: 7 additions & 0 deletions substudy/fixtures/poem.es.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
0
00:00:00,000 --> 00:00:03,840
En el bosque se escucha un susurro, el viento entre los árboles murmuro.

1
00:00:04,247 --> 00:00:08,580
Bajo su sombra, todo es puro, en su abrazo, el mundo se siente seguro.
2 changes: 2 additions & 0 deletions substudy/fixtures/poem.es.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
En el bosque se escucha un susurro, el viento entre los árboles murmuro.
Bajo su sombra, todo es puro, en su abrazo, el mundo se siente seguro.
33 changes: 32 additions & 1 deletion substudy/src/import/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ const DEFAULT_CHARS_PER_SECOND: f32 = 15.0;
/// Import a Whisper JSON file and convert it to an SRT file.
pub fn import_whisper_json(whisper_json: &WhisperJson) -> Result<SubtitleFile> {
let mut whisper = whisper_json.clone().clean();
whisper.resegment();
if !whisper.trust_segment_text {
whisper.resegment();
}
let words = whisper.words_for_each_segment(&whisper.words);
let mut analyzed = AnalyzedSegments::new(&whisper.segments, &words);
analyzed.fix_times();
Expand Down Expand Up @@ -127,6 +129,10 @@ pub struct WhisperJson {
words: Vec<Word>,
segments: Vec<Segment>,

/// Should we trust the segment text as much as possible?
#[serde(default, skip)]
trust_segment_text: bool,

/// Other keys we don't recognize but want to keep.
#[serde(flatten)]
extra: serde_json::Map<String, serde_json::Value>,
Expand All @@ -149,6 +155,20 @@ impl WhisperJson {
.with_context(|| format!("Failed to parse Whisper JSON string: {:?}", s))
}

/// Replace our segments with ones built from untimed text.
///
/// The text is split on line breaks; blank lines are dropped. We use this
/// when we already have known-good text and only need to match it up with
/// timing data, so we also mark the segment text as trusted.
pub(crate) fn set_segments_from_untimed_text(&mut self, text: &str) {
    debug!("Setting segments from untimed text");
    self.trust_segment_text = true;
    let mut segments = Vec::new();
    for line in text.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            segments.push(Segment::from_untimed_text(trimmed));
        }
    }
    self.segments = segments;
}

/// Clean up and normalize the Whisper JSON file.
fn clean(mut self) -> WhisperJson {
// We don't bother to clean the "text" field, because we don't use it to
Expand Down Expand Up @@ -332,6 +352,17 @@ struct Segment {
}

impl Segment {
/// Build a segment from untimed text.
///
/// All timing fields start at zero — presumably they get filled in later
/// when the text is matched against timing data (see
/// `set_segments_from_untimed_text`).
fn from_untimed_text<S: Into<String>>(text: S) -> Segment {
    let text = text.into();
    Segment {
        start: 0.0,
        end: 0.0,
        no_speech_prob: 0.0,
        text,
        extra: Default::default(),
    }
}

/// Offset by the specified time.
fn offset(&mut self, time_offset: f32) {
self.start += time_offset;
Expand Down
53 changes: 46 additions & 7 deletions substudy/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
#![warn(missing_docs)]

use std::{
fs::read_to_string,
io::{stdout, BufWriter},
path::{Path, PathBuf},
};

use anyhow::{bail, Context};
pub use anyhow::{Error, Result};
use clap::{Parser, Subcommand};
use dotenv::dotenv;
use export::ExporterBuilder;
use services::oai::TranscriptionPrompt;
use tempfile::tempdir;
use video::Video;

Expand Down Expand Up @@ -93,9 +96,16 @@ enum Args {
/// Path to the video.
video: PathBuf,

/// Example text related to or similar to audio.
#[arg(long)]
example_text: PathBuf,
/// Path to sample text which resembles the content of the video.
#[arg(long, conflicts_with = "expected_text")]
example_text: Option<PathBuf>,

/// Path to expected text, for when you already know more or less what
/// the subtitles should say, but you want to sync them up with the
/// video. Line breaks will be treated as subtitle breaks. Most useful
/// for music.
#[arg(long, conflicts_with = "example_text")]
expected_text: Option<PathBuf>,

/// Output format for the transcription.
#[arg(long, default_value = "srt")]
Expand Down Expand Up @@ -260,15 +270,45 @@ async fn main() -> Result<()> {
Args::Transcribe {
video,
example_text,
expected_text,
format,
} => cmd_transcribe(&ui, &video, &example_text, format).await,
} => {
let prompt = prompt_from(example_text, expected_text)?;
cmd_transcribe(&ui, &video, prompt.as_ref(), format).await
}
Args::Translate {
foreign_subs,
native_lang,
} => cmd_translate(&ui, &foreign_subs, &native_lang).await,
}
}

/// Build our transcription prompt from the command-line arguments.
///
/// Returns `None` when neither text option was supplied. Otherwise reads
/// the chosen file and wraps its contents in the matching prompt variant.
fn prompt_from(
    example_text: Option<PathBuf>,
    expected_text: Option<PathBuf>,
) -> Result<Option<TranscriptionPrompt>> {
    // Slurp a prompt file, attaching the offending path to any I/O error.
    let read_prompt = |path: &Path| {
        read_to_string(path)
            .with_context(|| format!("Could not read file: {}", path.display()))
    };
    match (example_text, expected_text) {
        (Some(_), Some(_)) => {
            // Clap's `conflicts_with` should make this unreachable.
            bail!("Cannot specify both --example-text and --expected-text")
        }
        (Some(path), None) => {
            read_prompt(&path).map(|text| Some(TranscriptionPrompt::Example(text)))
        }
        (None, Some(path)) => {
            read_prompt(&path).map(|text| Some(TranscriptionPrompt::Expected(text)))
        }
        (None, None) => Ok(None),
    }
}

fn cmd_clean(path: &Path) -> Result<()> {
let file1 = SubtitleFile::cleaned_from_path(path)?;
print!("{}", file1.to_string());
Expand Down Expand Up @@ -351,15 +391,14 @@ fn cmd_import(format: ImportFormat) -> Result<()> {
/// Transcribe the audio track of `video_path` and write the result to
/// standard output in the requested `format`.
///
/// `prompt` optionally steers the transcription with example or expected
/// text (see `prompt_from`). Output is buffered and stdout is locked once
/// for the whole transcription.
async fn cmd_transcribe(
    ui: &Ui,
    video_path: &Path,
    prompt: Option<&TranscriptionPrompt>,
    format: TranscriptionFormat,
) -> Result<()> {
    let video = Video::new(video_path).await?;
    let out = stdout();
    let mut writer = BufWriter::new(out.lock());
    format
        .write_transcription(ui, &video, prompt, &mut writer)
        .await?;
    Ok(())
}
Expand Down
5 changes: 4 additions & 1 deletion substudy/src/services/oai/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ use log::debug;

use crate::Result;

pub use self::{transcribe::TranscriptionFormat, translate::translate_subtitle_file};
pub use self::{
transcribe::{TranscriptionFormat, TranscriptionPrompt},
translate::translate_subtitle_file,
};

mod transcribe;
mod translate;
Expand Down
Loading

0 comments on commit 66d0709

Please sign in to comment.