Skip to content

Commit

Permalink
substudy: Polish transcription
Browse files Browse the repository at this point in the history
- New --help messages
- New --lang option for when no text is available
- `--example-text` is now `--related-text` (with alias)
- Language auto-detection fixed
  • Loading branch information
emk committed Apr 4, 2024
1 parent 66d0709 commit c202744
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 32 deletions.
83 changes: 62 additions & 21 deletions substudy/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,24 +91,62 @@ enum Args {
},

/// Transcribe subtitles from audio.
#[command(name = "transcribe")]
#[command(
name = "transcribe",
after_help = "\
Examples:
To transcribe a song, you might use the following command:
substudy transcribe --expected-text=full-lyrics.txt song.mp3 > song.srt
To transcribe a video, you might use the following command:
substudy transcribe --related-text=opening-voiceover.txt vid.mp4 > vid.srt
If your output is missing lots of lines, you might try Whisper's raw SRT output:
substudy transcribe --format=whisper-srt --related-text=sample-dialog.txt \\
vid.mp4 > vid.srt
This may require more cleanup.
If you have no related text at all, you can omit both `--related-text` and
`--expected-text`. The transcriber will try its best."
)]
Transcribe {
/// Path to the video.
video: PathBuf,

/// Path to sample text which resembles the content of the video.
#[arg(long, conflicts_with = "expected_text")]
example_text: Option<PathBuf>,

/// Path to expected text, for when you already know more or less what
/// the subtitles should say, but you want to sync them up with the
/// video. Line breaks will be treated as subtitle breaks. Most useful
/// for music.
#[arg(long, conflicts_with = "example_text")]
/// Path to sample text which resembles the content of the video. Using
/// this will help the transcriber to better understand the audio.
///
/// Cannot be used with `--expected-text`.
#[arg(long, conflicts_with = "expected_text", alias = "example-text")]
related_text: Option<PathBuf>,

/// Path to complete expected text. This is for when you already know
/// more or less what the subtitles should say, but you want to sync
/// them up with the video. Line breaks will be treated as subtitle
/// breaks. Most useful for music.
///
/// Cannot be used with `--related-text`. Treated the same as
/// `--related-text` when `--format=whisper-srt`.
#[arg(long, conflicts_with = "related_text")]
expected_text: Option<PathBuf>,

/// Output format for the transcription.
#[arg(long, default_value = "srt")]
/// Primary language used in the media (e.g. "en" for English). This can
/// normally be auto-detected from `--related-text` or
/// `--expected-text`. But if you don't pass either, it might help.
#[arg(long)]
lang: Option<String>,

/// Output format for the transcription. Possible values:
///
/// - `srt`: Standard SRT format, with cleanup applied.
/// - `whisper-srt`: Whisper's raw SRT format, with no cleanup.
/// - `whisper-json`: Whisper's verbose JSON output, for programmers.
#[arg(long, default_value = "srt", verbatim_doc_comment)]
format: TranscriptionFormat,
},

Expand Down Expand Up @@ -269,12 +307,14 @@ async fn main() -> Result<()> {
} => cmd_tracks(&video).await,
Args::Transcribe {
video,
example_text,
related_text,
expected_text,
lang,
format,
} => {
let prompt = prompt_from(example_text, expected_text)?;
cmd_transcribe(&ui, &video, prompt.as_ref(), format).await
let lang = lang.map(|l| Lang::iso639(&l)).transpose()?;
let prompt = prompt_from(related_text, expected_text)?;
cmd_transcribe(&ui, &video, prompt.as_ref(), lang, format).await
}
Args::Translate {
foreign_subs,
Expand All @@ -285,21 +325,21 @@ async fn main() -> Result<()> {

/// Build our transcription prompt from the command-line arguments.
fn prompt_from(
example_text: Option<PathBuf>,
related_text: Option<PathBuf>,
expected_text: Option<PathBuf>,
) -> Result<Option<TranscriptionPrompt>> {
let read = |p: &Path| {
read_to_string(p)
.with_context(|| format!("Could not read file: {}", p.display()))
};
match (example_text, expected_text) {
match (related_text, expected_text) {
(Some(_), Some(_)) => {
// Clap should prevent this from happening.
bail!("Cannot specify both --example-text and --expected-text")
}
(Some(example_text), None) => {
let example_text = read(&example_text)?;
Ok(Some(TranscriptionPrompt::Example(example_text)))
(Some(related_text), None) => {
let related_text = read(&related_text)?;
Ok(Some(TranscriptionPrompt::Related(related_text)))
}
(None, Some(expected_text)) => {
let expected_text = read(&expected_text)?;
Expand Down Expand Up @@ -392,13 +432,14 @@ async fn cmd_transcribe(
ui: &Ui,
video_path: &Path,
prompt: Option<&TranscriptionPrompt>,
lang: Option<Lang>,
format: TranscriptionFormat,
) -> Result<()> {
let video = Video::new(video_path).await?;
let out = stdout();
let writer = out.lock();
format
.write_transcription(ui, &video, prompt, &mut BufWriter::new(writer))
.write_transcription(ui, &video, prompt, lang, &mut BufWriter::new(writer))
.await?;
Ok(())
}
Expand Down
25 changes: 14 additions & 11 deletions substudy/src/services/oai/transcribe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use crate::{
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TranscriptionPrompt {
/// Text similar to the output we want. This acts as a hint to the AI.
Example(String),
Related(String),

/// Expected output from the transcription, including line breaks. This is
/// used to synchronize existing text with media. Most useful for lyrics.
Expand All @@ -46,7 +46,7 @@ impl TranscriptionPrompt {
/// Get the text of the prompt.
pub fn text(&self) -> &str {
match self {
TranscriptionPrompt::Example(text)
TranscriptionPrompt::Related(text)
| TranscriptionPrompt::Expected(text) => text,
}
}
Expand All @@ -73,6 +73,7 @@ impl TranscriptionFormat {
ui: &Ui,
video: &Video,
prompt: Option<&TranscriptionPrompt>,
lang: Option<Lang>,
writer: &mut BufWriter<W>,
) -> Result<()>
where
Expand All @@ -83,22 +84,25 @@ impl TranscriptionFormat {
{
match self {
TranscriptionFormat::Srt => {
let srt = transcribe_subtitles_to_substudy_srt_file(ui, video, prompt)
.await?;
let srt =
transcribe_subtitles_to_substudy_srt_file(ui, video, prompt, lang)
.await?;
writer
.write_all(srt.to_string().as_bytes())
.context("failed to write SRT transcription")?;
}
TranscriptionFormat::WhisperSrt => {
let srt =
transcribe_subtitles::<SubtitleFile>(ui, video, prompt).await?;
transcribe_subtitles::<SubtitleFile>(ui, video, prompt, lang)
.await?;
writer
.write_all(srt.to_string().as_bytes())
.context("failed to write SRT transcription")?;
}
TranscriptionFormat::WhisperJson => {
let json =
transcribe_subtitles::<WhisperJson>(ui, video, prompt).await?;
transcribe_subtitles::<WhisperJson>(ui, video, prompt, lang)
.await?;
serde_json::to_writer(writer, &json)
.context("failed to write Whisper JSON transcription")?;
}
Expand Down Expand Up @@ -135,8 +139,9 @@ async fn transcribe_subtitles_to_substudy_srt_file(
ui: &Ui,
video: &Video,
prompt: Option<&TranscriptionPrompt>,
lang: Option<Lang>,
) -> Result<SubtitleFile> {
let whisper_json = transcribe_subtitles(&ui, video, prompt).await?;
let whisper_json = transcribe_subtitles(&ui, video, prompt, lang).await?;
import_whisper_json(&whisper_json)
}

Expand All @@ -145,15 +150,13 @@ async fn transcribe_subtitles<Subs>(
ui: &Ui,
video: &Video,
prompt: Option<&TranscriptionPrompt>,
lang: Option<Lang>,
) -> Result<Subs>
where
Subs: TranscribeFile + DeserializeOwned + Serialize + Send + Sync,
{
// Find our language.
let lang = match prompt {
Some(TranscriptionPrompt::Example(text)) => Lang::for_text(text),
_ => None,
};
let lang = lang.or_else(|| prompt.and_then(|p| Lang::for_text(p.text())));

// Figure out where to split the video to fit under the 25 MB limit.
let stream = lang.and_then(|l| video.audio_track_for(l));
Expand Down
1 change: 1 addition & 0 deletions substudy/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ fn cmd_transcribe_no_text() {
let output = testdir
.cmd()
.arg("transcribe")
.arg("--lang=es")
.arg(testdir.src_path("fixtures/poem.es.mp3"))
.output()
.expect("could not run substudy");
Expand Down

0 comments on commit c202744

Please sign in to comment.