Skip to content

Commit

Permalink
substudy: Add more transcription prompt opts
Browse files Browse the repository at this point in the history
You can now use:

--example-text (sample text vaguely related to the audio, used as a transcription hint)
--expected-text (the text you expect the transcription to produce; good for songs)

Or you can pass neither, in which case the transcriber will do its best.

This PR also adds test cases for the OpenAI-based features.

The "poem" in fixtures was written by ChatGPT-4 and was voiced by their TTS.
  • Loading branch information
emk committed Apr 4, 2024
1 parent 3ee84df commit 66d0709
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 34 deletions.
22 changes: 22 additions & 0 deletions python-experiments/tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

def text_to_speech(input_text, output_file):
    """Synthesize `input_text` as speech and save the audio to `output_file`.

    Uses the OpenAI text-to-speech API with the "tts-1" model and the
    "shimmer" voice. The API key is read from the environment, which is
    populated from `.env` by the `load_dotenv()` call at import time.

    Args:
        input_text: The text to be spoken.
        output_file: Path where the audio is written (an MP3, per the
            CLI usage message below).
    """
    client = OpenAI()

    response = client.audio.speech.create(
        model="tts-1",
        voice="shimmer",
        input=input_text,
    )

    # Stream the returned audio bytes straight to disk.
    response.stream_to_file(output_file)

if __name__ == "__main__":
    import sys

    # Expect exactly two arguments: the text to speak and the output path.
    if len(sys.argv) != 3:
        print("Usage: python tts.py <input_text> <output_mp3_file>")
        sys.exit(1)
    text_to_speech(sys.argv[1], sys.argv[2])
2 changes: 1 addition & 1 deletion substudy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ repository = "https://github.com/emk/substudy"
readme = "README.md"
keywords = ["text"]

exclude = ["fixtures/empty.mp4"]
exclude = ["/fixtures"]

[dependencies]
anyhow = "1.0.80"
Expand Down
Binary file added substudy/fixtures/poem.es.mp3
Binary file not shown.
7 changes: 7 additions & 0 deletions substudy/fixtures/poem.es.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
0
00:00:00,000 --> 00:00:03,840
En el bosque se escucha un susurro, el viento entre los árboles murmuro.

1
00:00:04,247 --> 00:00:08,580
Bajo su sombra, todo es puro, en su abrazo, el mundo se siente seguro.
2 changes: 2 additions & 0 deletions substudy/fixtures/poem.es.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
En el bosque se escucha un susurro, el viento entre los árboles murmuro.
Bajo su sombra, todo es puro, en su abrazo, el mundo se siente seguro.
33 changes: 32 additions & 1 deletion substudy/src/import/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ const DEFAULT_CHARS_PER_SECOND: f32 = 15.0;
/// Import a Whisper JSON file and convert it to an SRT file.
pub fn import_whisper_json(whisper_json: &WhisperJson) -> Result<SubtitleFile> {
let mut whisper = whisper_json.clone().clean();
whisper.resegment();
if !whisper.trust_segment_text {
whisper.resegment();
}
let words = whisper.words_for_each_segment(&whisper.words);
let mut analyzed = AnalyzedSegments::new(&whisper.segments, &words);
analyzed.fix_times();
Expand Down Expand Up @@ -127,6 +129,10 @@ pub struct WhisperJson {
words: Vec<Word>,
segments: Vec<Segment>,

/// Should we trust the segment text as much as possible?
#[serde(default, skip)]
trust_segment_text: bool,

/// Other keys we don't recognize but want to keep.
#[serde(flatten)]
extra: serde_json::Map<String, serde_json::Value>,
Expand All @@ -149,6 +155,20 @@ impl WhisperJson {
.with_context(|| format!("Failed to parse Whisper JSON string: {:?}", s))
}

/// Replace our segments with ones built from untimed text.
///
/// The text is split on line breaks; blank lines are dropped. We use this
/// when we already have known-good text and only need to match it up with
/// timing data, so we also mark the segment text as trusted.
pub(crate) fn set_segments_from_untimed_text(&mut self, text: &str) {
    debug!("Setting segments from untimed text");
    self.trust_segment_text = true;
    let mut segments = Vec::new();
    for line in text.lines() {
        let trimmed = line.trim();
        if !trimmed.is_empty() {
            segments.push(Segment::from_untimed_text(trimmed));
        }
    }
    self.segments = segments;
}

/// Clean up and normalize the Whisper JSON file.
fn clean(mut self) -> WhisperJson {
// We don't bother to clean the "text" field, because we don't use it to
Expand Down Expand Up @@ -332,6 +352,17 @@ struct Segment {
}

impl Segment {
/// Build a segment from untimed text.
///
/// All timing fields start at zero — presumably they get filled in later
/// when the text is matched against timing data (see
/// `set_segments_from_untimed_text`).
fn from_untimed_text<S: Into<String>>(text: S) -> Segment {
    let text = text.into();
    Segment {
        start: 0.0,
        end: 0.0,
        no_speech_prob: 0.0,
        text,
        extra: Default::default(),
    }
}

/// Offset by the specified time.
fn offset(&mut self, time_offset: f32) {
self.start += time_offset;
Expand Down
53 changes: 46 additions & 7 deletions substudy/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
#![warn(missing_docs)]

use std::{
fs::read_to_string,
io::{stdout, BufWriter},
path::{Path, PathBuf},
};

use anyhow::{bail, Context};
pub use anyhow::{Error, Result};
use clap::{Parser, Subcommand};
use dotenv::dotenv;
use export::ExporterBuilder;
use services::oai::TranscriptionPrompt;
use tempfile::tempdir;
use video::Video;

Expand Down Expand Up @@ -93,9 +96,16 @@ enum Args {
/// Path to the video.
video: PathBuf,

/// Example text related to or similar to audio.
#[arg(long)]
example_text: PathBuf,
/// Path to sample text which resembles the content of the video.
#[arg(long, conflicts_with = "expected_text")]
example_text: Option<PathBuf>,

/// Path to expected text, for when you already know more or less what
/// the subtitles should say, but you want to sync them up with the
/// video. Line breaks will be treated as subtitle breaks. Most useful
/// for music.
#[arg(long, conflicts_with = "example_text")]
expected_text: Option<PathBuf>,

/// Output format for the transcription.
#[arg(long, default_value = "srt")]
Expand Down Expand Up @@ -260,15 +270,45 @@ async fn main() -> Result<()> {
Args::Transcribe {
video,
example_text,
expected_text,
format,
} => cmd_transcribe(&ui, &video, &example_text, format).await,
} => {
let prompt = prompt_from(example_text, expected_text)?;
cmd_transcribe(&ui, &video, prompt.as_ref(), format).await
}
Args::Translate {
foreign_subs,
native_lang,
} => cmd_translate(&ui, &foreign_subs, &native_lang).await,
}
}

/// Build our transcription prompt from the command-line arguments.
///
/// Returns `None` when neither text option was supplied. Otherwise reads
/// the chosen file and wraps its contents in the matching prompt variant.
fn prompt_from(
    example_text: Option<PathBuf>,
    expected_text: Option<PathBuf>,
) -> Result<Option<TranscriptionPrompt>> {
    // Slurp a prompt file, attaching the offending path to any I/O error.
    let read_prompt = |path: &Path| {
        read_to_string(path)
            .with_context(|| format!("Could not read file: {}", path.display()))
    };
    match (example_text, expected_text) {
        (Some(_), Some(_)) => {
            // Clap's `conflicts_with` should make this unreachable.
            bail!("Cannot specify both --example-text and --expected-text")
        }
        (Some(path), None) => {
            read_prompt(&path).map(|text| Some(TranscriptionPrompt::Example(text)))
        }
        (None, Some(path)) => {
            read_prompt(&path).map(|text| Some(TranscriptionPrompt::Expected(text)))
        }
        (None, None) => Ok(None),
    }
}

fn cmd_clean(path: &Path) -> Result<()> {
let file1 = SubtitleFile::cleaned_from_path(path)?;
print!("{}", file1.to_string());
Expand Down Expand Up @@ -351,15 +391,14 @@ fn cmd_import(format: ImportFormat) -> Result<()> {
/// Transcribe the audio track of `video_path` and write the result to
/// standard output in the requested `format`.
///
/// `prompt` optionally steers the transcription with example or expected
/// text (see `prompt_from`). Output is buffered and stdout is locked once
/// for the whole transcription.
async fn cmd_transcribe(
    ui: &Ui,
    video_path: &Path,
    prompt: Option<&TranscriptionPrompt>,
    format: TranscriptionFormat,
) -> Result<()> {
    let video = Video::new(video_path).await?;
    let out = stdout();
    let mut writer = BufWriter::new(out.lock());
    format
        .write_transcription(ui, &video, prompt, &mut writer)
        .await?;
    Ok(())
}
Expand Down
5 changes: 4 additions & 1 deletion substudy/src/services/oai/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ use log::debug;

use crate::Result;

pub use self::{transcribe::TranscriptionFormat, translate::translate_subtitle_file};
pub use self::{
transcribe::{TranscriptionFormat, TranscriptionPrompt},
translate::translate_subtitle_file,
};

mod transcribe;
mod translate;
Expand Down
Loading

0 comments on commit 66d0709

Please sign in to comment.