Skip to content

Commit

Permalink
fix: deepgram
Browse files Browse the repository at this point in the history
  • Loading branch information
louis030195 committed Sep 16, 2024
1 parent ec408a0 commit edce2ae
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ resolver = "2"


[workspace.package]
version = "0.1.82"
version = "0.1.83"
authors = ["louis030195 <hi@louis030195.com>"]
description = ""
repository = "https://github.com/mediar-ai/screenpipe"
Expand Down
2 changes: 1 addition & 1 deletion screenpipe-app-tauri/src-tauri/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "screenpipe-app"
version = "0.2.38"
version = "0.2.39"
description = ""
authors = ["you"]
license = ""
Expand Down
16 changes: 13 additions & 3 deletions screenpipe-audio/src/stt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,12 @@ fn get_deepgram_api_key() -> String {
}

// TODO: switch to the async reqwest client instead of the blocking one; the blocking call causes crashes because the rest of our code is async
fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32], device: &str) -> Result<String> {
fn transcribe_with_deepgram(
api_key: &str,
audio_data: &[f32],
device: &str,
sample_rate: u32,
) -> Result<String> {
debug!("starting deepgram transcription");
let client = Client::new();

Expand All @@ -443,7 +448,7 @@ fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32], device: &str) ->
{
let spec = WavSpec {
channels: 1,
sample_rate: 32000,
sample_rate: sample_rate / 3, // for some reason a 96 kHz device needs 32 kHz here and a 48 kHz device needs 16 kHz (be mindful of resampling)
bits_per_sample: 32,
sample_format: hound::SampleFormat::Float,
};
Expand Down Expand Up @@ -605,7 +610,12 @@ pub fn stt(
audio_input.device,
&api_key[..8]
);
match transcribe_with_deepgram(&api_key, &speech_frames, &audio_input.device) {
match transcribe_with_deepgram(
&api_key,
&speech_frames,
&audio_input.device,
audio_input.sample_rate,
) {
Ok(transcription) => Ok(transcription),
Err(e) => {
error!(
Expand Down
10 changes: 8 additions & 2 deletions screenpipe-server/src/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,10 @@ async fn record_audio(
});

while let Ok(transcription) = whisper_receiver.try_recv() {
info!("Received transcription");
info!(
"device {} received transcription {:?}",
transcription.input.device, transcription.transcription
);
// avoiding crashing the audio processing if one fails
if let Err(e) = process_audio_result(
&db,
Expand Down Expand Up @@ -386,7 +389,10 @@ async fn process_audio_result(
let transcription = result.transcription.unwrap();
let transcription_engine = audio_transcription_engine.to_string();

info!("Inserting audio chunk: {:?}", result.path);
info!(
"device {} inserting audio chunk: {:?}",
result.input.device, result.path
);
match db.insert_audio_chunk(&result.path).await {
Ok(audio_chunk_id) => {
if transcription.is_empty() {
Expand Down

1 comment on commit edce2ae

@louis030195
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'OCR Benchmarks'.
The benchmark result of this commit is worse than the previous benchmark result by more than the threshold ratio of 2.

| Benchmark suite | Current: edce2ae | Previous: ec408a0 | Ratio |
| --- | --- | --- | --- |
| Apple Vision OCR/Performance/ | 4279803916 ns/iter (± 947340499) | 1972165583 ns/iter (± 19675750) | 2.17 |

This comment was automatically generated by workflow using github-action-benchmark.

CC: @louis030195

Please sign in to comment.