fix: deepgram

mediar-ai · Sep 16, 2024 · edce2ae · edce2ae · louis030195 · Sep 16, 2024
1 parent ec408a0
commit edce2ae
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 7 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -13,7 +13,7 @@ resolver = "2"
 
 
 [workspace.package]
-version = "0.1.82"
+version = "0.1.83"
 authors = ["louis030195 <hi@louis030195.com>"]
 description = ""
 repository = "https://github.com/mediar-ai/screenpipe"

diff --git a/screenpipe-app-tauri/src-tauri/Cargo.toml b/screenpipe-app-tauri/src-tauri/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "screenpipe-app"
-version = "0.2.38"
+version = "0.2.39"
 description = ""
 authors = ["you"]
 license = ""

diff --git a/screenpipe-audio/src/stt.rs b/screenpipe-audio/src/stt.rs
@@ -434,7 +434,12 @@ fn get_deepgram_api_key() -> String {
 }
 
 // TODO: this should use async reqwest not blocking, cause crash issue because all our code is async
-fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32], device: &str) -> Result<String> {
+fn transcribe_with_deepgram(
+    api_key: &str,
+    audio_data: &[f32],
+    device: &str,
+    sample_rate: u32,
+) -> Result<String> {
     debug!("starting deepgram transcription");
     let client = Client::new();
 
@@ -443,7 +448,7 @@ fn transcribe_with_deepgram(api_key: &str, audio_data: &[f32], device: &str) ->
     {
         let spec = WavSpec {
             channels: 1,
-            sample_rate: 32000,
+            sample_rate: sample_rate / 3, // empirically, 96 kHz devices need 32 kHz here and 48 kHz devices need 16 kHz (be mindful of resampling)
             bits_per_sample: 32,
             sample_format: hound::SampleFormat::Float,
         };
@@ -605,7 +610,12 @@ pub fn stt(
                 audio_input.device,
                 &api_key[..8]
             );
-            match transcribe_with_deepgram(&api_key, &speech_frames, &audio_input.device) {
+            match transcribe_with_deepgram(
+                &api_key,
+                &speech_frames,
+                &audio_input.device,
+                audio_input.sample_rate,
+            ) {
                 Ok(transcription) => Ok(transcription),
                 Err(e) => {
                     error!(

diff --git a/screenpipe-server/src/core.rs b/screenpipe-server/src/core.rs
@@ -351,7 +351,10 @@ async fn record_audio(
         });
 
         while let Ok(transcription) = whisper_receiver.try_recv() {
-            info!("Received transcription");
+            info!(
+                "device {} received transcription {:?}",
+                transcription.input.device, transcription.transcription
+            );
             // avoiding crashing the audio processing if one fails
             if let Err(e) = process_audio_result(
                 &db,
@@ -386,7 +389,10 @@ async fn process_audio_result(
     let transcription = result.transcription.unwrap();
     let transcription_engine = audio_transcription_engine.to_string();
 
-    info!("Inserting audio chunk: {:?}", result.path);
+    info!(
+        "device {} inserting audio chunk: {:?}",
+        result.input.device, result.path
+    );
     match db.insert_audio_chunk(&result.path).await {
         Ok(audio_chunk_id) => {
             if transcription.is_empty() {