Wordcab · chainyo · Jun 30, 2023 · Jun 29, 2023 · Jun 29, 2023 · Jun 30, 2023
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -43,6 +43,7 @@ def test_audio_request() -> None:
     assert request.source_lang == "en"
     assert request.timestamps == "s"
     assert request.use_batch is False
+    assert request.vocab == []
     assert request.word_timestamps is False
 
 
@@ -56,6 +57,7 @@ def test_audio_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=False,
     )
     assert response.utterances == []
@@ -65,6 +67,7 @@ def test_audio_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is False
 
     response = AudioResponse(
@@ -78,6 +81,7 @@ def test_audio_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=True,
     )
     assert response.utterances == [
@@ -90,6 +94,7 @@ def test_audio_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is True
 
 
@@ -135,6 +140,7 @@ def test_base_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=False,
     )
     assert response.utterances == [
@@ -146,6 +152,7 @@ def test_base_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is False
 
 
@@ -157,7 +164,7 @@ def test_cortex_error() -> None:
     assert error.message == "This is a test error"
 
 
-def test_corxet_payload() -> None:
+def test_cortex_payload() -> None:
     """Test the CortexPayload model."""
     payload = CortexPayload(
         url_type="youtube",
@@ -182,6 +189,7 @@ def test_corxet_payload() -> None:
     assert payload.source_lang == "en"
     assert payload.timestamps == "s"
     assert payload.use_batch is False
+    assert payload.vocab == []
     assert payload.word_timestamps is False
     assert payload.job_name == "test_job"
     assert payload.ping is False
@@ -199,6 +207,7 @@ def test_cortex_url_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=False,
         dual_channel=False,
         job_name="test_job",
@@ -213,6 +222,7 @@ def test_cortex_url_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is False
     assert response.dual_channel is False
     assert response.job_name == "test_job"
@@ -231,6 +241,7 @@ def test_cortex_youtube_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=False,
         video_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
         job_name="test_job",
@@ -245,6 +256,7 @@ def test_cortex_youtube_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is False
     assert response.video_url == "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
     assert response.job_name == "test_job"
@@ -263,6 +275,7 @@ def test_youtube_response() -> None:
         source_lang="en",
         timestamps="s",
         use_batch=False,
+        vocab=["custom company", "custom product"],
         word_timestamps=False,
         video_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
     )
@@ -275,5 +288,6 @@ def test_youtube_response() -> None:
     assert response.source_lang == "en"
     assert response.timestamps == "s"
     assert response.use_batch is False
+    assert response.vocab == ["custom company", "custom product"]
     assert response.word_timestamps is False
     assert response.video_url == "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
diff --git a/wordcab_transcribe/models.py b/wordcab_transcribe/models.py
@@ -27,6 +27,7 @@ class BaseResponse(BaseModel):
     source_lang: str
     timestamps: str
     use_batch: bool
+    vocab: List[str]
     word_timestamps: bool
 
 
@@ -59,6 +60,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "dual_channel": False,
             }
@@ -94,6 +100,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "video_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
             }
@@ -127,6 +138,7 @@ class CortexPayload(BaseModel):
     source_lang: Optional[str] = "en"
     timestamps: Optional[str] = "s"
     use_batch: Optional[bool] = False
+    vocab: Optional[List[str]] = []
     word_timestamps: Optional[bool] = False
     job_name: Optional[str] = None
     ping: Optional[bool] = False
@@ -159,6 +171,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "job_name": "job_abc123",
                 "ping": False,
@@ -196,6 +213,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "dual_channel": False,
                 "job_name": "job_name",
@@ -234,6 +256,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "video_url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
                 "job_name": "job_name",
@@ -250,6 +277,7 @@ class BaseRequest(BaseModel):
     source_lang: str = "en"
     timestamps: str = "s"
     use_batch: bool = False
+    vocab: List[str] = []
     word_timestamps: bool = False
 
     @validator("timestamps")
@@ -259,6 +287,15 @@ def validate_timestamps_values(cls, value: str) -> str:  # noqa: B902, N805
             raise ValueError("timestamps must be one of 'hms', 'ms', 's'.")
         return value
 
+    @validator("vocab")
+    def validate_each_vocab_value(
+        cls, value: List[str]  # noqa: B902, N805
+    ) -> List[str]:
+        """Check that every vocab entry is a string and return the list unchanged."""
+        if not all(isinstance(v, str) for v in value):
+            raise ValueError("vocab must be a list of strings.")
+        return value
+
     class Config:
         """Pydantic config class."""
 
@@ -269,6 +306,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
             }
         }
@@ -289,6 +331,11 @@ class Config:
                 "source_lang": "en",
                 "timestamps": "s",
                 "use_batch": False,
+                "vocab": [
+                    "custom company name",
+                    "custom product name",
+                    "custom co-worker name",
+                ],
                 "word_timestamps": False,
                 "dual_channel": False,
             }

diff --git a/wordcab_transcribe/router/v1/audio_file_endpoint.py b/wordcab_transcribe/router/v1/audio_file_endpoint.py
@@ -14,7 +14,7 @@
 """Audio file endpoint for the Wordcab Transcribe API."""
 
 import asyncio
-from typing import Union
+from typing import List, Union
 
 import shortuuid
 from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile
@@ -45,6 +45,7 @@ async def inference_with_audio(
     source_lang: str = Form("en"),  # noqa: B008
     timestamps: str = Form("s"),  # noqa: B008
     use_batch: bool = Form(False),  # noqa: B008
+    vocab: List[str] = Form([]),  # noqa: B008
     word_timestamps: bool = Form(False),  # noqa: B008
     file: UploadFile = File(...),  # noqa: B008
 ) -> AudioResponse:
@@ -64,6 +65,7 @@ async def inference_with_audio(
         source_lang=source_lang,
         timestamps=timestamps,
         use_batch=use_batch,
+        vocab=vocab,
         word_timestamps=word_timestamps,
         dual_channel=dual_channel,
     )
@@ -91,6 +93,7 @@ async def inference_with_audio(
             source_lang=data.source_lang,
             timestamps_format=data.timestamps,
             use_batch=data.use_batch,
+            vocab=data.vocab,
             word_timestamps=data.word_timestamps,
         )
     )
@@ -113,5 +116,6 @@ async def inference_with_audio(
             source_lang=data.source_lang,
             timestamps=data.timestamps,
             use_batch=data.use_batch,
+            vocab=data.vocab,
             word_timestamps=data.word_timestamps,
         )
diff --git a/wordcab_transcribe/router/v1/audio_url_endpoint.py b/wordcab_transcribe/router/v1/audio_url_endpoint.py
@@ -70,6 +70,7 @@ async def inference_with_audio_url(
             source_lang=data.source_lang,
             timestamps_format=data.timestamps,
             use_batch=data.use_batch,
+            vocab=data.vocab,
             word_timestamps=data.word_timestamps,
         )
     )
@@ -92,5 +93,6 @@ async def inference_with_audio_url(
             source_lang=data.source_lang,
             timestamps=data.timestamps,
             use_batch=data.use_batch,
+            vocab=data.vocab,
             word_timestamps=data.word_timestamps,
         )
diff --git a/wordcab_transcribe/router/v1/cortex_endpoint.py b/wordcab_transcribe/router/v1/cortex_endpoint.py
@@ -71,6 +71,7 @@ async def run_cortex(
                 source_lang=payload.source_lang,
                 timestamps=payload.timestamps,
                 use_batch=payload.use_batch,
+                vocab=payload.vocab,
                 word_timestamps=payload.word_timestamps,
             )
             utterances: AudioResponse = await inference_with_audio_url(
@@ -86,6 +87,7 @@ async def run_cortex(
                 source_lang=payload.source_lang,
                 timestamps=payload.timestamps,
                 use_batch=payload.use_batch,
+                vocab=payload.vocab,
                 word_timestamps=payload.word_timestamps,
             )
             utterances: YouTubeResponse = await inference_with_youtube(

diff --git a/wordcab_transcribe/services/asr_service.py b/wordcab_transcribe/services/asr_service.py
@@ -172,6 +172,7 @@ async def process_input(
         source_lang: str,
         timestamps_format: str,
         use_batch: bool,
+        vocab: List[str],
         word_timestamps: bool,
     ) -> Union[List[dict], Exception]:
         """Process the input request and return the results.
@@ -190,6 +191,7 @@ async def process_input(
             source_lang (str): Source language of the audio file.
             timestamps_format (str): Timestamps format to use.
             use_batch (bool): Whether to use batch processing or not.
+            vocab (List[str]): Custom vocabulary words forwarded to the transcription step.
             word_timestamps (bool): Whether to return word timestamps or not.
 
         Returns:
@@ -203,8 +205,8 @@ async def process_input(
             "source_lang": source_lang,
             "timestamps_format": timestamps_format,
             "use_batch": use_batch,
+            "vocab": vocab,
             "word_timestamps": word_timestamps,
-            "post_processed": False,
             "transcription_result": None,
             "transcription_done": asyncio.Event(),
             "diarization_result": None,
@@ -324,6 +326,7 @@ def process_transcription(
             task["input"],
             source_lang=task["source_lang"],
             suppress_blank=False,
+            vocab=None if task["vocab"] == [] else task["vocab"],
             word_timestamps=True,
             vad_service=self.services["vad"] if task["dual_channel"] else None,
             use_batch=task["use_batch"],

diff --git a/wordcab_transcribe/services/transcribe_service.py b/wordcab_transcribe/services/transcribe_service.py
@@ -322,6 +322,7 @@ def __call__(
         audio: Union[str, torch.Tensor, Tuple[str, str]],
         source_lang: str,
         suppress_blank: bool = False,
+        vocab: Optional[List[str]] = None,
         word_timestamps: bool = True,
         vad_service: Optional[VadService] = None,
         use_batch: bool = True,
@@ -335,6 +336,7 @@ def __call__(
                 audio files.
             source_lang (str): Language of the audio file.
             suppress_blank (bool): Whether to suppress blank at the beginning of the sampling.
+            vocab (Optional[List[str]]): Words joined into the initial prompt for non-batch transcription; unused if None or empty.
             word_timestamps (bool): Whether to return word timestamps.
             vad_service (Optional[VADService]): VADService to use for voice activity detection in the dual_channel case.
             use_batch (bool): Whether to use batch inference.
@@ -365,9 +367,11 @@ def __call__(
             self.loaded_model_lang = "multi"
 
         if not use_batch:
+            prompt = ", ".join(vocab) if vocab else None
             segments, _ = self.model.transcribe(
                 audio,
                 language=source_lang,
+                initial_prompt=prompt,
                 suppress_blank=False,
                 word_timestamps=True,
             )