
Merge pull request #24 from uezo/support-custom-listeners
Support Azure Speech Services
uezo authored Feb 27, 2024
2 parents bd80134 + 9fbd00c commit eb6e7b5
Showing 10 changed files with 420 additions and 20 deletions.
80 changes: 79 additions & 1 deletion README.md
@@ -14,7 +14,7 @@
# 🍩 Requirements

- VOICEVOX API in your computer or network reachable machine (Text-to-Speech)
- API key for Google Speech Services (Speech-to-Text)
- API key for Speech Services of Google or Azure (Speech-to-Text)
- API key for OpenAI API (ChatGPT)
- Python 3.10 (Runtime)

@@ -185,6 +185,84 @@ Launch VRChat as desktop mode on the machine that runs `run.py` and log in with
That's all! Let's chat with the AIAvatar. Log in to VRChat on another machine (or Quest) and go to the world the AIAvatar is in.
# 🟦 Use Azure Listeners
We strongly recommend using AzureWakewordListener and AzureRequestListener, which are more stable than the default listeners. Check [examples/run_azure.py](https://github.com/uezo/aiavatarkit/blob/main/examples/run_azure.py), which works out of the box.
Install the Azure Speech SDK.
```sh
$ pip install azure-cognitiveservices-speech
```
Change the script to use AzureRequestListener and AzureWakewordListener.
```python
YOUR_SUBSCRIPTION_KEY = "YOUR_SUBSCRIPTION_KEY"
YOUR_REGION_NAME = "japanwest"

# Create AzureRequestListener
from aiavatar.listeners.azurevoicerequest import AzureVoiceRequestListener
request_listener = AzureVoiceRequestListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
)

# Create AIAvatar with AzureRequestListener
app = AIAvatar(
    openai_api_key=OPENAI_API_KEY,
    system_message_content=system_message_content,
    request_listener=request_listener,
    voicevox_url=VV_URL,
    voicevox_speaker_id=VV_SPEAKER,
)

# Create AzureWakewordListener
async def on_wakeword(text):
    logger.info(f"Wakeword: {text}")
    await app.start_chat()

from aiavatar.listeners.azurewakeword import AzureWakewordListener
wakeword_listener = AzureWakewordListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    on_wakeword=on_wakeword,
    wakewords=["こんにちは"]
)
```
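After creating the listeners, start the wakeword listener and keep the main thread alive. The sketch below assumes only that `start()` returns the daemon thread running continuous recognition, as `aiavatar/listeners/azurewakeword.py` later in this diff shows; the complete, working wiring is in examples/run_azure.py.

```python
# Start listening for the wakeword; start() returns the daemon thread
# that runs continuous recognition, so join it to keep the process alive.
ww_thread = wakeword_listener.start()
ww_thread.join()
```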
To specify the microphone device, set the `device_name` argument.
See Microsoft Learn for how to check the device UID on each platform:
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-select-audio-input-devices
We provide [a script for macOS](https://github.com/uezo/aiavatarkit/blob/main/examples/audio_device_checker/main.m). Just run it in Xcode.
```
Device UID: BuiltInMicrophoneDevice, Name: MacBook Proのマイク
Device UID: com.vbaudio.vbcableA:XXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, Name: VB-Cable A
Device UID: com.vbaudio.vbcableB:XXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, Name: VB-Cable B
```
For example, the UID of the built-in microphone on macOS is `BuiltInMicrophoneDevice`.
Set it as the value of `device_name`.
```python
request_listener = AzureVoiceRequestListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    device_name="BuiltInMicrophoneDevice"
)

wakeword_listener = AzureWakewordListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    on_wakeword=on_wakeword,
    wakewords=["こんにちは"],
    device_name="BuiltInMicrophoneDevice"
)
```
# ⚡️ Function Calling
2 changes: 2 additions & 0 deletions aiavatar/__init__.py
@@ -3,6 +3,8 @@
# Processor
from .processors.chatgpt import ChatGPTProcessor
# Listener
from .listeners import WakewordListenerBase
from .listeners import RequestListenerBase
from .listeners.wakeword import WakewordListener
from .listeners.voicerequest import VoiceRequestListener
# Avatar
40 changes: 25 additions & 15 deletions aiavatar/bot.py
@@ -7,6 +7,7 @@
# Processor
from .processors.chatgpt import ChatGPTProcessor
# Listener
from .listeners import RequestListenerBase
from .listeners.voicerequest import VoiceRequestListener
# Avatar
from .speech.voicevox import VoicevoxSpeechController
@@ -17,31 +18,33 @@
class AIAvatar:
    def __init__(
        self,
        google_api_key: str,
        *,
        # AI
        openai_api_key: str,
        voicevox_url: str,
        voicevox_speaker_id: int=46,
        volume_threshold: int=3000,
        start_voice: str="どうしたの",
        model: str="gpt-3.5-turbo",
        functions: dict=None,
        system_message_content: str=None,
        # Speech-to-Text
        google_api_key: str=None,
        volume_threshold: int=3000,
        request_listener: RequestListenerBase=None,
        # Text-to-Speech
        voicevox_url: str,
        voicevox_speaker_id: int=46,
        # Audio device
        input_device: int=-1,
        output_device: int=-1,
        # Avatar
        animation_controller: AnimationController=None,
        face_controller: FaceController=None,
        avatar_request_parser: Callable=None,
        input_device: int=-1,
        output_device: int=-1
        # Chat
        start_voice: str="どうしたの",
    ):

        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.google_api_key = google_api_key
        self.openai_api_key = openai_api_key
        self.voicevox_url = voicevox_url
        self.voicevox_speaker_id = voicevox_speaker_id
        self.volume_threshold = volume_threshold

        # Audio Devices
        if isinstance(input_device, int):
            if input_device < 0:
@@ -74,13 +77,20 @@ def __init__(
        self.logger.info(f"Output device: [{output_device}] {output_device_info['name']}")

        # Processor
        self.openai_api_key = openai_api_key
        self.chat_processor = ChatGPTProcessor(api_key=self.openai_api_key, model=model, functions=functions, system_message_content=system_message_content)

        # Listeners
        self.request_listener = VoiceRequestListener(self.google_api_key, volume_threshold=volume_threshold, device_index=self.input_device)
        self.google_api_key = google_api_key
        self.volume_threshold = volume_threshold
        self.request_listener = request_listener or VoiceRequestListener(self.google_api_key, volume_threshold=volume_threshold, device_index=self.input_device)

        # Avatar
        # Speech
        self.voicevox_url = voicevox_url
        self.voicevox_speaker_id = voicevox_speaker_id
        speech_controller = VoicevoxSpeechController(self.voicevox_url, self.voicevox_speaker_id, device_index=self.output_device)

        # Avatar
        animation_controller = animation_controller or AnimationControllerDummy()
        face_controller = face_controller or FaceControllerDummy()
        self.avatar_controller = AvatarController(speech_controller, animation_controller, face_controller, avatar_request_parser)
14 changes: 14 additions & 0 deletions aiavatar/listeners/__init__.py
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
import base64
from logging import getLogger, NullHandler
import numpy
@@ -7,6 +8,19 @@
import aiohttp
import sounddevice


class RequestListenerBase(ABC):
    @abstractmethod
    async def get_request(self):
        ...


class WakewordListenerBase(ABC):
    @abstractmethod
    def start(self):
        ...


class SpeechListenerBase:
    def __init__(self, api_key: str, on_speech_recognized: Callable, volume_threshold: int=3000, timeout: float=1.0, detection_timeout: float=0.0, min_duration: float=0.3, max_duration: float=20.0, lang: str="ja-JP", rate: int=44100, channels: int=1, device_index: int=-1):
        self.logger = getLogger(__name__)
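The new abstract base classes above are what enable the custom listeners this PR is named after: a request listener only has to implement `async get_request()` and return the recognized text, which AIAvatar now accepts via its `request_listener` argument. As a hypothetical illustration (not part of this commit), a console-based listener could look like this:

```python
# Hypothetical example: a console-based request listener built on the new
# RequestListenerBase contract. For illustration only.
import asyncio
from aiavatar.listeners import RequestListenerBase

class ConsoleRequestListener(RequestListenerBase):
    async def get_request(self):
        # Read a line from stdin in a worker thread so the event loop stays free.
        return await asyncio.to_thread(input, "You> ")

# Pass it to AIAvatar via the new request_listener argument:
# app = AIAvatar(openai_api_key=..., voicevox_url=..., request_listener=ConsoleRequestListener())
```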
39 changes: 39 additions & 0 deletions aiavatar/listeners/azurevoicerequest.py
@@ -0,0 +1,39 @@
# pip install azure-cognitiveservices-speech
from logging import getLogger, NullHandler
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech import PropertyId
from . import RequestListenerBase

class AzureVoiceRequestListener(RequestListenerBase):
    def __init__(self, api_key: str, region: str, timeout: float=0.5, detection_timeout: float=10.0, lang: str="ja-JP", device_name: str=None):
        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.speech_config = speechsdk.SpeechConfig(subscription=api_key, region=region)
        self.speech_config.set_property(PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, str(detection_timeout * 1000))
        self.speech_config.set_property(PropertyId.Speech_SegmentationSilenceTimeoutMs, str(timeout * 1000))

        if device_name:
            # NOTE: You can see the way to check the device_name at Microsoft Learn.
            # https://learn.microsoft.com/ja-jp/azure/ai-services/speech-service/how-to-select-audio-input-devices
            self.audio_config = speechsdk.AudioConfig(device_name=device_name)
        else:
            self.audio_config = speechsdk.AudioConfig(use_default_microphone=True)

        self.speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config, audio_config=self.audio_config, language=lang)

        self.on_start_listening = None

    async def get_request(self):
        if self.on_start_listening:
            await self.on_start_listening()

        self.logger.info(f"Listening... ({self.__class__.__name__})")
        result = self.speech_recognizer.recognize_once()

        if result.text:
            self.logger.info(f"AzureVoiceRequestListener: {result.text}")
        else:
            self.logger.info(f"AzureVoiceRequestListener: No speech recognized.")

        return result.text
47 changes: 47 additions & 0 deletions aiavatar/listeners/azurewakeword.py
@@ -0,0 +1,47 @@
# pip install azure-cognitiveservices-speech
import asyncio
from logging import getLogger, NullHandler
from threading import Thread
from typing import Callable
import azure.cognitiveservices.speech as speechsdk

class AzureWakewordListener:
    def __init__(self, api_key: str, region: str, wakewords: list, on_wakeword: Callable, lang: str="ja-JP", device_name: str=None, verbose: bool=False):
        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.speech_config = speechsdk.SpeechConfig(subscription=api_key, region=region)

        if device_name:
            # NOTE: You can see the way to check the device_name at Microsoft Learn.
            # https://learn.microsoft.com/ja-jp/azure/ai-services/speech-service/how-to-select-audio-input-devices
            self.audio_config = speechsdk.AudioConfig(device_name=device_name)
        else:
            self.audio_config = speechsdk.AudioConfig(use_default_microphone=True)

        self.speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config, audio_config=self.audio_config, language=lang)
        self.speech_recognizer.recognized.connect(lambda evt: self.on_recognized(evt))

        self.wakewords = wakewords
        self.on_wakeword = on_wakeword
        self.verbose = verbose

    def on_recognized(self, evt):
        recognized_text = evt.result.text.replace("。", "").replace("、", "").replace("!", "").replace("!", "").replace("?", "").replace("?", "").strip()

        if self.verbose:
            self.logger.info(f"AzureWakeWordListener: {recognized_text}")

        if recognized_text in self.wakewords:
            asyncio.run(self.on_wakeword(recognized_text))

    async def start_listening(self):
        self.logger.info(f"Listening... ({self.__class__.__name__})")
        self.speech_recognizer.start_continuous_recognition()
        while True:
            await asyncio.sleep(0.1)

    def start(self):
        th = Thread(target=asyncio.run, args=(self.start_listening(),), daemon=True)
        th.start()
        return th
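Similarly, a custom wakeword listener only needs to satisfy the `WakewordListenerBase.start()` contract and invoke its `on_wakeword` callback. The following keyboard-triggered sketch is hypothetical (not part of this commit) and simply mirrors the threading pattern of AzureWakewordListener above:

```python
# Hypothetical example: wake the avatar when the user presses Enter.
# Mirrors AzureWakewordListener: start() launches a daemon thread and returns it.
import asyncio
from threading import Thread
from typing import Callable
from aiavatar.listeners import WakewordListenerBase

class EnterKeyWakewordListener(WakewordListenerBase):
    def __init__(self, on_wakeword: Callable):
        self.on_wakeword = on_wakeword  # async callback, e.g. app.start_chat

    def start(self):
        def run():
            while True:
                input("Press Enter to wake the avatar > ")
                asyncio.run(self.on_wakeword("enter"))
        th = Thread(target=run, daemon=True)
        th.start()
        return th
```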
4 changes: 2 additions & 2 deletions aiavatar/listeners/voicerequest.py
@@ -1,6 +1,6 @@
from . import SpeechListenerBase
from . import RequestListenerBase, SpeechListenerBase

class VoiceRequestListener(SpeechListenerBase):
class VoiceRequestListener(RequestListenerBase, SpeechListenerBase):
    def __init__(self, api_key: str, volume_threshold: int=3000, timeout: float=1.0, detection_timeout: float=10.0, min_duration: float=0.3, max_duration: float=20.0, lang: str="ja-JP", rate: int=44100, channels: int=1, device_index: int=-1):
        super().__init__(api_key, self.on_request, volume_threshold, timeout, detection_timeout, min_duration, max_duration, lang, rate, channels, device_index)
        self.last_recognized_text = None
4 changes: 2 additions & 2 deletions aiavatar/listeners/wakeword.py
@@ -1,9 +1,9 @@
import asyncio
from threading import Thread
from typing import Callable
from . import SpeechListenerBase
from . import WakewordListenerBase, SpeechListenerBase

class WakewordListener(SpeechListenerBase):
class WakewordListener(WakewordListenerBase, SpeechListenerBase):
    def __init__(self, api_key: str, wakewords: list, on_wakeword: Callable, volume_threshold: int=3000, timeout: float=0.3, min_duration: float=0.2, max_duration: float=2, lang: str="ja-JP", rate: int=44100, chennels: int=1, device_index: int=-1, verbose: bool=False):
        super().__init__(api_key, self.invoke_on_wakeword, volume_threshold, timeout, 0.0, min_duration, max_duration, lang, rate, chennels, device_index)
        self.wakewords = wakewords
