
Merge pull request #24 from uezo/support-custom-listeners
Support Azure Speech Services
uezo authored Feb 27, 2024
2 parents bd80134 + 9fbd00c commit eb6e7b5
Showing 10 changed files with 420 additions and 20 deletions.
80 changes: 79 additions & 1 deletion README.md
@@ -14,7 +14,7 @@
# 🍩 Requirements

- VOICEVOX API in your computer or network reachable machine (Text-to-Speech)
- API key for Google Speech Services (Speech-to-Text)
- API key for Speech Services of Google or Azure (Speech-to-Text)
- API key for OpenAI API (ChatGPT)
- Python 3.10 (Runtime)

@@ -185,6 +185,84 @@ Launch VRChat as desktop mode on the machine that runs `run.py` and log in with
That's all! Let's chat with the AIAvatar. Log in to VRChat on another machine (or Quest) and go to the world the AIAvatar is in.
# 🟦 Use Azure Listeners
We strongly recommend using AzureWakewordListener and AzureRequestListener, which are more stable than the default listeners. Check [examples/run_azure.py](https://github.com/uezo/aiavatarkit/blob/main/examples/run_azure.py), which works out of the box.
Install the Azure Speech SDK.
```sh
$ pip install azure-cognitiveservices-speech
```
Change the script to use AzureRequestListener and AzureWakewordListener.
```python
YOUR_SUBSCRIPTION_KEY = "YOUR_SUBSCRIPTION_KEY"
YOUR_REGION_NAME = "japanwest"

# Create AzureRequestListener
from aiavatar.listeners.azurevoicerequest import AzureVoiceRequestListener
request_listener = AzureVoiceRequestListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
)

# Create AIAvatar with AzureRequestListener
app = AIAvatar(
    openai_api_key=OPENAI_API_KEY,
    system_message_content=system_message_content,
    request_listener=request_listener,
    voicevox_url=VV_URL,
    voicevox_speaker_id=VV_SPEAKER,
)

# Create AzureWakewordListener
async def on_wakeword(text):
    logger.info(f"Wakeword: {text}")
    await app.start_chat()

from aiavatar.listeners.azurewakeword import AzureWakewordListener
wakeword_listener = AzureWakewordListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    on_wakeword=on_wakeword,
    wakewords=["こんにちは"]
)
```
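After creating the listeners, start the wakeword listener and keep the main thread alive. The sketch below assumes only that `start()` returns the daemon thread running continuous recognition, as `aiavatar/listeners/azurewakeword.py` later in this diff shows; the complete, working wiring is in examples/run_azure.py.

```python
# Start listening for the wakeword; start() returns the daemon thread
# that runs continuous recognition, so join it to keep the process alive.
ww_thread = wakeword_listener.start()
ww_thread.join()
```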
To specify the microphone device, set the `device_name` argument.
See Microsoft Learn for how to check the device UID on each platform:
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/how-to-select-audio-input-devices
We provide [a script for macOS](https://github.com/uezo/aiavatarkit/blob/main/examples/audio_device_checker/main.m). Just run it in Xcode.
```
Device UID: BuiltInMicrophoneDevice, Name: MacBook Proのマイク
Device UID: com.vbaudio.vbcableA:XXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, Name: VB-Cable A
Device UID: com.vbaudio.vbcableB:XXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX, Name: VB-Cable B
```
For example, the UID of the built-in microphone on macOS is `BuiltInMicrophoneDevice`.
Set it as the value of `device_name`.
```python
request_listener = AzureVoiceRequestListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    device_name="BuiltInMicrophoneDevice"
)

wakeword_listener = AzureWakewordListener(
    YOUR_SUBSCRIPTION_KEY,
    YOUR_REGION_NAME,
    on_wakeword=on_wakeword,
    wakewords=["こんにちは"],
    device_name="BuiltInMicrophoneDevice"
)
```
# ⚡️ Function Calling
2 changes: 2 additions & 0 deletions aiavatar/__init__.py
@@ -3,6 +3,8 @@
# Processor
from .processors.chatgpt import ChatGPTProcessor
# Listener
from .listeners import WakewordListenerBase
from .listeners import RequestListenerBase
from .listeners.wakeword import WakewordListener
from .listeners.voicerequest import VoiceRequestListener
# Avatar
40 changes: 25 additions & 15 deletions aiavatar/bot.py
@@ -7,6 +7,7 @@
# Processor
from .processors.chatgpt import ChatGPTProcessor
# Listener
from .listeners import RequestListenerBase
from .listeners.voicerequest import VoiceRequestListener
# Avatar
from .speech.voicevox import VoicevoxSpeechController
@@ -17,31 +18,33 @@
class AIAvatar:
    def __init__(
        self,
        google_api_key: str,
        *,
        # AI
        openai_api_key: str,
        voicevox_url: str,
        voicevox_speaker_id: int=46,
        volume_threshold: int=3000,
        start_voice: str="どうしたの",
        model: str="gpt-3.5-turbo",
        functions: dict=None,
        system_message_content: str=None,
        # Speech-to-Text
        google_api_key: str=None,
        volume_threshold: int=3000,
        request_listener: RequestListenerBase=None,
        # Text-to-Speech
        voicevox_url: str,
        voicevox_speaker_id: int=46,
        # Audio device
        input_device: int=-1,
        output_device: int=-1,
        # Avatar
        animation_controller: AnimationController=None,
        face_controller: FaceController=None,
        avatar_request_parser: Callable=None,
        input_device: int=-1,
        output_device: int=-1
        # Chat
        start_voice: str="どうしたの",
    ):

        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.google_api_key = google_api_key
        self.openai_api_key = openai_api_key
        self.voicevox_url = voicevox_url
        self.voicevox_speaker_id = voicevox_speaker_id
        self.volume_threshold = volume_threshold

        # Audio Devices
        if isinstance(input_device, int):
            if input_device < 0:
@@ -74,13 +77,20 @@ def __init__(
        self.logger.info(f"Output device: [{output_device}] {output_device_info['name']}")

        # Processor
        self.openai_api_key = openai_api_key
        self.chat_processor = ChatGPTProcessor(api_key=self.openai_api_key, model=model, functions=functions, system_message_content=system_message_content)

        # Listeners
        self.request_listener = VoiceRequestListener(self.google_api_key, volume_threshold=volume_threshold, device_index=self.input_device)
        self.google_api_key = google_api_key
        self.volume_threshold = volume_threshold
        self.request_listener = request_listener or VoiceRequestListener(self.google_api_key, volume_threshold=volume_threshold, device_index=self.input_device)

        # Avatar
        # Speech
        self.voicevox_url = voicevox_url
        self.voicevox_speaker_id = voicevox_speaker_id
        speech_controller = VoicevoxSpeechController(self.voicevox_url, self.voicevox_speaker_id, device_index=self.output_device)

        # Avatar
        animation_controller = animation_controller or AnimationControllerDummy()
        face_controller = face_controller or FaceControllerDummy()
        self.avatar_controller = AvatarController(speech_controller, animation_controller, face_controller, avatar_request_parser)
14 changes: 14 additions & 0 deletions aiavatar/listeners/__init__.py
@@ -1,3 +1,4 @@
from abc import ABC, abstractmethod
import base64
from logging import getLogger, NullHandler
import numpy
@@ -7,6 +8,19 @@
import aiohttp
import sounddevice


class RequestListenerBase(ABC):
    @abstractmethod
    async def get_request(self):
        ...


class WakewordListenerBase(ABC):
    @abstractmethod
    def start(self):
        ...


class SpeechListenerBase:
    def __init__(self, api_key: str, on_speech_recognized: Callable, volume_threshold: int=3000, timeout: float=1.0, detection_timeout: float=0.0, min_duration: float=0.3, max_duration: float=20.0, lang: str="ja-JP", rate: int=44100, channels: int=1, device_index: int=-1):
        self.logger = getLogger(__name__)
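The new abstract base classes above are what enable the custom listeners this PR is named after: a request listener only has to implement `async get_request()` and return the recognized text, which AIAvatar now accepts via its `request_listener` argument. As a hypothetical illustration (not part of this commit), a console-based listener could look like this:

```python
# Hypothetical example: a console-based request listener built on the new
# RequestListenerBase contract. For illustration only.
import asyncio
from aiavatar.listeners import RequestListenerBase

class ConsoleRequestListener(RequestListenerBase):
    async def get_request(self):
        # Read a line from stdin in a worker thread so the event loop stays free.
        return await asyncio.to_thread(input, "You> ")

# Pass it to AIAvatar via the new request_listener argument:
# app = AIAvatar(openai_api_key=..., voicevox_url=..., request_listener=ConsoleRequestListener())
```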
39 changes: 39 additions & 0 deletions aiavatar/listeners/azurevoicerequest.py
@@ -0,0 +1,39 @@
# pip install azure-cognitiveservices-speech
from logging import getLogger, NullHandler
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech import PropertyId
from . import RequestListenerBase

class AzureVoiceRequestListener(RequestListenerBase):
    def __init__(self, api_key: str, region: str, timeout: float=0.5, detection_timeout: float=10.0, lang: str="ja-JP", device_name: str=None):
        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.speech_config = speechsdk.SpeechConfig(subscription=api_key, region=region)
        self.speech_config.set_property(PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, str(detection_timeout * 1000))
        self.speech_config.set_property(PropertyId.Speech_SegmentationSilenceTimeoutMs, str(timeout * 1000))

        if device_name:
            # NOTE: You can see the way to check the device_name at Microsoft Learn.
            # https://learn.microsoft.com/ja-jp/azure/ai-services/speech-service/how-to-select-audio-input-devices
            self.audio_config = speechsdk.AudioConfig(device_name=device_name)
        else:
            self.audio_config = speechsdk.AudioConfig(use_default_microphone=True)

        self.speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config, audio_config=self.audio_config, language=lang)

        self.on_start_listening = None

    async def get_request(self):
        if self.on_start_listening:
            await self.on_start_listening()

        self.logger.info(f"Listening... ({self.__class__.__name__})")
        result = self.speech_recognizer.recognize_once()

        if result.text:
            self.logger.info(f"AzureVoiceRequestListener: {result.text}")
        else:
            self.logger.info(f"AzureVoiceRequestListener: No speech recognized.")

        return result.text
47 changes: 47 additions & 0 deletions aiavatar/listeners/azurewakeword.py
@@ -0,0 +1,47 @@
# pip install azure-cognitiveservices-speech
import asyncio
from logging import getLogger, NullHandler
from threading import Thread
from typing import Callable
import azure.cognitiveservices.speech as speechsdk

class AzureWakewordListener:
    def __init__(self, api_key: str, region: str, wakewords: list, on_wakeword: Callable, lang: str="ja-JP", device_name: str=None, verbose: bool=False):
        self.logger = getLogger(__name__)
        self.logger.addHandler(NullHandler())

        self.speech_config = speechsdk.SpeechConfig(subscription=api_key, region=region)

        if device_name:
            # NOTE: You can see the way to check the device_name at Microsoft Learn.
            # https://learn.microsoft.com/ja-jp/azure/ai-services/speech-service/how-to-select-audio-input-devices
            self.audio_config = speechsdk.AudioConfig(device_name=device_name)
        else:
            self.audio_config = speechsdk.AudioConfig(use_default_microphone=True)

        self.speech_recognizer = speechsdk.SpeechRecognizer(speech_config=self.speech_config, audio_config=self.audio_config, language=lang)
        self.speech_recognizer.recognized.connect(lambda evt: self.on_recognized(evt))

        self.wakewords = wakewords
        self.on_wakeword = on_wakeword
        self.verbose = verbose

    def on_recognized(self, evt):
        recognized_text = evt.result.text.replace("。", "").replace("、", "").replace("!", "").replace("!", "").replace("?", "").replace("?", "").strip()

        if self.verbose:
            self.logger.info(f"AzureWakeWordListener: {recognized_text}")

        if recognized_text in self.wakewords:
            asyncio.run(self.on_wakeword(recognized_text))

    async def start_listening(self):
        self.logger.info(f"Listening... ({self.__class__.__name__})")
        self.speech_recognizer.start_continuous_recognition()
        while True:
            await asyncio.sleep(0.1)

    def start(self):
        th = Thread(target=asyncio.run, args=(self.start_listening(),), daemon=True)
        th.start()
        return th
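Similarly, a custom wakeword listener only needs to satisfy the `WakewordListenerBase.start()` contract and invoke its `on_wakeword` callback. The following keyboard-triggered sketch is hypothetical (not part of this commit) and simply mirrors the threading pattern of AzureWakewordListener above:

```python
# Hypothetical example: wake the avatar when the user presses Enter.
# Mirrors AzureWakewordListener: start() launches a daemon thread and returns it.
import asyncio
from threading import Thread
from typing import Callable
from aiavatar.listeners import WakewordListenerBase

class EnterKeyWakewordListener(WakewordListenerBase):
    def __init__(self, on_wakeword: Callable):
        self.on_wakeword = on_wakeword  # async callback, e.g. app.start_chat

    def start(self):
        def run():
            while True:
                input("Press Enter to wake the avatar > ")
                asyncio.run(self.on_wakeword("enter"))
        th = Thread(target=run, daemon=True)
        th.start()
        return th
```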
4 changes: 2 additions & 2 deletions aiavatar/listeners/voicerequest.py
@@ -1,6 +1,6 @@
from . import SpeechListenerBase
from . import RequestListenerBase, SpeechListenerBase

class VoiceRequestListener(SpeechListenerBase):
class VoiceRequestListener(RequestListenerBase, SpeechListenerBase):
    def __init__(self, api_key: str, volume_threshold: int=3000, timeout: float=1.0, detection_timeout: float=10.0, min_duration: float=0.3, max_duration: float=20.0, lang: str="ja-JP", rate: int=44100, channels: int=1, device_index: int=-1):
        super().__init__(api_key, self.on_request, volume_threshold, timeout, detection_timeout, min_duration, max_duration, lang, rate, channels, device_index)
        self.last_recognized_text = None
4 changes: 2 additions & 2 deletions aiavatar/listeners/wakeword.py
@@ -1,9 +1,9 @@
import asyncio
from threading import Thread
from typing import Callable
from . import SpeechListenerBase
from . import WakewordListenerBase, SpeechListenerBase

class WakewordListener(SpeechListenerBase):
class WakewordListener(WakewordListenerBase, SpeechListenerBase):
    def __init__(self, api_key: str, wakewords: list, on_wakeword: Callable, volume_threshold: int=3000, timeout: float=0.3, min_duration: float=0.2, max_duration: float=2, lang: str="ja-JP", rate: int=44100, chennels: int=1, device_index: int=-1, verbose: bool=False):
        super().__init__(api_key, self.invoke_on_wakeword, volume_threshold, timeout, 0.0, min_duration, max_duration, lang, rate, chennels, device_index)
        self.wakewords = wakewords
