Skip to content

Commit

Permalink
replace speech-to-text with speech-to-intent
Browse files Browse the repository at this point in the history
  • Loading branch information
Jef808 committed Feb 25, 2024
1 parent ee0ec7c commit ed20c86
Show file tree
Hide file tree
Showing 7 changed files with 142 additions and 67 deletions.
16 changes: 12 additions & 4 deletions data/computer-commands.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
context:
expressions:
getScript:
- "@promptOpenAI (me) [a, an] ($programmingLanguage:language) script"
- "@promptVerb (me) [a, an] ($programmingLanguage:language) script"
answerQuestion:
- "@promptOpenAI (me) [an, the, the following] answer to [this, that, the,
- "@promptVerb (me) [an, the, the following] answer to [this, that, the,
the following] question"
- Answer (me) [this, the, the following, that] question
simplyTranscribe:
- Send (this, that, the following, what follows) to (the) keyboard
- Write [this, that, the following, what follows]
- Transcribe
focusWindow:
Expand All @@ -19,9 +20,16 @@ context:
setVolume:
- $volumeSetting:volumeSetting (the volume)
- (Set) (the) [volume, sound] (to) $pv.Percent:percentage
cancel:
- (no) wait a second
- (no) cancel
- (no) never mind
slots:
programmingLanguage:
- home slash j f a
- stump
- lisp
- e lisp
- c plus plus
- javascript
- emacs
- shell
Expand All @@ -45,7 +53,7 @@ context:
- unmute
- mute
macros:
promptOpenAI:
promptVerb:
- I need
- I want
- generate
Expand Down
24 changes: 15 additions & 9 deletions echo_crafter/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

def get_project_root() -> Path:
    """Get the root of the project.

    The ``EC_ROOT`` environment variable wins when it is set to a non-empty
    value; otherwise the root is the directory three levels above this
    file's resolved location.

    Returns:
        The project root directory as a ``Path``.
    """
    # Test the raw string rather than a Path: Path objects are always truthy
    # and Path('') is Path('.'), so `Path(env) or fallback` never falls back.
    env_root = os.environ.get('EC_ROOT', '')
    if env_root:
        return Path(env_root)
    return Path(__file__).resolve().parent.parent.parent


def get_picovoice_api_key() -> str:
Expand All @@ -23,13 +24,18 @@ def get_picovoice_api_key() -> str:
return api_key


def build_path(rel_path: str) -> str:
    """Join *rel_path* onto the project root and return the result as a string."""
    root = get_project_root()
    return str(root.joinpath(rel_path))


@dataclass(init=False, frozen=True)
class _Config(TypedDict):
"""Configuration for the echo-crafter package."""

PROJECT_ROOT: str
DATA_DIR: str
PYTHON_PACKAGES_DIR: str
PYTHON_PACKAGES: str

PICOVOICE_API_KEY: str
CHEETAH_MODEL_FILE: str
Expand All @@ -44,16 +50,16 @@ class _Config(TypedDict):

# Module-level configuration mapping; filesystem entries are built relative to the project root.
Config: _Config = {
"PROJECT_ROOT": str(get_project_root()),
"DATA_DIR": str(get_project_root()/"data"),
"PYTHON_PACKAGES_DIR": str(get_project_root()/".venv/lib/python3.11/site-packages/python_packages"),
"DATA_DIR": build_path("data"),
"PYTHON_PACKAGES": build_path(".venv/lib/python3.11/site-packages/python_packages"),

# NOTE(review): os.getenv returns None when the variable is unset, so str() stores the text 'None' here.
"PICOVOICE_API_KEY": str(os.getenv('PICOVOICE_API_KEY')),
"CHEETAH_MODEL_FILE": str(get_project_root()/"data/speech-command-cheetah-v2.pv"),
"RHINO_CONTEXT_FILE": str(get_project_root()/"data/computer-commands_en_linux_v3_0_0.rhn"),
"CHEETAH_MODEL_FILE": build_path("data/speech-command-cheetah-v2.pv"),
"RHINO_CONTEXT_FILE": build_path("data/computer-commands_en_linux_v3_0_0.rhn"),
"FRAME_LENGTH": 512,
"ENDPOINT_DURATION_SEC": 1.5,

"TRANSCRIPT_BEGIN_WAV": str(get_project_root()/"data/transcript_begin.wav"),
"TRANSCRIPT_SUCCESS_WAV": str(get_project_root()/"data/transcript_success.wav"),
"TRANSCRIPT_BEGIN_WAV": build_path("data/transcript_begin.wav"),
"TRANSCRIPT_SUCCESS_WAV": build_path("data/transcript_success.wav"),
# Overridable through the EC_SOCKET_FILE environment variable.
"SOCKET_PATH": str(Path(os.environ.get('EC_SOCKET_FILE', '/tmp/echo-crafter.sock')))
}
64 changes: 36 additions & 28 deletions echo_crafter/listener/listener_with_wake_word.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,75 @@
"""Listen for a wake word and transcribe speech until endpoint is detected."""

import json
import subprocess
import traceback
import socket
from contextlib import contextmanager
from typing import List

from echo_crafter.logger import setup_logger
from echo_crafter.config import Config

from echo_crafter.listener.utils import (
Intent,
microphone
)
from echo_crafter.logger import setup_logger
from echo_crafter.config import Config

def play_sound(wav_file):
logger = setup_logger(__name__)

def play_sound(wav_file) -> None:
"""Start playing the given WAV file with `aplay`, without waiting for playback to finish."""
subprocess.Popen(["aplay", "-q", wav_file])
subprocess.Popen(['aplay', wav_file])


def wake_word_callback():
def on_wake_word_detected() -> None:
    """Acknowledge a detected wake word by playing the transcript-begin sound."""
    begin_wav = Config['TRANSCRIPT_BEGIN_WAV']
    play_sound(begin_wav)


@contextmanager
def create_transcription_callback():
"""Connect to the transcription socket and send it all partial transcripts."""
def on_intent_inferred(intent_obj: Intent) -> None:
    """Report an inferred intent: log it, print it, then play the success sound."""
    # Compact JSON goes to the log, an indented copy goes to stdout.
    compact = json.dumps(intent_obj)
    logger.info("Intent inferred: %s", compact)
    pretty = json.dumps(intent_obj, indent=2)
    print(pretty)
    success_wav = Config['TRANSCRIPT_SUCCESS_WAV']
    play_sound(success_wav)

client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
client.connect(Config['SOCKET_PATH'])

def callback(partial_transcript):
"""Send the partial transcript to the active window."""
client.sendall((partial_transcript).encode())
partial_transcripts: List[str] = []

yield callback

finally:
client.close()
def on_partial_transcript(partial_transcript: str) -> None:
    """Buffer the partial transcript and type it into the active window.

    The text is appended to the module-level ``partial_transcripts`` buffer
    and handed to ``xdotool type``, which is started without waiting for it
    to finish.

    Args:
        partial_transcript: Newly transcribed text; may be empty.
    """
    # An empty fragment adds nothing to the buffer and types nothing, so do
    # not pay for a process spawn on it.
    if not partial_transcript:
        return
    partial_transcripts.append(partial_transcript)
    subprocess.Popen(
        ['xdotool', 'type', '--clearmodifiers', '--delay', '0', partial_transcript]
    )


def transcription_success_callback():
"""Play a ding sound to indicate that the final transcript was received."""
play_sound(Config['TRANSCRIPT_SUCCESS_WAV'])
def on_final_transcript() -> None:
    """Log the text assembled from the buffered partial transcripts and empty the buffer."""
    utterance = ''.join(partial_transcripts)
    # Empty the shared buffer in place so the next utterance starts fresh.
    del partial_transcripts[:]
    logger.info("Final transcript: %s", utterance)


def main():
    """Upon detection of a wake word, infer the spoken intent, then repeat.

    The loop runs until a keyboard interrupt or an unexpected error; in every
    case the microphone is told to stop recording before returning.
    """
    with microphone() as mic:
        try:
            while True:
                mic.wait_for_wake_word(on_wake_word_detected)
                mic.infer_intent(on_intent_inferred)
                # Free-form dictation is switched off in favour of intent inference:
                # mic.process_and_transmit_utterance(on_partial_transcript, on_final_transcript)

        except KeyboardInterrupt:
            # Expected way to quit; the `finally` clause stops the recorder.
            pass

        except Exception as e:
            # Top-level boundary: report the failure and its traceback, then exit.
            # The traceback is logged as message text so it does not depend on
            # how the log formatter treats exception info.
            logger.error("An error occurred: %s", e)
            logger.error(traceback.format_exc())

        finally:
            mic.set_is_recording(False)

if __name__ == '__main__':
main()
8 changes: 6 additions & 2 deletions echo_crafter/listener/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from .microphone import microphone
from .sockets import socket_connection
from .microphone import *
from .sockets import *
from .types import *

__all__ = [
'Intent',
'Slot',
'MicrophoneCallbacks',
'socket_connection',
'microphone'
]
50 changes: 30 additions & 20 deletions echo_crafter/listener/utils/microphone.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
from contextlib import contextmanager
from collections import deque
from pathlib import Path

import pvcheetah
import pvporcupine
Expand All @@ -17,7 +15,7 @@ def porcupine_context_manager():
try:
porcupine_instance = pvporcupine.create(
keywords=['computer'],
sensitivities=[0.1],
sensitivities=[0.5],
access_key=Config['PICOVOICE_API_KEY']
)
yield porcupine_instance
Expand Down Expand Up @@ -65,7 +63,8 @@ def rhino_context_manager():
try:
rhino_instance = pvrhino.create(
access_key=Config['PICOVOICE_API_KEY'],
context_path=Config['RHINO_CONTEXT_FILE']
context_path=Config['RHINO_CONTEXT_FILE'],
sensitivity=0.8
)
yield rhino_instance
finally:
Expand All @@ -81,7 +80,6 @@ def __init__(self, porcupine, rhino, cheetah, recorder):
self.rhino = rhino
self.cheetah = cheetah
self.recorder = recorder
self.is_listening = False
self.wake_word_frame = None


Expand All @@ -94,17 +92,16 @@ def get_next_frame(self):
return self.recorder.read()


def set_is_listening(self, is_listening):
"""Set the is_listening attribute of the recorder."""
self.is_listening = is_listening
if is_listening and not self.recorder.is_recording:
self.recorder.start()
elif not is_listening and self.recorder.is_recording:
def set_is_recording(self, is_recording):
    """Start or stop the recorder so that it matches the requested state.

    Nothing is called when the recorder is already in the requested state.
    """
    recorder = self.recorder
    if recorder.is_recording:
        if not is_recording:
            recorder.stop()
    elif is_recording:
        recorder.start()

def wait_for_wake_word(self, wake_word_callback=None):
"""Wait for the wake word to be detected."""
while True:
while self.recorder.is_recording:
pcm_frame = self.recorder.read()
keyword_index = self.porcupine.process(pcm_frame)
if keyword_index >= 0:
Expand All @@ -114,19 +111,32 @@ def wait_for_wake_word(self, wake_word_callback=None):
break


def process_and_transmit_utterance(self, transcription_callback, transcription_success_callback=None):
"""Process the utterance and transmit the partial transcript to the client."""
def infer_intent(self, on_intent_inferred):
    """Feed frames to Rhino until it finalizes, then pass the inference to the callback.

    The loop also ends, without calling the callback, as soon as the recorder
    is no longer recording. Rhino is reset before returning in both cases.
    """
    while self.recorder.is_recording:
        finalized = self.rhino.process(self.get_next_frame())
        if not finalized:
            continue
        on_intent_inferred(self.rhino.get_inference())
        break
    self.rhino.reset()


def process_and_transmit_utterance(self, on_partial_transcript, on_final_transcript):
    """Process the utterance and pass the transcripts to the callbacks.

    Frames are fed to Cheetah while the recorder is recording. Every partial
    transcript is handed to ``on_partial_transcript``. When Cheetah reports an
    endpoint, the flushed remainder is appended to that last partial
    transcript, ``on_final_transcript`` is called, and the loop ends.

    Args:
        on_partial_transcript: Called with each partial transcript string.
        on_final_transcript: Called with no arguments once an endpoint is reached.
    """
    while self.recorder.is_recording:
        pcm_frame = self.get_next_frame()
        partial_transcript, is_endpoint = self.cheetah.process(pcm_frame)
        if is_endpoint:
            # Collect whatever Cheetah still buffers so the last piece is complete.
            partial_transcript += self.cheetah.flush()
        on_partial_transcript(partial_transcript)
        if is_endpoint:
            on_final_transcript()
            break


@contextmanager
Expand All @@ -140,5 +150,5 @@ def microphone():
rhino,
cheetah,
recorder)
mic.set_is_listening(True)
mic.set_is_recording(True)
yield mic
41 changes: 41 additions & 0 deletions echo_crafter/listener/utils/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from enum import StrEnum
from typing import Any, NamedTuple, Tuple, List, Callable, TypeVar, Type

AudioFrame = List[int]

class IntentName(StrEnum):
"""Intent names for the voice commands."""

UNKNOWN = "unknown"
GET_SCRIPT = "getScript"
ANSWER_QUESTION = "answerQuestion"
TRANSCRIBE_TO_KEYBOARD = "simplyTranscribe"
FOCUS_WINDOW = "focusWindow"
OPEN_WINDOW = "openWindow"
SET_VOLUME = "setVolume"
CANCEL = "cancel"


class Slot(StrEnum):
"""Slot names for the intent's named parameters."""

PROGRAMMING_LANGUAGE = "programmingLanguage"
PROMPT_TYPE = "promptType"
WINDOW_NAME = "windowName"
VOLUME_SETTING = "volumeSetting"


class Intent(NamedTuple):
    """An inferred intent: its name together with its slots."""

    # Which intent was recognised.
    intent: IntentName
    # Slots attached to the intent.
    slots: List[Slot]


class MicrophoneCallbacks(NamedTuple):
    """Bundle of the callbacks used while listening on the microphone."""

    # Takes no arguments.
    on_wake_word: Callable[[], Any]
    # Takes the inferred Intent.
    on_intent: Callable[[Intent], Any]
    # Takes one partial transcript string.
    on_partial_transcript: Callable[[str], Any]
    # Takes no arguments.
    on_final_transcript: Callable[[], Any]
on_final_transcript: Callable[[], Any]
6 changes: 2 additions & 4 deletions echo_crafter/logger/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,11 @@ class CustomRecord(logging.LogRecord):
"""Custom LogRecord class with a timestamp attribute."""

def __init__(self, name, level, pathname, lineno, msg, args,
exc_info, func=None, sinfo=None, **kwargs):
exc_info, func=None, sinfo=None):
"""Initialize a CustomRecord instance."""
super().__init__(name, level, pathname, lineno, msg, args,
exc_info, func=func, sinfo=sinfo)
self.timestamp = time.time()
self.intent = kwargs.get('intent', '')
self.slots = kwargs.get('slots', {})


class JsonFormatter(logging.Formatter):
Expand All @@ -31,7 +29,7 @@ def format(self, record):
return json.dumps(log_dict)


def setup_logger(name=__name__, level=logging.INFO):
def setup_logger(name, level=logging.INFO):
"""Set up a logger with a JSON formatter."""
logger = logging.getLogger(name)
logger.setLevel(level)
Expand Down

0 comments on commit ed20c86

Please sign in to comment.