replace speech-to-text with speech-to-intent

Jef808 · Feb 25, 2024 · ed20c86 · ed20c86
1 parent ee0ec7c
commit ed20c86
Show file tree

Hide file tree

Showing 7 changed files with 142 additions and 67 deletions.
diff --git a/data/computer-commands.yml b/data/computer-commands.yml
@@ -1,12 +1,13 @@
 context:
   expressions:
     getScript:
-      - "@promptOpenAI (me) [a, an] ($programmingLanguage:language) script"
+      - "@promptVerb (me) [a, an] ($programmingLanguage:language) script"
     answerQuestion:
-      - "@promptOpenAI (me) [an, the, the following] answer to [this, that, the,
+      - "@promptVerb (me) [an, the, the following] answer to [this, that, the,
         the following] question"
       - Answer (me) [this, the, the following, that] question
     simplyTranscribe:
+      - Send (this, that, the following, what follows) to (the) keyboard
       - Write [this, that, the following, what follows]
       - Transcribe
     focusWindow:
@@ -19,9 +20,16 @@ context:
     setVolume:
       - $volumeSetting:volumeSetting (the volume)
       - (Set) (the) [volume, sound] (to) $pv.Percent:percentage
+    cancel:
+      - (no) wait a second
+      - (no) cancel
+      - (no) never mind
   slots:
     programmingLanguage:
-      - home slash j f a
+      - stump
+      - lisp
+      - e lisp
+      - c plus plus
       - javascript
       - emacs
       - shell
@@ -45,7 +53,7 @@ context:
       - unmute
       - mute
   macros:
-    promptOpenAI:
+    promptVerb:
       - I need
       - I want
       - generate

diff --git a/echo_crafter/config/config.py b/echo_crafter/config/config.py
@@ -9,8 +9,9 @@
 
 def get_project_root() -> Path:
     """Get the root of the project."""
-    return Path(os.environ.get('EC_ROOT', '')) \
-        or Path(__file__).parent.parent.parent
+    path = Path(os.environ.get('EC_ROOT', '')) \
+        or Path(__file__).resolve().parent.parent.parent
+    return path
 
 
 def get_picovoice_api_key() -> str:
@@ -23,13 +24,18 @@ def get_picovoice_api_key() -> str:
     return api_key
 
 
+def build_path(rel_path: str) -> str:
+    """Build an absolute path from the given path relative to root."""
+    return str(get_project_root() / rel_path)
+
+
 @dataclass(init=False, frozen=True)
 class _Config(TypedDict):
     """Configuration for the echo-crafter package."""
 
     PROJECT_ROOT: str
     DATA_DIR: str
-    PYTHON_PACKAGES_DIR: str
+    PYTHON_PACKAGES: str
 
     PICOVOICE_API_KEY: str
     CHEETAH_MODEL_FILE: str
@@ -44,16 +50,16 @@ class _Config(TypedDict):
 
 Config: _Config = {
     "PROJECT_ROOT":           str(get_project_root()),
-    "DATA_DIR":               str(get_project_root()/"data"),
-    "PYTHON_PACKAGES_DIR":    str(get_project_root()/".venv/lib/python3.11/site-packages/python_packages"),
+    "DATA_DIR":               build_path("data"),
+    "PYTHON_PACKAGES":        build_path(".venv/lib/python3.11/site-packages/python_packages"),
 
     "PICOVOICE_API_KEY":      str(os.getenv('PICOVOICE_API_KEY')),
-    "CHEETAH_MODEL_FILE":     str(get_project_root()/"data/speech-command-cheetah-v2.pv"),
-    "RHINO_CONTEXT_FILE":     str(get_project_root()/"data/computer-commands_en_linux_v3_0_0.rhn"),
+    "CHEETAH_MODEL_FILE":     build_path("data/speech-command-cheetah-v2.pv"),
+    "RHINO_CONTEXT_FILE":     build_path("data/computer-commands_en_linux_v3_0_0.rhn"),
     "FRAME_LENGTH":           512,
     "ENDPOINT_DURATION_SEC":  1.5,
 
-    "TRANSCRIPT_BEGIN_WAV":   str(get_project_root()/"data/transcript_begin.wav"),
-    "TRANSCRIPT_SUCCESS_WAV": str(get_project_root()/"data/transcript_success.wav"),
+    "TRANSCRIPT_BEGIN_WAV":   build_path("data/transcript_begin.wav"),
+    "TRANSCRIPT_SUCCESS_WAV": build_path("data/transcript_success.wav"),
     "SOCKET_PATH":            str(Path(os.environ.get('EC_SOCKET_FILE', '/tmp/echo-crafter.sock')))
 }
diff --git a/echo_crafter/listener/listener_with_wake_word.py b/echo_crafter/listener/listener_with_wake_word.py
@@ -1,67 +1,75 @@
 """Listen for a wake word and transcribe speech until endpoint is detected."""
 
+import json
 import subprocess
 import traceback
-import socket
-from contextlib import contextmanager
+from typing import List
+
+from echo_crafter.logger import setup_logger
+from echo_crafter.config import Config
+
 from echo_crafter.listener.utils import (
+    Intent,
     microphone
 )
-from echo_crafter.logger import setup_logger
-from echo_crafter.config import Config
 
-def play_sound(wav_file):
+logger = setup_logger(__name__)
+
+def play_sound(wav_file) -> None:
     """Play a ding sound to indicate that the wake word was detected."""
-    subprocess.Popen(["aplay", "-q", wav_file])
+    subprocess.Popen(['aplay', wav_file])
 
 
-def wake_word_callback():
+def on_wake_word_detected() -> None:
     """Play a ding sound to indicate that the wake word was detected."""
     play_sound(Config['TRANSCRIPT_BEGIN_WAV'])
 
 
-@contextmanager
-def create_transcription_callback():
-    """Connect to the transcription socket and send it all partial transcripts."""
+def on_intent_inferred(intent_obj: Intent) -> None:
+    """Log and print the inferred intent, then play the success sound."""
+    logger.info("Intent inferred: %s", json.dumps(intent_obj))
+    print(json.dumps(intent_obj, indent=2))
+    play_sound(Config['TRANSCRIPT_SUCCESS_WAV'])
 
-    client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-    try:
-        client.connect(Config['SOCKET_PATH'])
 
-        def callback(partial_transcript):
-            """Send the partial transcript to the active window."""
-            client.sendall((partial_transcript).encode())
+partial_transcripts: List[str] = []
 
-        yield callback
 
-    finally:
-        client.close()
+def on_partial_transcript(partial_transcript: str) -> None:
+    """Send the partial transcript to the active window."""
+    partial_transcripts.append(partial_transcript)
+    subprocess.Popen(
+        ['xdotool', 'type', '--clearmodifiers', '--delay', '0', partial_transcript]
+    )
 
 
-def transcription_success_callback():
-    """Play a ding sound to indicate that the final transcript was received."""
-    play_sound(Config['TRANSCRIPT_SUCCESS_WAV'])
+def on_final_transcript() -> None:
+    """Log the accumulated partial transcripts and clear the buffer."""
+    final_transcript = ''.join(partial_transcripts)
+    partial_transcripts.clear()
+    logger.info("Final transcript: %s", final_transcript)
 
 
 def main():
     """Upon detection of a wake word, transcribe speech until endpoint is detected."""
-    logger = setup_logger()
+    logger = setup_logger(__name__)
 
     with microphone() as mic:
         try:
             while True:
-                with create_transcription_callback() as transcription_callback:
-                    mic.wait_for_wake_word(wake_word_callback)
-
-                    mic.process_and_transmit_utterance(transcription_callback, transcription_success_callback)
+                mic.wait_for_wake_word(on_wake_word_detected)
+                mic.infer_intent(on_intent_inferred)
+                #mic.process_and_transmit_utterance(on_partial_transcript, on_final_transcript)
 
         except KeyboardInterrupt:
-            pass
+            mic.set_is_recording(False)
 
         except Exception as e:
             logger.error("An error occured %s", e)
             logger.error(traceback.format_exc())
 
+        finally:
+            mic.set_is_recording(False)
 
 if __name__ == '__main__':
     main()
diff --git a/echo_crafter/listener/utils/__init__.py b/echo_crafter/listener/utils/__init__.py
@@ -1,7 +1,11 @@
-from .microphone import microphone
-from .sockets import socket_connection
+from .microphone import *
+from .sockets import *
+from .types import *
 
 __all__ = [
+    'Intent',
+    'Slot',
+    'MicrophoneCallbacks',
     'socket_connection',
     'microphone'
 ]
diff --git a/echo_crafter/listener/utils/microphone.py b/echo_crafter/listener/utils/microphone.py
@@ -1,6 +1,4 @@
 from contextlib import contextmanager
-from collections import deque
-from pathlib import Path
 
 import pvcheetah
 import pvporcupine
@@ -17,7 +15,7 @@ def porcupine_context_manager():
     try:
         porcupine_instance = pvporcupine.create(
             keywords=['computer'],
-            sensitivities=[0.1],
+            sensitivities=[0.5],
             access_key=Config['PICOVOICE_API_KEY']
         )
         yield porcupine_instance
@@ -65,7 +63,8 @@ def rhino_context_manager():
     try:
         rhino_instance = pvrhino.create(
             access_key=Config['PICOVOICE_API_KEY'],
-            context_path=Config['RHINO_CONTEXT_FILE']
+            context_path=Config['RHINO_CONTEXT_FILE'],
+            sensitivity=0.8
         )
         yield rhino_instance
     finally:
@@ -81,7 +80,6 @@ def __init__(self, porcupine, rhino, cheetah, recorder):
         self.rhino = rhino
         self.cheetah = cheetah
         self.recorder = recorder
-        self.is_listening = False
         self.wake_word_frame = None
 
 
@@ -94,17 +92,16 @@ def get_next_frame(self):
         return self.recorder.read()
 
 
-    def set_is_listening(self, is_listening):
-        """Set the is_listening attribute of the recorder."""
-        self.is_listening = is_listening
-        if is_listening and not self.recorder.is_recording:
-            self.recorder.start()
-        elif not is_listening and self.recorder.is_recording:
+    def set_is_recording(self, is_recording):
+        """Start or stop the recorder to match the requested is_recording state."""
+        if self.recorder.is_recording and not is_recording:
             self.recorder.stop()
+        elif not self.recorder.is_recording and is_recording:
+            self.recorder.start()
 
     def wait_for_wake_word(self, wake_word_callback=None):
         """Wait for the wake word to be detected."""
-        while True:
+        while self.recorder.is_recording:
             pcm_frame = self.recorder.read()
             keyword_index = self.porcupine.process(pcm_frame)
             if keyword_index >= 0:
@@ -114,19 +111,32 @@ def wait_for_wake_word(self, wake_word_callback=None):
                 break
 
 
-    def process_and_transmit_utterance(self, transcription_callback, transcription_success_callback=None):
-        """Process the utterance and transmit the partial transcript to the client."""
+    def infer_intent(self, on_intent_inferred):
+        """Infer the user intent and pass it to the callback once collected."""
+        while self.recorder.is_recording:
+            pcm_frame = self.get_next_frame()
+            rhino_is_finalized = self.rhino.process(pcm_frame)
+            if rhino_is_finalized:
+                intent = self.rhino.get_inference()
+                on_intent_inferred(intent)
+                break
+        self.rhino.reset()
+
+
+    def process_and_transmit_utterance(self, on_partial_transcript, on_final_transcript):
+        """Process the utterance and pass the transcripts to the callbacks."""
         is_endpoint = False
         is_transcription_success = False
-        while not is_endpoint:
+        while self.recorder.is_recording:
             pcm_frame = self.get_next_frame()
             partial_transcript, is_endpoint = self.cheetah.process(pcm_frame)
             if is_endpoint:
-                partial_transcript += (self.cheetah.flush() + 'STOP')
+                partial_transcript += (self.cheetah.flush())
                 is_transcription_success = True
-            transcription_callback(partial_transcript)
-            if is_transcription_success and transcription_success_callback is not None:
-                    transcription_success_callback()
+            on_partial_transcript(partial_transcript)
+            if is_transcription_success:
+                on_final_transcript()
+                break
 
 
 @contextmanager
@@ -140,5 +150,5 @@ def microphone():
                           rhino,
                           cheetah,
                           recorder)
-        mic.set_is_listening(True)
+        mic.set_is_recording(True)
         yield mic
diff --git a/echo_crafter/listener/utils/types.py b/echo_crafter/listener/utils/types.py
@@ -0,0 +1,41 @@
+from enum import StrEnum
+from typing import Any, NamedTuple, Tuple, List, Callable, TypeVar, Type
+
+AudioFrame = List[int]
+
+class IntentName(StrEnum):
+    """Intent names for the voice commands."""
+
+    UNKNOWN = "unknown"
+    GET_SCRIPT = "getScript"
+    ANSWER_QUESTION = "answerQuestion"
+    TRANSCRIBE_TO_KEYBOARD = "simplyTranscribe"
+    FOCUS_WINDOW = "focusWindow"
+    OPEN_WINDOW = "openWindow"
+    SET_VOLUME = "setVolume"
+    CANCEL = "cancel"
+
+
+class Slot(StrEnum):
+    """Slot names for the intent's named parameters."""
+
+    PROGRAMMING_LANGUAGE = "programmingLanguage"
+    PROMPT_TYPE = "promptType"
+    WINDOW_NAME = "windowName"
+    VOLUME_SETTING = "volumeSetting"
+
+
+class Intent(NamedTuple):
+    """Named tuple for the intent."""
+
+    intent: IntentName
+    slots: List[Slot]
+
+
+class MicrophoneCallbacks(NamedTuple):
+    """Named tuple for the microphone callbacks."""
+
+    on_wake_word: Callable[[], Any]
+    on_intent: Callable[[Intent], Any]
+    on_partial_transcript: Callable[[str], Any]
+    on_final_transcript: Callable[[], Any]
diff --git a/echo_crafter/logger/__init__.py b/echo_crafter/logger/__init__.py
@@ -12,13 +12,11 @@ class CustomRecord(logging.LogRecord):
     """Custom LogRecord class with a timestamp attribute."""
 
     def __init__(self, name, level, pathname, lineno, msg, args,
-                 exc_info, func=None, sinfo=None, **kwargs):
+                 exc_info, func=None, sinfo=None):
         """Initialize a CustomRecord instance."""
         super().__init__(name, level, pathname, lineno, msg, args,
                          exc_info, func=func, sinfo=sinfo)
         self.timestamp = time.time()
-        self.intent = kwargs.get('intent', '')
-        self.slots = kwargs.get('slots', {})
 
 
 class JsonFormatter(logging.Formatter):
@@ -31,7 +29,7 @@ def format(self, record):
         return json.dumps(log_dict)
 
 
-def setup_logger(name=__name__, level=logging.INFO):
+def setup_logger(name, level=logging.INFO):
     """Set up a logger with a JSON formatter."""
     logger = logging.getLogger(name)
     logger.setLevel(level)