Fix issue where text messages were not being considered correctly when audio is enabled
pablomarin · Jan 3, 2025 · 56ba596 · 56ba596
1 parent 307da24
commit 56ba596
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 40 deletions.
diff --git a/15-FastAPI-API.ipynb b/15-FastAPI-API.ipynb
@@ -351,7 +351,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "id": "21f0382f-3960-46ed-8f68-77ac735c90c2",
    "metadata": {
     "tags": []
@@ -365,14 +365,12 @@
       "[Tool Start] Starting documents_retrieval\n",
       "\n",
       "[Tool End] Done documents_retrieval\n",
-      "The scene where Joey asks Rachel to marry him occurs in a moment of misunderstanding. Joey, feeling a special bond with Rachel and concerned about her being a single mom, offers to marry her. He says, \"Rachel Green will you marry me?\" Rachel is surprised and responds, \"What?\" Joey insists, \"I want you to know you're not gonna be alone in this\" [[source]](https://blobstorageixqo5iaqmpzwc.blob.core.windows.net/friends/s08/e02/c11.txt?sv=2022-11-02&ss=b&srt=sco&sp=rltfx&se=2026-01-02T09:04:19Z&st=2025-01-02T01:04:19Z&spr=https&sig=q%2FjY9R25rdc%2BIH1iiq1uPIBm82xECsN9d%2B2ftdM1SJI%3D).\n",
-      "\n",
-      "Rachel, touched by Joey's gesture, declines the proposal, saying, \"Oh you're so sweet. You're so-so sweet, honey. But I'm not, I'm not looking for a husband\" [[source]](https://blobstorageixqo5iaqmpzwc.blob.core.windows.net/friends/s08/e02/c11.txt?sv=2022-11-02&ss=b&srt=sco&sp=rltfx&se=2026-01-02T09:04:19Z&st=2025-01-02T01:04:19Z&spr=https&sig=q%2FjY9R25rdc%2BIH1iiq1uPIBm82xECsN9d%2B2ftdM1SJI%3D)."
+      "The scene where Joey wears all of Chandler's clothes is a humorous moment from the show \"Friends.\" Joey, in retaliation for Chandler hiding his clothes, decides to wear everything Chandler owns. He walks into the room wearing multiple layers of Chandler's clothing and exclaims, \"Look at me! I'm Chandler! Could I be wearing any more clothes?\" He even jokes about going commando, saying, \"Maybe if I wasn't going commando...\" The scene is made even funnier by Joey's exaggerated movements and the sheer volume of clothes he's wearing, which makes it difficult for him to move comfortably [[source]](https://blobstorageixqo5iaqmpzwc.blob.core.windows.net/friends/s06/e20/c09.txt?sv=2022-11-02&ss=b&srt=sco&sp=rltfx&se=2026-01-02T09:04:19Z&st=2025-01-02T01:04:19Z&spr=https&sig=q%2FjY9R25rdc%2BIH1iiq1uPIBm82xECsN9d%2B2ftdM1SJI%3D)."
      ]
     }
    ],
    "source": [
-    "stream_question = \"@docsearch, how is the scene where joey asks rachel to marry\"\n",
+    "stream_question = \"@docsearch, describe the scene where Joey wears all of Chandler's clothes\"\n",
     "call_stream(stream_question, thread_id=random_session_id)"
    ]
   },

diff --git a/apps/frontend/app/pages/3_FastAPI_Chat.py b/apps/frontend/app/pages/3_FastAPI_Chat.py
@@ -3,24 +3,30 @@
 import os
 import sys
 
-# Import STT and TTS functions from audio_utils.py
+# -----------------------------------------------------------------------------
+# Imports
+# -----------------------------------------------------------------------------
+# Import STT (speech-to-text) and TTS (text-to-speech) functions from audio_utils.py
 try:
     from audio_utils import (
         speech_to_text_from_bytes as speech_to_text,
         text_to_speech,
     )
 except Exception as e:
-    # Add the path four levels up
+    # If local import fails, add the path four levels up and import from there
     sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../../')))
     from common.audio_utils import (
         speech_to_text_from_bytes as speech_to_text,
         text_to_speech,
     )
 
-
 import streamlit as st
 
-from app import model_name, api_url, get_env_var
+from app import (
+    model_name,
+    api_url,
+    get_env_var,
+)
 from langchain_core.messages import AIMessage, HumanMessage
 from helpers.streamlit_helpers import (
     configure_page,
@@ -35,26 +41,28 @@
 from audio_recorder_streamlit import audio_recorder
 
 # -----------------------------------------------------------------------------
-# Configuration
+# Page Configuration
 # -----------------------------------------------------------------------------
 page_title = get_env_var("AGENT_PAGE_TITLE", default_value="AI Agent", required=True)
 configure_page(page_title, "💬")
+
 logger = get_logger(__name__)
 logger.info(f"Page configured with title: {page_title}")
 
 # -----------------------------------------------------------------------------
-# Session IDs and Chat History
+# Initialize Session IDs and Chat History
 # -----------------------------------------------------------------------------
 session_id, user_id = get_or_create_ids()
 initialize_chat_history(model_name)
 
 # -----------------------------------------------------------------------------
-# Sidebar with optional voice input
+# Sidebar (Voice Input Option)
 # -----------------------------------------------------------------------------
 with st.sidebar:
     st.header("Voice Input")
     voice_enabled = st.checkbox("Enable Voice Capabilities")
 
+    # If voice is enabled, provide audio recorder
     audio_bytes = None
     if voice_enabled:
         audio_bytes = audio_recorder(
@@ -68,48 +76,60 @@
             logger.info("Audio recorded from user microphone.")
 
 # -----------------------------------------------------------------------------
-# Display existing chat messages
+# Display Existing Chat Messages
 # -----------------------------------------------------------------------------
 display_chat_history()
 logger.debug("Displayed existing chat history.")
 
-
 # -----------------------------------------------------------------------------
 # Handle User Input (Text & Audio)
 # -----------------------------------------------------------------------------
-user_query = st.chat_input("Type your message here...")
-
-# Track whether a new user message was added
 new_user_message = False
 
-# Handle audio input
-if audio_bytes:
-    transcript = speech_to_text(audio_bytes)
-    logger.debug(f"Transcript from STT: {transcript}")
-    if transcript:
-        st.session_state.chat_history.append(HumanMessage(content=transcript))
+# Text query from the st.chat_input
+user_query = st.chat_input("Type your message here...")
+typed_query = user_query.strip() if user_query else None
+
+# 1) If voice is enabled, we allow typed OR voice input
+if voice_enabled:
+    if typed_query:
+        # A typed query takes priority if present
+        st.session_state.chat_history.append(HumanMessage(content=typed_query))
         with st.chat_message("Human"):
-            st.write(transcript)
-        logger.info("Transcript added to chat history.")
+            st.markdown(typed_query)
+        logger.info("User typed query added to chat history: %s", typed_query)
+        new_user_message = True
+    elif audio_bytes:
+        # Only if there's no typed input, process recorded audio
+        transcript = speech_to_text(audio_bytes)
+        logger.debug(f"Transcript from STT: {transcript}")
+        if transcript:
+            st.session_state.chat_history.append(HumanMessage(content=transcript))
+            with st.chat_message("Human"):
+                st.write(transcript)
+            logger.info("Transcript added to chat history.")
+            new_user_message = True
+
+# 2) If voice is disabled, we only process typed input
+else:
+    if typed_query:
+        st.session_state.chat_history.append(HumanMessage(content=typed_query))
+        with st.chat_message("Human"):
+            st.markdown(typed_query)
+        logger.info("User typed query added to chat history: %s", typed_query)
         new_user_message = True
-
-# Handle text input (st.chat_input)
-if user_query is not None and user_query.strip() and not new_user_message:
-    st.session_state.chat_history.append(HumanMessage(content=user_query))
-    with st.chat_message("Human"):
-        st.markdown(user_query)
-    logger.info("User query added to chat history: %s", user_query)
-    new_user_message = True
 
 # -----------------------------------------------------------------------------
-# Generate AI response if the last message is from a Human
+# Generate AI Response (If We Have a New User Message)
 # -----------------------------------------------------------------------------
-if new_user_message and not isinstance(st.session_state.chat_history[-1], AIMessage):
+if new_user_message:
+    # The last message is now from a Human; let's call the AI
     with st.chat_message("AI"):
         try:
-            logger.info("Sending request to SSE /stream endpoint with user query.")
             user_text = st.session_state.chat_history[-1].content
-
+            logger.info("Sending request to SSE /stream endpoint with user query.")
+
+            # Stream the AI response using your SSE consumption function
             ai_response = st.write_stream(
                 consume_api(api_url, user_text, session_id, user_id)
             )
@@ -123,7 +143,7 @@
         if ai_response:
             st.session_state.chat_history.append(AIMessage(content=ai_response))
 
-            # Voice Output (if enabled)
+            # If voice is enabled, convert AI response text to speech and auto-play
             if voice_enabled:
                 try:
                     audio_file_path = text_to_speech(ai_response)
@@ -134,5 +154,3 @@
                         logger.info("Temporary audio file removed.")
                 except Exception as ex:
                     logger.error(f"Error generating or playing audio: {ex}", exc_info=True)
-
-
diff --git a/apps/frontend/frontend.zip b/apps/frontend/frontend.zip