feat: Add code for chatting with a website
jirimoravcik committed Aug 21, 2023
1 parent 3adc06d commit fb17475
Showing 6 changed files with 139 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -0,0 +1,3 @@
OPENAI_API_KEY=your_api_key
APIFY_API_TOKEN=your_api_key
WEBSITE_URL="https://docs.apify.com/platform"
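Both `scrape.py` and `chat.py` load these values via `python-dotenv`. A minimal sketch (not part of the commit) for checking that the renamed `.env` file is picked up, assuming it sits in the working directory:

```
# Sketch: verify the .env values are visible to the scripts.
# Assumes .env is in the current working directory.
import os
from dotenv import load_dotenv

load_dotenv()
for key in ('OPENAI_API_KEY', 'APIFY_API_TOKEN', 'WEBSITE_URL'):
    print(key, 'is set' if os.environ.get(key) else 'is MISSING')
```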
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ignore the folder with the vector database's data
db/
27 changes: 25 additions & 2 deletions README.md
@@ -1,2 +1,25 @@
# chat-with-a-website
A simple app that lets you chat with a given website.
# Chat with a website

Chat with a website using Apify and ChatGPT.

## Setup

Before getting started, make sure to sign up for [Apify](https://console.apify.com/sign-up) and [OpenAI](https://openai.com/) accounts and create an API key for each.

To set up and run this project, follow these steps:

1. Install the required packages with `pip`:
```
pip install -r requirements.txt
```
2. Rename the `.env.example` file to `.env` and fill in the values. Here's what each variable in the `.env` file means:

   - `OPENAI_API_KEY`: Your OpenAI API key. You can obtain it from your OpenAI account dashboard.
   - `APIFY_API_TOKEN`: Your Apify API token. You can obtain it from [Apify settings](https://console.apify.com/account/integrations).
   - `WEBSITE_URL`: The full URL of the website you'd like to chat with.

3. Run the `scrape.py` script (`python scrape.py`) to scrape the website's data using Apify's [Website content crawler](https://apify.com/apify/website-content-crawler). An optional sanity check of the resulting database is sketched after these steps.
4. Run the Streamlit chat app, which should default to `http://localhost:8501` and allow you to chat with the website:
```
streamlit run chat.py
```
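Once step 3 has finished, you can optionally confirm that the local vector database was populated before launching the chat app. A quick sanity-check sketch, not part of this commit, assuming `OPENAI_API_KEY` is set in `.env`:

```
# Sketch: confirm scrape.py persisted chunks into the local Chroma store ('db').
import os
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

load_dotenv()
vectordb = Chroma(persist_directory='db', embedding_function=OpenAIEmbeddings())
print('Stored chunks:', vectordb._collection.count())  # underlying chromadb collection
for doc in vectordb.similarity_search('What is this website about?', k=2):
    print(doc.metadata['source'])
```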
61 changes: 61 additions & 0 deletions chat.py
@@ -0,0 +1,61 @@
import os

import streamlit as st
from dotenv import load_dotenv
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.vectorstores import Chroma

load_dotenv()

website_url = os.environ.get('WEBSITE_URL', 'a website')

st.set_page_config(page_title=f'Chat with {website_url}')
st.title('Chat with a website')

@st.cache_resource(ttl='1h')
def get_retriever():
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma(persist_directory='db', embedding_function=embeddings)

    retriever = vectordb.as_retriever(search_type='mmr')

    return retriever

class StreamHandler(BaseCallbackHandler):
    def __init__(self, container: st.delta_generator.DeltaGenerator, initial_text: str = ''):
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        self.text += token
        self.container.markdown(self.text)

retriever = get_retriever()

msgs = StreamlitChatMessageHistory()
memory = ConversationBufferMemory(memory_key='chat_history', chat_memory=msgs, return_messages=True)

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, streaming=True)
qa_chain = ConversationalRetrievalChain.from_llm(
    llm, retriever=retriever, memory=memory, verbose=False
)

if st.sidebar.button('Clear message history') or len(msgs.messages) == 0:
    msgs.clear()
    msgs.add_ai_message(f'Ask me anything about {website_url}!')

avatars = {'human': 'user', 'ai': 'assistant'}
for msg in msgs.messages:
    st.chat_message(avatars[msg.type]).write(msg.content)

if user_query := st.chat_input(placeholder='Ask me anything!'):
    st.chat_message('user').write(user_query)

    with st.chat_message('assistant'):
        stream_handler = StreamHandler(st.empty())
        response = qa_chain.run(user_query, callbacks=[stream_handler])
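For debugging outside the browser, the same retrieval chain can be exercised headlessly. A sketch (not part of the commit) that reuses the persisted `db` directory and skips Streamlit's message history, so conversation memory lives only in the process:

```
# Sketch: run the retrieval chain from a plain Python shell.
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import Chroma

load_dotenv()
retriever = Chroma(
    persist_directory='db', embedding_function=OpenAIEmbeddings()
).as_retriever(search_type='mmr')
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
qa_chain = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0),
    retriever=retriever,
    memory=memory,
)
print(qa_chain.run('What is this website about?'))
```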
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
apify-client
chromadb
langchain
openai
python-dotenv
streamlit
tiktoken
40 changes: 40 additions & 0 deletions scrape.py
@@ -0,0 +1,40 @@
import os

from apify_client import ApifyClient
from dotenv import load_dotenv
from langchain.document_loaders import ApifyDatasetLoader
from langchain.document_loaders.base import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Load environment variables from a .env file
load_dotenv()

if __name__ == '__main__':
    apify_client = ApifyClient(os.environ.get('APIFY_API_TOKEN'))
    website_url = os.environ.get('WEBSITE_URL')
    print(f'Extracting data from "{website_url}". Please wait...')
    actor_run_info = apify_client.actor('apify/website-content-crawler').call(
        run_input={'startUrls': [{'url': website_url}]}
    )
    print('Saving data into the vector database. Please wait...')
    loader = ApifyDatasetLoader(
        dataset_id=actor_run_info['defaultDatasetId'],
        dataset_mapping_function=lambda item: Document(
            page_content=item['text'] or '', metadata={'source': item['url']}
        ),
    )
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    embedding = OpenAIEmbeddings()

    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=embedding,
        # Persist into the same 'db' directory that chat.py loads from
        persist_directory='db',
    )
    vectordb.persist()
    print('All done!')
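As an aside, the `RecursiveCharacterTextSplitter` above breaks each crawled page into overlapping chunks of roughly 1500 characters before embedding. An illustration (not part of the commit, with made-up sample text):

```
# Sketch: how the splitter breaks one long page into overlapping chunks.
from langchain.document_loaders.base import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

sample = Document(page_content='word ' * 1000, metadata={'source': 'https://docs.apify.com/platform'})
splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
chunks = splitter.split_documents([sample])
print(len(chunks), [len(c.page_content) for c in chunks])
```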
