From 73b9ca54cbf8074fa1a11f627f1942b0634f6f38 Mon Sep 17 00:00:00 2001 From: berkedilekoglu Date: Wed, 13 Sep 2023 21:39:56 +0300 Subject: [PATCH] Using batches for update document with a new function in ChromaDB (#6561) https://github.com/hwchase17/langchain/blob/2a4b32dee24c22159805f643b87eece107224951/langchain/vectorstores/chroma.py#L355-L375 Currently, the update_document function takes only a single document and its ID for updating. However, Chroma can update multiple documents at once by taking a list of IDs and a list of documents for batch updates. If we changed the 'update_document' function itself, both document_id and document could be `Union[str, List[str]]`, but we would need to do a type check, because the embed_documents and update functions take a List for the text and document_ids variables. I believe that writing a new function is the best option. I update the Chroma vectorstore with refreshed information from my website every 20 minutes. Performing the updates for all changed pieces of information in one batch, instead of calling update_document once per document, would significantly reduce the update time in such use cases. In my case, I update a total of 8810 chunks. Updating these 8810 individual chunks using the current function takes a total of 8.5 minutes. However, if we process the inputs in batches and update them collectively, all 8810 separate chunks can be updated in just 1 minute. This significantly reduces the time it takes for users of actively used chatbots to access up-to-date information. I can add an integration test and an example for the documentation for the new update_documents function. 
@hwchase17 [berkedilekoglu](https://twitter.com/berkedilekoglu) --- .../langchain/vectorstores/chroma.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py index 3b8edc2212e8d..7994e2326dbc0 100644 --- a/libs/langchain/langchain/vectorstores/chroma.py +++ b/libs/langchain/langchain/vectorstores/chroma.py @@ -541,19 +541,28 @@ def update_document(self, document_id: str, document: Document) -> None: document_id (str): ID of the document to update. document (Document): Document to update. """ - text = document.page_content - metadata = document.metadata + return self.update_documents([document_id], [document]) + + def update_documents(self, ids: List[str], documents: List[Document]) -> None: + """Update a document in the collection. + + Args: + ids (List[str]): List of ids of the document to update. + documents (List[Document]): List of documents to update. + """ + text = [document.page_content for document in documents] + metadata = [document.metadata for document in documents] if self._embedding_function is None: raise ValueError( "For update, you must specify an embedding function on creation." ) - embeddings = self._embedding_function.embed_documents([text]) + embeddings = self._embedding_function.embed_documents(text) self._collection.update( - ids=[document_id], + ids=ids, embeddings=embeddings, - documents=[text], - metadatas=[metadata], + documents=text, + metadatas=metadata, ) @classmethod