diff --git a/.gitignore b/.gitignore index ae00d01e1ae23..c9451f40d2d86 100644 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,7 @@ wandb/ # integration test artifacts data_map* -\[('_type', 'fake'), ('stop', None)] \ No newline at end of file +\[('_type', 'fake'), ('stop', None)] + +# Replit files +*replit* \ No newline at end of file diff --git a/docs/modules/indexes/vectorstores/examples/elasticsearch.ipynb b/docs/modules/indexes/vectorstores/examples/elasticsearch.ipynb index 9f4a34da6bd14..ca2a26b1ee56c 100644 --- a/docs/modules/indexes/vectorstores/examples/elasticsearch.ipynb +++ b/docs/modules/indexes/vectorstores/examples/elasticsearch.ipynb @@ -1,238 +1,580 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "683953b3", - "metadata": {}, - "source": [ - "# ElasticSearch\n", - "\n", - ">[Elasticsearch](https://www.elastic.co/elasticsearch/) is a distributed, RESTful search and analytics engine. It provides a distributed, multitenant-capable full-text search engine with an HTTP web interface and schema-free JSON documents.\n", - "\n", - "This notebook shows how to use functionality related to the `Elasticsearch` database." - ] - }, - { - "cell_type": "markdown", - "id": "b66c12b2-2a07-4136-ac77-ce1c9fa7a409", - "metadata": { - "tags": [] - }, - "source": [ - "## Installation" - ] - }, - { - "cell_type": "markdown", - "id": "81f43794-f002-477c-9b68-4975df30e718", - "metadata": {}, - "source": [ - "Check out [Elasticsearch installation instructions](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html).\n", - "\n", - "To connect to an Elasticsearch instance that does not require\n", - "login credentials, pass the Elasticsearch URL and index name along with the\n", - "embedding object to the constructor.\n", - "\n", - "Example:\n", - "```python\n", - " from langchain import ElasticVectorSearch\n", - " from langchain.embeddings import OpenAIEmbeddings\n", - "\n", - " embedding = OpenAIEmbeddings()\n", - " elastic_vector_search = ElasticVectorSearch(\n", - " elasticsearch_url=\"http://localhost:9200\",\n", - " index_name=\"test_index\",\n", - " embedding=embedding\n", - " )\n", - "```\n", - "\n", - "To connect to an Elasticsearch instance that requires login credentials,\n", - "including Elastic Cloud, use the Elasticsearch URL format\n", - "https://username:password@es_host:9243. For example, to connect to Elastic\n", - "Cloud, create the Elasticsearch URL with the required authentication details and\n", - "pass it to the ElasticVectorSearch constructor as the named parameter\n", - "elasticsearch_url.\n", - "\n", - "You can obtain your Elastic Cloud URL and login credentials by logging in to the\n", - "Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and\n", - "navigating to the \"Deployments\" page.\n", - "\n", - "To obtain your Elastic Cloud password for the default \"elastic\" user:\n", - "1. Log in to the Elastic Cloud console at https://cloud.elastic.co\n", - "2. Go to \"Security\" > \"Users\"\n", - "3. Locate the \"elastic\" user and click \"Edit\"\n", - "4. Click \"Reset password\"\n", - "5. Follow the prompts to reset the password\n", - "\n", - "Format for Elastic Cloud URLs is\n", - "https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.\n", - "\n", - "Example:\n", - "```python\n", - " from langchain import ElasticVectorSearch\n", - " from langchain.embeddings import OpenAIEmbeddings\n", - "\n", - " embedding = OpenAIEmbeddings()\n", - "\n", - " elastic_host = \"cluster_id.region_id.gcp.cloud.es.io\"\n", - " elasticsearch_url = f\"https://username:password@{elastic_host}:9243\"\n", - " elastic_vector_search = ElasticVectorSearch(\n", - " elasticsearch_url=elasticsearch_url,\n", - " index_name=\"test_index\",\n", - " embedding=embedding\n", - " )\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6197931-cbe5-460c-a5e6-b5eedb83887c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install elasticsearch" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "67ab8afa-f7c6-4fbf-b596-cb512da949da", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "OpenAI API Key: ········\n" - ] + "cells": [ + { + "cell_type": "markdown", + "id": "683953b3", + "metadata": { + "id": "683953b3" + }, + "source": [ + "# ElasticSearch\n", + "\n", + ">[Elasticsearch](https://www.elastic.co/elasticsearch/) is a distributed, RESTful search and analytics engine. It provides a distributed, multitenant-capable full-text search engine with an HTTP web interface and schema-free JSON documents.\n", + "\n", + "This notebook shows how to use functionality related to the `Elasticsearch` database." + ] + }, + { + "cell_type": "markdown", + "source": [ + "# ElasticVectorSearch class" + ], + "metadata": { + "id": "tKSYjyTBtSLc" + }, + "id": "tKSYjyTBtSLc" + }, + { + "cell_type": "markdown", + "id": "b66c12b2-2a07-4136-ac77-ce1c9fa7a409", + "metadata": { + "tags": [], + "id": "b66c12b2-2a07-4136-ac77-ce1c9fa7a409" + }, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "markdown", + "id": "81f43794-f002-477c-9b68-4975df30e718", + "metadata": { + "id": "81f43794-f002-477c-9b68-4975df30e718" + }, + "source": [ + "Check out [Elasticsearch installation instructions](https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html).\n", + "\n", + "To connect to an Elasticsearch instance that does not require\n", + "login credentials, pass the Elasticsearch URL and index name along with the\n", + "embedding object to the constructor.\n", + "\n", + "Example:\n", + "```python\n", + " from langchain import ElasticVectorSearch\n", + " from langchain.embeddings import OpenAIEmbeddings\n", + "\n", + " embedding = OpenAIEmbeddings()\n", + " elastic_vector_search = ElasticVectorSearch(\n", + " elasticsearch_url=\"http://localhost:9200\",\n", + " index_name=\"test_index\",\n", + " embedding=embedding\n", + " )\n", + "```\n", + "\n", + "To connect to an Elasticsearch instance that requires login credentials,\n", + "including Elastic Cloud, use the Elasticsearch URL format\n", + "https://username:password@es_host:9243. For example, to connect to Elastic\n", + "Cloud, create the Elasticsearch URL with the required authentication details and\n", + "pass it to the ElasticVectorSearch constructor as the named parameter\n", + "elasticsearch_url.\n", + "\n", + "You can obtain your Elastic Cloud URL and login credentials by logging in to the\n", + "Elastic Cloud console at https://cloud.elastic.co, selecting your deployment, and\n", + "navigating to the \"Deployments\" page.\n", + "\n", + "To obtain your Elastic Cloud password for the default \"elastic\" user:\n", + "1. Log in to the Elastic Cloud console at https://cloud.elastic.co\n", + "2. Go to \"Security\" > \"Users\"\n", + "3. Locate the \"elastic\" user and click \"Edit\"\n", + "4. Click \"Reset password\"\n", + "5. Follow the prompts to reset the password\n", + "\n", + "Format for Elastic Cloud URLs is\n", + "https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.\n", + "\n", + "Example:\n", + "```python\n", + " from langchain import ElasticVectorSearch\n", + " from langchain.embeddings import OpenAIEmbeddings\n", + "\n", + " embedding = OpenAIEmbeddings()\n", + "\n", + " elastic_host = \"cluster_id.region_id.gcp.cloud.es.io\"\n", + " elasticsearch_url = f\"https://username:password@{elastic_host}:9243\"\n", + " elastic_vector_search = ElasticVectorSearch(\n", + " elasticsearch_url=elasticsearch_url,\n", + " index_name=\"test_index\",\n", + " embedding=embedding\n", + " )\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6197931-cbe5-460c-a5e6-b5eedb83887c", + "metadata": { + "tags": [], + "id": "d6197931-cbe5-460c-a5e6-b5eedb83887c" + }, + "outputs": [], + "source": [ + "!pip install elasticsearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ab8afa-f7c6-4fbf-b596-cb512da949da", + "metadata": { + "tags": [], + "id": "67ab8afa-f7c6-4fbf-b596-cb512da949da", + "outputId": "fd16b37f-cb76-40a9-b83f-eab58dd0d912" + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "OpenAI API Key: ········\n" + ] + } + ], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')" + ] + }, + { + "cell_type": "markdown", + "id": "f6030187-0bd7-4798-8372-a265036af5e0", + "metadata": { + "tags": [], + "id": "f6030187-0bd7-4798-8372-a265036af5e0" + }, + "source": [ + "## Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aac9563e", + "metadata": { + "tags": [], + "id": "aac9563e" + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import ElasticVectorSearch\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3c3999a", + "metadata": { + "tags": [], + "id": "a3c3999a" + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12eb86d8", + "metadata": { + "tags": [], + "id": "12eb86d8" + }, + "outputs": [], + "source": [ + "db = ElasticVectorSearch.from_documents(docs, embeddings, elasticsearch_url=\"http://localhost:9200\")\n", + "\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b172de8", + "metadata": { + "id": "4b172de8", + "outputId": "ca05a209-4514-4b5c-f6cb-2348f58c19a2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n", + "\n", + "We cannot let this happen. \n", + "\n", + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# ElasticKnnSearch Class\n", + "The `ElasticKnnSearch` implements features allowing storing vectors and documents in Elasticsearch for use with approximate [kNN search](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html)" + ], + "metadata": { + "id": "FheGPztJsrRB" + }, + "id": "FheGPztJsrRB" + }, + { + "cell_type": "code", + "source": [ + "!pip install langchain elasticsearch" + ], + "metadata": { + "id": "gRVcbh5zqCJQ" + }, + "execution_count": null, + "outputs": [], + "id": "gRVcbh5zqCJQ" + }, + { + "cell_type": "code", + "source": [ + "from langchain.vectorstores.elastic_vector_search import ElasticKnnSearch\n", + "from langchain.embeddings import ElasticsearchEmbeddings\n", + "import elasticsearch" + ], + "metadata": { + "id": "TJtqiw5AqBp8" + }, + "execution_count": null, + "outputs": [], + "id": "TJtqiw5AqBp8" + }, + { + "cell_type": "code", + "source": [ + "# Initialize ElasticsearchEmbeddings\n", + "model_id = \"\" \n", + "dims = dim_count\n", + "es_cloud_id = \"ESS_CLOUD_ID\"\n", + "es_user = \"es_user\"\n", + "es_password = \"es_pass\"\n", + "test_index = \"\"\n", + "#input_field = \"your_input_field\" # if different from 'text_field'" + ], + "metadata": { + "id": "XHfC0As6qN3T" + }, + "execution_count": null, + "outputs": [], + "id": "XHfC0As6qN3T" + }, + { + "cell_type": "code", + "source": [ + "# Generate embedding object\n", + "embeddings = ElasticsearchEmbeddings.from_credentials(\n", + " model_id,\n", + " #input_field=input_field,\n", + " es_cloud_id=es_cloud_id,\n", + " es_user=es_user,\n", + " es_password=es_password,\n", + ")" + ], + "metadata": { + "id": "UkTipx1lqc3h" + }, + "execution_count": null, + "outputs": [], + "id": "UkTipx1lqc3h" + }, + { + "cell_type": "code", + "source": [ + "# Initialize ElasticKnnSearch\n", + "knn_search = ElasticKnnSearch(\n", + "\tes_cloud_id=es_cloud_id, \n", + "\tes_user=es_user, \n", + "\tes_password=es_password, \n", + "\tindex_name= test_index, \n", + "\tembedding= embeddings\n", + ")" + ], + "metadata": { + "id": "74psgD0oqjYK" + }, + "execution_count": null, + "outputs": [], + "id": "74psgD0oqjYK" + }, + { + "cell_type": "markdown", + "source": [ + "## Test adding vectors" + ], + "metadata": { + "id": "7AfgIKLWqnQl" + }, + "id": "7AfgIKLWqnQl" + }, + { + "cell_type": "code", + "source": [ + "# Test `add_texts` method\n", + "texts = [\"Hello, world!\", \"Machine learning is fun.\", \"I love Python.\"]\n", + "knn_search.add_texts(texts)\n", + "\n", + "# Test `from_texts` method\n", + "new_texts = [\"This is a new text.\", \"Elasticsearch is powerful.\", \"Python is great for data analysis.\"]\n", + "knn_search.from_texts(new_texts, dims=dims)" + ], + "metadata": { + "id": "yNUUIaL9qmze" + }, + "execution_count": null, + "outputs": [], + "id": "yNUUIaL9qmze" + }, + { + "cell_type": "markdown", + "source": [ + "## Test knn search using query vector builder " + ], + "metadata": { + "id": "0zdR-Iubquov" + }, + "id": "0zdR-Iubquov" + }, + { + "cell_type": "code", + "source": [ + "# Test `knn_search` method with model_id and query_text\n", + "query = \"Hello\"\n", + "knn_result = knn_search.knn_search(query = query, model_id= model_id, k=2)\n", + "print(f\"kNN search results for query '{query}': {knn_result}\")\n", + "print(f\"The 'text' field value from the top hit is: '{knn_result['hits']['hits'][0]['_source']['text']}'\")\n", + "\n", + "# Test `hybrid_search` method\n", + "query = \"Hello\"\n", + "hybrid_result = knn_search.knn_hybrid_search(query = query, model_id= model_id, k=2)\n", + "print(f\"Hybrid search results for query '{query}': {hybrid_result}\")\n", + "print(f\"The 'text' field value from the top hit is: '{hybrid_result['hits']['hits'][0]['_source']['text']}'\")" + ], + "metadata": { + "id": "bwR4jYvqqxTo" + }, + "execution_count": null, + "outputs": [], + "id": "bwR4jYvqqxTo" + }, + { + "cell_type": "markdown", + "source": [ + "## Test knn search using pre generated vector \n" + ], + "metadata": { + "id": "ltXYqp0qqz7R" + }, + "id": "ltXYqp0qqz7R" + }, + { + "cell_type": "code", + "source": [ + "# Generate embedding for tests\n", + "query_text = 'Hello'\n", + "query_embedding = embeddings.embed_query(query_text)\n", + "print(f\"Length of embedding: {len(query_embedding)}\\nFirst two items in embedding: {query_embedding[:2]}\")\n", + "\n", + "# Test knn Search\n", + "knn_result = knn_search.knn_search(query_vector = query_embedding, k=2)\n", + "print(f\"The 'text' field value from the top hit is: '{knn_result['hits']['hits'][0]['_source']['text']}'\")\n", + "\n", + "# Test hybrid search - Requires both query_text and query_vector\n", + "knn_result = knn_search.knn_hybrid_search(query_vector = query_embedding, query=query_text, k=2)\n", + "print(f\"The 'text' field value from the top hit is: '{knn_result['hits']['hits'][0]['_source']['text']}'\")" + ], + "metadata": { + "id": "O5COtpTqq23t" + }, + "execution_count": null, + "outputs": [], + "id": "O5COtpTqq23t" + }, + { + "cell_type": "markdown", + "source": [ + "## Test source option" + ], + "metadata": { + "id": "0dnmimcJq42C" + }, + "id": "0dnmimcJq42C" + }, + { + "cell_type": "code", + "source": [ + "# Test `knn_search` method with model_id and query_text\n", + "query = \"Hello\"\n", + "knn_result = knn_search.knn_search(query = query, model_id= model_id, k=2, source=False)\n", + "assert not '_source' in knn_result['hits']['hits'][0].keys()\n", + "\n", + "# Test `hybrid_search` method\n", + "query = \"Hello\"\n", + "hybrid_result = knn_search.knn_hybrid_search(query = query, model_id= model_id, k=2, source=False)\n", + "assert not '_source' in hybrid_result['hits']['hits'][0].keys()" + ], + "metadata": { + "id": "v4_B72nHq7g1" + }, + "execution_count": null, + "outputs": [], + "id": "v4_B72nHq7g1" + }, + { + "cell_type": "markdown", + "source": [ + "## Test fields option " + ], + "metadata": { + "id": "teHgJgrlq-Jb" + }, + "id": "teHgJgrlq-Jb" + }, + { + "cell_type": "code", + "source": [ + "# Test `knn_search` method with model_id and query_text\n", + "query = \"Hello\"\n", + "knn_result = knn_search.knn_search(query = query, model_id= model_id, k=2, fields=['text'])\n", + "assert 'text' in knn_result['hits']['hits'][0]['fields'].keys()\n", + "\n", + "# Test `hybrid_search` method\n", + "query = \"Hello\"\n", + "hybrid_result = knn_search.knn_hybrid_search(query = query, model_id= model_id, k=2, fields=['text'])\n", + "assert 'text' in hybrid_result['hits']['hits'][0]['fields'].keys()" + ], + "metadata": { + "id": "utNBbpZYrAYW" + }, + "execution_count": null, + "outputs": [], + "id": "utNBbpZYrAYW" + }, + { + "cell_type": "markdown", + "source": [ + "### Test with es client connection rather than cloud_id " + ], + "metadata": { + "id": "hddsIFferBy1" + }, + "id": "hddsIFferBy1" + }, + { + "cell_type": "code", + "source": [ + "# Create Elasticsearch connection\n", + "es_connection = Elasticsearch(\n", + " hosts=['https://es_cluster_url:port'], \n", + " basic_auth=('user', 'password')\n", + ")" + ], + "metadata": { + "id": "bXqrUnoirFia" + }, + "execution_count": null, + "outputs": [], + "id": "bXqrUnoirFia" + }, + { + "cell_type": "code", + "source": [ + "# Instantiate ElasticsearchEmbeddings using es_connection\n", + "embeddings = ElasticsearchEmbeddings.from_es_connection(\n", + " model_id,\n", + " es_connection,\n", + ")" + ], + "metadata": { + "id": "TIM__Hm8rSEW" + }, + "execution_count": null, + "outputs": [], + "id": "TIM__Hm8rSEW" + }, + { + "cell_type": "code", + "source": [ + "# Initialize ElasticKnnSearch\n", + "knn_search = ElasticKnnSearch(\n", + "\tes_connection = es_connection,\n", + "\tindex_name= test_index, \n", + "\tembedding= embeddings\n", + ")" + ], + "metadata": { + "id": "1-CdnOrArVc_" + }, + "execution_count": null, + "outputs": [], + "id": "1-CdnOrArVc_" + }, + { + "cell_type": "code", + "source": [ + "# Test `knn_search` method with model_id and query_text\n", + "query = \"Hello\"\n", + "knn_result = knn_search.knn_search(query = query, model_id= model_id, k=2)\n", + "print(f\"kNN search results for query '{query}': {knn_result}\")\n", + "print(f\"The 'text' field value from the top hit is: '{knn_result['hits']['hits'][0]['_source']['text']}'\")\n" + ], + "metadata": { + "id": "0kgyaL6QrYVF" + }, + "execution_count": null, + "outputs": [], + "id": "0kgyaL6QrYVF" } - ], - "source": [ - "import os\n", - "import getpass\n", - "\n", - "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')" - ] - }, - { - "cell_type": "markdown", - "id": "f6030187-0bd7-4798-8372-a265036af5e0", - "metadata": { - "tags": [] - }, - "source": [ - "## Example" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "aac9563e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import ElasticVectorSearch\n", - "from langchain.document_loaders import TextLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "a3c3999a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "loader = TextLoader('../../../state_of_the_union.txt')\n", - "documents = loader.load()\n", - "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", - "docs = text_splitter.split_documents(documents)\n", - "\n", - "embeddings = OpenAIEmbeddings()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12eb86d8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "db = ElasticVectorSearch.from_documents(docs, embeddings, elasticsearch_url=\"http://localhost:9200\")\n", - "\n", - "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = db.similarity_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "4b172de8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. \n", - "\n", - "We cannot let this happen. \n", - "\n", - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" - ] + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + }, + "colab": { + "provenance": [] } - ], - "source": [ - "print(docs[0].page_content)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a359ed74", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py index d67974d162cca..ea7a836f5d710 100644 --- a/langchain/vectorstores/elastic_vector_search.py +++ b/langchain/vectorstores/elastic_vector_search.py @@ -3,13 +3,26 @@ import uuid from abc import ABC -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Iterable, + List, + Mapping, + Optional, + Tuple, + Union, +) from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.utils import get_from_env from langchain.vectorstores.base import VectorStore +if TYPE_CHECKING: + from elasticsearch import Elasticsearch + def _default_text_mapping(dim: int) -> Dict: return { @@ -304,3 +317,239 @@ def client_search( index=index_name, body={"query": script_query, "size": size} ) return response + + +class ElasticKnnSearch(ElasticVectorSearch): + """ + A class for performing k-Nearest Neighbors (k-NN) search on an Elasticsearch index. + The class is designed for a text search scenario where documents are text strings + and their embeddings are vector representations of those strings. + """ + + def __init__( + self, + index_name: str, + embedding: Embeddings, + es_connection: Optional["Elasticsearch"] = None, + es_cloud_id: Optional[str] = None, + es_user: Optional[str] = None, + es_password: Optional[str] = None, + ): + """ + Initializes an instance of the ElasticKnnSearch class and sets up the + Elasticsearch client. + + Args: + index_name: The name of the Elasticsearch index. + embedding: An instance of the Embeddings class, used to generate vector + representations of text strings. + es_connection: An existing Elasticsearch connection. + es_cloud_id: The Cloud ID of the Elasticsearch instance. Required if + creating a new connection. + es_user: The username for the Elasticsearch instance. Required if + creating a new connection. + es_password: The password for the Elasticsearch instance. Required if + creating a new connection. + """ + + try: + import elasticsearch + except ImportError: + raise ImportError( + "Could not import elasticsearch python package. " + "Please install it with `pip install elasticsearch`." + ) + + self.embedding = embedding + self.index_name = index_name + + # If a pre-existing Elasticsearch connection is provided, use it. + if es_connection is not None: + self.client = es_connection + else: + # If credentials for a new Elasticsearch connection are provided, + # create a new connection. + if es_cloud_id and es_user and es_password: + self.client = elasticsearch.Elasticsearch( + cloud_id=es_cloud_id, basic_auth=(es_user, es_password) + ) + else: + raise ValueError( + """Either provide a pre-existing Elasticsearch connection, \ + or valid credentials for creating a new connection.""" + ) + + @staticmethod + def _default_knn_mapping(dims: int) -> Dict: + """Generates a default index mapping for kNN search.""" + return { + "properties": { + "text": {"type": "text"}, + "vector": { + "type": "dense_vector", + "dims": dims, + "index": True, + "similarity": "dot_product", + }, + } + } + + @staticmethod + def _default_knn_query( + query_vector: Optional[List[float]] = None, + query: Optional[str] = None, + model_id: Optional[str] = None, + field: Optional[str] = "vector", + k: Optional[int] = 10, + num_candidates: Optional[int] = 10, + ) -> Dict: + knn: Dict = { + "field": field, + "k": k, + "num_candidates": num_candidates, + } + + # Case 1: `query_vector` is provided, but not `model_id` -> use query_vector + if query_vector and not model_id: + knn["query_vector"] = query_vector + + # Case 2: `query` and `model_id` are provided, -> use query_vector_builder + elif query and model_id: + knn["query_vector_builder"] = { + "text_embedding": { + "model_id": model_id, # use 'model_id' argument + "model_text": query, # use 'query' argument + } + } + + else: + raise ValueError( + "Either `query_vector` or `model_id` must be provided, but not both." + ) + + return knn + + def knn_search( + self, + query: Optional[str] = None, + k: Optional[int] = 10, + query_vector: Optional[List[float]] = None, + model_id: Optional[str] = None, + size: Optional[int] = 10, + source: Optional[bool] = True, + fields: Optional[ + Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None] + ] = None, + ) -> Dict: + """ + Performs a k-nearest neighbor (k-NN) search on the Elasticsearch index. + + The search can be conducted using either a raw query vector or a model ID. + The method first generates + the body of the search query, which can be interpreted by Elasticsearch. + It then performs the k-NN + search on the Elasticsearch index and returns the results. + + Args: + query: The query or queries to be used for the search. Required if + `query_vector` is not provided. + k: The number of nearest neighbors to return. Defaults to 10. + query_vector: The query vector to be used for the search. Required if + `query` is not provided. + model_id: The ID of the model to use for generating the query vector, if + `query` is provided. + size: The number of search hits to return. Defaults to 10. + source: Whether to include the source of each hit in the results. + fields: The fields to include in the source of each hit. If None, all + fields are included. + + Returns: + The search results. + + Raises: + ValueError: If neither `query_vector` nor `model_id` is provided, or if + both are provided. + """ + + knn_query_body = self._default_knn_query( + query_vector=query_vector, query=query, model_id=model_id, k=k + ) + + # Perform the kNN search on the Elasticsearch index and return the results. + res = self.client.search( + index=self.index_name, + knn=knn_query_body, + size=size, + source=source, + fields=fields, + ) + return dict(res) + + def knn_hybrid_search( + self, + query: Optional[str] = None, + k: Optional[int] = 10, + query_vector: Optional[List[float]] = None, + model_id: Optional[str] = None, + size: Optional[int] = 10, + source: Optional[bool] = True, + knn_boost: Optional[float] = 0.9, + query_boost: Optional[float] = 0.1, + fields: Optional[ + Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None] + ] = None, + ) -> Dict[Any, Any]: + """Performs a hybrid k-nearest neighbor (k-NN) and text-based search on the + Elasticsearch index. + + The search can be conducted using either a raw query vector or a model ID. + The method first generates + the body of the k-NN search query and the text-based query, which can be + interpreted by Elasticsearch. + It then performs the hybrid search on the Elasticsearch index and returns the + results. + + Args: + query: The query or queries to be used for the search. Required if + `query_vector` is not provided. + k: The number of nearest neighbors to return. Defaults to 10. + query_vector: The query vector to be used for the search. Required if + `query` is not provided. + model_id: The ID of the model to use for generating the query vector, if + `query` is provided. + size: The number of search hits to return. Defaults to 10. + source: Whether to include the source of each hit in the results. + knn_boost: The boost factor for the k-NN part of the search. + query_boost: The boost factor for the text-based part of the search. + fields + The fields to include in the source of each hit. If None, all fields are + included. Defaults to None. + + Returns: + The search results. + + Raises: + ValueError: If neither `query_vector` nor `model_id` is provided, or if + both are provided. + """ + + knn_query_body = self._default_knn_query( + query_vector=query_vector, query=query, model_id=model_id, k=k + ) + + # Modify the knn_query_body to add a "boost" parameter + knn_query_body["boost"] = knn_boost + + # Generate the body of the standard Elasticsearch query + match_query_body = {"match": {"text": {"query": query, "boost": query_boost}}} + + # Perform the hybrid search on the Elasticsearch index and return the results. + res = self.client.search( + index=self.index_name, + query=match_query_body, + knn=knn_query_body, + fields=fields, + size=size, + source=source, + ) + return dict(res)