diff --git a/.gitignore b/.gitignore index 354e08c2d86d0..0d7c9ce69292e 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,4 @@ wandb/ # asdf tool versions .tool-versions +/.ruff_cache/ diff --git a/langchain/vectorstores/elastic_vector_search.py b/langchain/vectorstores/elastic_vector_search.py index e808e80f69c9b..e0eb7b15c6e3c 100644 --- a/langchain/vectorstores/elastic_vector_search.py +++ b/langchain/vectorstores/elastic_vector_search.py @@ -241,7 +241,7 @@ def from_texts( raise ValueError( "Your elasticsearch client string is misformatted. " f"Got error: {e} " ) - index_name = uuid.uuid4().hex + index_name = kwargs.get("index_name", uuid.uuid4().hex) embeddings = embedding.embed_documents(texts) dim = len(embeddings[0]) mapping = _default_text_mapping(dim) diff --git a/poetry.lock b/poetry.lock index eb5c572cea930..c6f07a717ebd2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -1144,7 +1144,6 @@ files = [ {file = "debugpy-1.6.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b5d1b13d7c7bf5d7cf700e33c0b8ddb7baf030fcf502f76fc061ddd9405d16c"}, {file = "debugpy-1.6.6-cp38-cp38-win32.whl", hash = "sha256:70ab53918fd907a3ade01909b3ed783287ede362c80c75f41e79596d5ccacd32"}, {file = "debugpy-1.6.6-cp38-cp38-win_amd64.whl", hash = "sha256:c05349890804d846eca32ce0623ab66c06f8800db881af7a876dc073ac1c2225"}, - {file = "debugpy-1.6.6-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:11a0f3a106f69901e4a9a5683ce943a7a5605696024134b522aa1bfda25b5fec"}, {file = "debugpy-1.6.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a771739902b1ae22a120dbbb6bd91b2cae6696c0e318b5007c5348519a4211c6"}, {file = "debugpy-1.6.6-cp39-cp39-win32.whl", hash = "sha256:549ae0cb2d34fc09d1675f9b01942499751d174381b6082279cf19cdb3c47cbe"}, {file = "debugpy-1.6.6-cp39-cp39-win_amd64.whl", hash = "sha256:de4a045fbf388e120bb6ec66501458d3134f4729faed26ff95de52a754abddb1"}, @@ -4153,14 +4152,14 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] [[package]] name = "openai" -version = "0.27.3" +version = "0.27.4" description = "Python client library for the OpenAI API" category = "main" -optional = true +optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-0.27.3-py3-none-any.whl", hash = "sha256:d5fca76f541f123a43d27baa5987a8a2949ae2b758180bdebd29b52f67d5ac4c"}, - {file = "openai-0.27.3.tar.gz", hash = "sha256:0941a7322dc1ddbf15ed76702bb88d4f0c7586c3536433906dbd24cf6f2398d9"}, + {file = "openai-0.27.4-py3-none-any.whl", hash = "sha256:3b82c867d531e1fd2003d9de2131e1c4bfd4c70b1a3149e0543a555b30807b70"}, + {file = "openai-0.27.4.tar.gz", hash = "sha256:9f9d27d26e62c6068f516c0729449954b5ef6994be1a6cbfe7dbefbc84423a04"}, ] [package.dependencies] @@ -6813,7 +6812,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" 
and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -7552,7 +7551,7 @@ name = "tqdm" version = "4.65.0" description = "Fast, Extensible Progress Meter" category = "main" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, @@ -8473,13 +8472,13 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm"] +all = 
["aleph-alpha-client", "anthropic", "beautifulsoup4", "boto3", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] +llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "d8c864a82150029b9f7cd66dd8bf9b77c985ace08475d9fbca4d9d11712c53d6" +content-hash = "bd1c3cfb286c9e27e189bad22cfa272223234a38fec4f6c7220fe181d133aa78" diff --git a/pyproject.toml b/pyproject.toml index 3db5082d9ce91..2b269d7cb1be3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,6 +82,13 @@ freezegun = "^1.2.2" responses = "^0.22.0" pytest-asyncio = "^0.20.3" +[tool.poetry.group.test_integration] +optional = true + +[tool.poetry.group.test_integration.dependencies] +openai = "^0.27.4" +elasticsearch = {extras = ["async"], version = "^8.6.2"} + [tool.poetry.group.lint.dependencies] ruff = "^0.0.249" types-toml = "^0.10.8.1" diff --git a/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml b/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml new file mode 100644 index 0000000000000..609cf4e9f7fc6 --- /dev/null +++ b/tests/integration_tests/vectorstores/docker-compose/elasticsearch.yml @@ -0,0 +1,30 @@ +version: "3" + +services: + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.7.0 + environment: + - discovery.type=single-node 
+ - xpack.security.enabled=false + - xpack.security.http.ssl.enabled=false + - ELASTIC_PASSWORD=password + ports: + - "9200:9200" + healthcheck: + test: [ "CMD-SHELL", "curl --silent --fail http://localhost:9200/_cluster/health || exit 1" ] + interval: 1s + retries: 360 + + kibana: + image: docker.elastic.co/kibana/kibana:8.7.0 + environment: + - ELASTICSEARCH_URL=http://elasticsearch:9200 + - ELASTICSEARCH_USERNAME=kibana_system + - ELASTICSEARCH_PASSWORD=password + - KIBANA_PASSWORD=password + ports: + - "5601:5601" + healthcheck: + test: [ "CMD-SHELL", "curl --silent --fail http://localhost:5601/login || exit 1" ] + interval: 10s + retries: 60 diff --git a/tests/integration_tests/vectorstores/fixtures/sharks.txt b/tests/integration_tests/vectorstores/fixtures/sharks.txt new file mode 100644 index 0000000000000..b2aeb8f20c056 --- /dev/null +++ b/tests/integration_tests/vectorstores/fixtures/sharks.txt @@ -0,0 +1,7 @@ +Sharks are a group of elasmobranch fish characterized by a cartilaginous skeleton, five to seven gill slits on the sides of the head, and pectoral fins that are not fused to the head. Modern sharks are classified within the clade Selachimorpha (or Selachii) and are the sister group to the Batoidea (rays and kin). Some sources extend the term "shark" as an informal category including extinct members of Chondrichthyes (cartilaginous fish) with a shark-like morphology, such as hybodonts and xenacanths. Shark-like chondrichthyans such as Cladoselache and Doliodus first appeared in the Devonian Period (419-359 Ma), though some fossilized chondrichthyan-like scales are as old as the Late Ordovician (458-444 Ma). The oldest modern sharks (selachians) are known from the Early Jurassic, about 200 Ma. 
+ +Sharks range in size from the small dwarf lanternshark (Etmopterus perryi), a deep sea species that is only 17 centimetres (6.7 in) in length, to the whale shark (Rhincodon typus), the largest fish in the world, which reaches approximately 12 metres (40 ft) in length. They are found in all seas and are common to depths up to 2,000 metres (6,600 ft). They generally do not live in freshwater, although there are a few known exceptions, such as the bull shark and the river shark, which can be found in both seawater and freshwater. Sharks have a covering of dermal denticles that protects their skin from damage and parasites in addition to improving their fluid dynamics. They have numerous sets of replaceable teeth. + +Several species are apex predators, which are organisms that are at the top of their food chain. Select examples include the tiger shark, blue shark, great white shark, mako shark, thresher shark, and hammerhead shark. + +Sharks are caught by humans for shark meat or shark fin soup. Many shark populations are threatened by human activities. Since 1970, shark populations have been reduced by 71%, mostly from overfishing. 
\ No newline at end of file diff --git a/tests/integration_tests/vectorstores/test_elasticsearch.py b/tests/integration_tests/vectorstores/test_elasticsearch.py index 075fab4adae6f..c5222565f4c5f 100644 --- a/tests/integration_tests/vectorstores/test_elasticsearch.py +++ b/tests/integration_tests/vectorstores/test_elasticsearch.py @@ -1,29 +1,137 @@ """Test ElasticSearch functionality.""" +import logging +import os +from typing import Generator, List, Union + +import pytest +from elasticsearch import Elasticsearch from langchain.docstore.document import Document +from langchain.document_loaders import TextLoader +from langchain.embeddings import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings +logging.basicConfig(level=logging.DEBUG) + +""" +cd tests/integration_tests/vectorstores/docker-compose +docker-compose -f elasticsearch.yml up +""" + + +class TestElasticsearch: + @pytest.fixture(scope="class", autouse=True) + def elasticsearch_url(self) -> Union[str, Generator[str, None, None]]: + """Return the elasticsearch url.""" + url = "http://localhost:9200" + yield url + es = Elasticsearch(hosts=url) + + # Clear all indexes + index_names = es.indices.get(index="_all").keys() + for index_name in index_names: + # print(index_name) + es.indices.delete(index=index_name) + + @pytest.fixture(scope="class", autouse=True) + def openai_api_key(self) -> Union[str, Generator[str, None, None]]: + """Return the OpenAI API key.""" + openai_api_key = os.getenv("OPENAI_API_KEY") + if not openai_api_key: + raise ValueError("OPENAI_API_KEY environment variable is not set") + + yield openai_api_key + + @pytest.fixture(scope="class") + def documents(self) -> Generator[List[Document], None, None]: + """Return a generator that yields a list of documents.""" + text_splitter = 
CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + + documents = TextLoader( + os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt") + ).load() + yield text_splitter.split_documents(documents) + + def test_similarity_search_without_metadata(self, elasticsearch_url: str) -> None: + """Test end to end construction and search without metadata.""" + texts = ["foo", "bar", "baz"] + docsearch = ElasticVectorSearch.from_texts( + texts, FakeEmbeddings(), elasticsearch_url=elasticsearch_url + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + def test_similarity_search_with_metadata(self, elasticsearch_url: str) -> None: + """Test end to end construction and search with metadata.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = ElasticVectorSearch.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + elasticsearch_url=elasticsearch_url, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": 0})] + + def test_default_index_from_documents( + self, documents: List[Document], openai_api_key: str, elasticsearch_url: str + ) -> None: + """This test checks the construction of a default + ElasticSearch index using the 'from_documents'.""" + embedding = OpenAIEmbeddings(openai_api_key=openai_api_key) + + elastic_vector_search = ElasticVectorSearch.from_documents( + documents=documents, + embedding=embedding, + elasticsearch_url=elasticsearch_url, + ) + + search_result = elastic_vector_search.similarity_search("sharks") + + print(search_result) + assert len(search_result) != 0 + + def test_custom_index_from_documents( + self, documents: List[Document], openai_api_key: str, elasticsearch_url: str + ) -> None: + """This test checks the construction of a custom + ElasticSearch index using the 'from_documents'.""" + embedding = OpenAIEmbeddings(openai_api_key=openai_api_key) + 
elastic_vector_search = ElasticVectorSearch.from_documents( + documents=documents, + embedding=embedding, + elasticsearch_url=elasticsearch_url, + index_name="custom_index", + ) + es = Elasticsearch(hosts=elasticsearch_url) + index_names = es.indices.get(index="_all").keys() + assert "custom_index" in index_names + + search_result = elastic_vector_search.similarity_search("sharks") + print(search_result) + + assert len(search_result) != 0 + + def test_custom_index_add_documents( + self, documents: List[Document], openai_api_key: str, elasticsearch_url: str + ) -> None: + """This test checks the construction of a custom + ElasticSearch index using the 'add_documents'.""" + embedding = OpenAIEmbeddings(openai_api_key=openai_api_key) + elastic_vector_search = ElasticVectorSearch( + embedding=embedding, + elasticsearch_url=elasticsearch_url, + index_name="custom_index", + ) + es = Elasticsearch(hosts=elasticsearch_url) + index_names = es.indices.get(index="_all").keys() + assert "custom_index" in index_names + + elastic_vector_search.add_documents(documents) + search_result = elastic_vector_search.similarity_search("sharks") + print(search_result) -def test_elasticsearch() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - docsearch = ElasticVectorSearch.from_texts( - texts, FakeEmbeddings(), elasticsearch_url="http://localhost:9200" - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo")] - - -def test_elasticsearch_with_metadatas() -> None: - """Test end to end construction and search.""" - texts = ["foo", "bar", "baz"] - metadatas = [{"page": i} for i in range(len(texts))] - docsearch = ElasticVectorSearch.from_texts( - texts, - FakeEmbeddings(), - metadatas=metadatas, - elasticsearch_url="http://localhost:9200", - ) - output = docsearch.similarity_search("foo", k=1) - assert output == [Document(page_content="foo", metadata={"page": 0})] + assert len(search_result) != 0