VEC-223: Documentation for sparse and hybrid indexes

Added them under features, also updated the REST API specification.
upstash · Dec 12, 2024 · cfeaa66 · cfeaa66
1 parent f56b544
commit cfeaa66
Show file tree

Hide file tree

Showing 14 changed files with 915 additions and 16 deletions.
diff --git a/mint.json b/mint.json
@@ -766,7 +766,9 @@
             "vector/features/filtering",
             "vector/features/embeddingmodels",
             "vector/features/namespaces",
-            "vector/features/resumablequery"
+            "vector/features/resumablequery",
+            "vector/features/sparseindexes",
+            "vector/features/hybridindexes"
           ]
         },
         {

diff --git a/vector/api/endpoints/fetch-random.mdx b/vector/api/endpoints/fetch-random.mdx
@@ -23,8 +23,11 @@ The response will be `null` if the namespace is empty.
 <ResponseField name="id" type="string" required>
   The id of the vector.
 </ResponseField>
-<ResponseField name="vector" type="number[]" required>
-  The vector value.
+<ResponseField name="vector" type="number[]">
+  The dense vector value for dense and hybrid indexes.
+</ResponseField>
+<ResponseField name="sparseVector" type="Object[]">
+  The sparse vector value for sparse and hybrid indexes.
 </ResponseField>
 
 <RequestExample>

diff --git a/vector/api/endpoints/fetch.mdx b/vector/api/endpoints/fetch.mdx
@@ -49,7 +49,10 @@ their vector ids.
       The id of the vector.
     </ResponseField>
     <ResponseField name="vector" type="number[]">
-      The vector value.
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/query-data.mdx b/vector/api/endpoints/query-data.mdx
@@ -44,6 +44,23 @@ of fields below.
 <ParamField body="filter" type="string" default="">
   [Metadata filter](/vector/features/filtering) to apply.
 </ParamField>
+<ParamField body="weightingStrategy" type="string">
+  For sparse vectors of sparse and hybrid indexes, specifies what kind of
+  weighting strategy should be used while querying the matching non-zero
+  dimension values of the query vector with the documents.
+
+  If not provided, no weighting will be used.
+
+  The only possible value is `IDF` (inverse document frequency).
+</ParamField>
+<ParamField body="fusionAlgorithm" type="string">
+  Fusion algorithm to use while fusing scores
+  from dense and sparse components of a hybrid index.
+
+  If not provided, defaults to `RRF` (Reciprocal Rank Fusion).
+
+  The other possible value is `DBSF` (Distribution-Based Score Fusion).
+</ParamField>
 
 ## Path
 
@@ -61,9 +78,12 @@ If the request was an array of more than one items, an array of
 objects below is returned, one for each query item.
 
 <Note>
-  The score is normalized to always be between 0 and 1.
+  For dense indexes, the score is normalized to always be between 0 and 1.
   The closer the score is to 1, the more similar the vector is to the query vector.
   This does not depend on the distance metric you use.
+
+  For sparse and hybrid indexes, scores can be arbitrary values, but the score
+  will be higher for more similar vectors.
 </Note>
 
 <ResponseField name="Scores" type="Object[]">
@@ -75,7 +95,10 @@ objects below is returned, one for each query item.
       The similarity score of the vector, calculated based on the distance metric of your index.
     </ResponseField>
     <ResponseField name="vector" type="number[]">
-      The vector value.
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/query.mdx b/vector/api/endpoints/query.mdx
@@ -40,6 +40,23 @@ of fields below.
 <ParamField body="filter" type="string" default="">
   [Metadata filter](/vector/features/filtering) to apply.
 </ParamField>
+<ParamField body="weightingStrategy" type="string">
+  For sparse vectors of sparse and hybrid indexes, specifies what kind of
+  weighting strategy should be used while querying the matching non-zero
+  dimension values of the query vector with the documents.
+
+  If not provided, no weighting will be used.
+
+  The only possible value is `IDF` (inverse document frequency).
+</ParamField>
+<ParamField body="fusionAlgorithm" type="string">
+  Fusion algorithm to use while fusing scores
+  from dense and sparse components of a hybrid index.
+
+  If not provided, defaults to `RRF` (Reciprocal Rank Fusion).
+
+  The other possible value is `DBSF` (Distribution-Based Score Fusion).
+</ParamField>
 
 ## Path
 
@@ -57,9 +74,12 @@ If the request was an array of more than one items, an array of
 objects below is returned, one for each query item.
 
 <Note>
-  The score is normalized to always be between 0 and 1.
+  For dense indexes, the score is normalized to always be between 0 and 1.
   The closer the score is to 1, the more similar the vector is to the query vector.
   This does not depend on the distance metric you use.
+
+  For sparse and hybrid indexes, scores can be arbitrary values, but the score
+  will be higher for more similar vectors.
 </Note>
 
 <ResponseField name="Scores" type="Object[]">
@@ -71,7 +91,10 @@ objects below is returned, one for each query item.
       The similarity score of the vector, calculated based on the distance metric of your index.
     </ResponseField>
     <ResponseField name="vector" type="number[]">
-      The vector value.
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/range.mdx b/vector/api/endpoints/range.mdx
@@ -52,8 +52,11 @@ authMethod: "GET"
     <ResponseField name="id" type="string" required>
       The id of the vector.
     </ResponseField>
-    <ResponseField name="vector" type="number[]" required>
-      The vector value.
+    <ResponseField name="vector" type="number[]">
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/resumable-query/resume.mdx b/vector/api/endpoints/resumable-query/resume.mdx
@@ -27,7 +27,10 @@ authMethod: "bearer"
       metric of your index.
     </ResponseField>
     <ResponseField name="vector" type="number[]">
-      The vector value.
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/resumable-query/start-with-data.mdx b/vector/api/endpoints/resumable-query/start-with-data.mdx
@@ -40,6 +40,25 @@ authMethod: "bearer"
   Maximum idle time for the resumable query in seconds.
 </ParamField>
 
+<ParamField body="weightingStrategy" type="string">
+  For sparse vectors of sparse and hybrid indexes, specifies what kind of
+  weighting strategy should be used while querying the matching non-zero
+  dimension values of the query vector with the documents.
+
+  If not provided, no weighting will be used.
+
+  The only possible value is `IDF` (inverse document frequency).
+</ParamField>
+
+<ParamField body="fusionAlgorithm" type="string">
+  Fusion algorithm to use while fusing scores
+  from dense and sparse components of a hybrid index.
+
+  If not provided, defaults to `RRF` (Reciprocal Rank Fusion).
+
+  The other possible value is `DBSF` (Distribution-Based Score Fusion).
+</ParamField>
+
 ## Path
 
 <ParamField path="namespace" type="string" default="">

diff --git a/vector/api/endpoints/resumable-query/start-with-vector.mdx b/vector/api/endpoints/resumable-query/start-with-vector.mdx
@@ -46,6 +46,25 @@ authMethod: "bearer"
   Maximum idle time for the resumable query in seconds.
 </ParamField>
 
+<ParamField body="weightingStrategy" type="string">
+  For sparse vectors of sparse and hybrid indexes, specifies what kind of
+  weighting strategy should be used while querying the matching non-zero
+  dimension values of the query vector with the documents.
+
+  If not provided, no weighting will be used.
+
+  The only possible value is `IDF` (inverse document frequency).
+</ParamField>
+
+<ParamField body="fusionAlgorithm" type="string">
+  Fusion algorithm to use while fusing scores
+  from dense and sparse components of a hybrid index.
+
+  If not provided, defaults to `RRF` (Reciprocal Rank Fusion).
+
+  The other possible value is `DBSF` (Distribution-Based Score Fusion).
+</ParamField>
+
 ## Path
 
 <ParamField path="namespace" type="string" default="">
@@ -69,7 +88,10 @@ authMethod: "bearer"
       metric of your index.
     </ResponseField>
     <ResponseField name="vector" type="number[]">
-      The vector value.
+      The dense vector value for dense and hybrid indexes.
+    </ResponseField>
+    <ResponseField name="sparseVector" type="Object[]">
+      The sparse vector value for sparse and hybrid indexes.
     </ResponseField>
     <ResponseField name="metadata" type="Object">
       The metadata of the vector, if any.

diff --git a/vector/api/endpoints/update.mdx b/vector/api/endpoints/update.mdx
@@ -19,9 +19,12 @@ of those.
   The id of the vector.
 </ParamField>
 <ParamField body="vector" type="number[]">
-  The vector value to update to.
+  The dense vector value to update to for dense and hybrid indexes.
   <Note>The vector should have the same dimensions as your index.</Note>
 </ParamField>
+<ParamField body="sparseVector" type="Object[]">
+  The sparse vector value to update to for sparse and hybrid indexes.
+</ParamField>
 <ParamField body="data" type="string">
   The raw text data to update to.
   <Note>If the index is created with an [embedding model](/vector/features/embeddingmodels)
@@ -38,6 +41,11 @@ of those.
   `OVERWRITE` for overwrite, `PATCH` for patch.
 </ParamField>
 
+<Note>
+For hybrid indexes, either both or neither of the `vector` and `sparseVector`
+fields must be present. Updating only one of them is not allowed.
+</Note>
+
 ## Path
 
 <ParamField path="namespace" type="string" default="">

diff --git a/vector/api/endpoints/upsert.mdx b/vector/api/endpoints/upsert.mdx
@@ -17,10 +17,13 @@ You can either upsert a single vector, or multiple vectors in an array.
 <ParamField body="id" type="string" required>
   The id of the vector.
 </ParamField>
-<ParamField body="vector" type="number[]" required>
-  The vector value.
+<ParamField body="vector" type="number[]">
+  The dense vector value for dense and hybrid indexes.
   <Note>The vector should have the same dimensions as your index.</Note>
 </ParamField>
+<ParamField body="sparseVector" type="Object[]">
+  The sparse vector value for sparse and hybrid indexes.
+</ParamField>
 <ParamField body="metadata" type="Object">
   The metadata of the vector. This makes identifying vectors
   on retrieval easier and can be used to with filters on queries.
@@ -30,6 +33,14 @@ You can either upsert a single vector, or multiple vectors in an array.
   data, which can be anything associated with this vector.
 </ParamField>
 
+<Note>
+For dense indexes, only `vector` should be provided, and `sparseVector` should not be set.
+
+For sparse indexes, only `sparseVector` should be provided, and `vector` should not be set.
+
+For hybrid indexes, both `vector` and `sparseVector` must be present.
+</Note>
+
 ## Path
 
 <ParamField path="namespace" type="string" default="">

diff --git a/vector/features/embeddingmodels.mdx b/vector/features/embeddingmodels.mdx
@@ -32,7 +32,7 @@ Upstash Vector comes with a variety of embedding models that score well in the
 for measuring the performance of embedding models. They support use cases such
 as classification, clustering, or retrieval.
 
-You can choose the following general purpose models:
+You can choose the following general purpose models for dense and hybrid indexes:
 
 | Name                                                                                                    | Dimension | Sequence Length | MTEB  |
 | ------------------------------------------------------------------------------------------------------- | --------- | --------------- | ----- |
@@ -56,6 +56,15 @@ You can choose the following general purpose models:
   MTEB score for the `BAAI/bge-m3` is not fully measured.
 </Note>
 
+For sparse and hybrid indexes, only the following models can be selected:
+
+| Name                                              |
+| ------------------------------------------------- |
+| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) |
+| [BM25](https://en.wikipedia.org/wiki/Okapi_BM25)  |
+
+See [Creating Sparse Vectors](/vector/features/sparseindexes#creating-sparse-vectors) for the details of the above models.
+
 ## Using a Model
 
 To start using embedding models, create the index with a model of your choice.