Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

.Net: Improve langchain interop sample further #9472

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Text.Json;
using System.Text.Json.Nodes;
using System.Text.Json.Serialization;
using Azure.Search.Documents.Indexes;
using Microsoft.Extensions.VectorData;
using Microsoft.SemanticKernel.Connectors.AzureAISearch;

namespace Memory.VectorStoreLangchainInterop;

/// <summary>
/// Contains factory methods that can be used to create an Azure AI Search vector store that is compatible with datasets ingested using Langchain.
/// </summary>
/// <remarks>
/// This class is used with the <see cref="VectorStore_Langchain_Interop"/> sample.
/// </remarks>
public static class AzureAISearchFactory
{
    /// <summary>
    /// Record definition that matches the storage format used by Langchain for Azure AI Search.
    /// </summary>
    private static readonly VectorStoreRecordDefinition s_recordDefinition = new()
    {
        Properties = new List<VectorStoreRecordProperty>
        {
            new VectorStoreRecordKeyProperty("id", typeof(string)),
            new VectorStoreRecordDataProperty("content", typeof(string)),
            new VectorStoreRecordDataProperty("metadata", typeof(string)),
            new VectorStoreRecordVectorProperty("content_vector", typeof(ReadOnlyMemory<float>)) { Dimensions = 1536 }
        }
    };

    /// <summary>
    /// Create a new Azure AI Search-backed <see cref="IVectorStore"/> that can be used to read data that was ingested using Langchain.
    /// </summary>
    /// <param name="searchIndexClient">Azure AI Search client that can be used to manage the list of indices in an Azure AI Search Service.</param>
    /// <returns>The <see cref="IVectorStore"/>.</returns>
    public static IVectorStore CreateAzureAISearchLangchainInteropVectorStore(SearchIndexClient searchIndexClient)
    {
        // Create a vector store that uses our custom factory for creating collections
        // so that the collection can be configured to be compatible with Langchain.
        return new AzureAISearchVectorStore(
            searchIndexClient,
            new()
            {
                VectorStoreCollectionFactory = new AzureAISearchVectorStoreRecordCollectionFactory()
            });
    }

    /// <summary>
    /// Create a new Azure AI Search-backed <see cref="IVectorStore"/> that can be used to read data that was ingested using Langchain.
    /// </summary>
    /// <remarks>
    /// Despite its name this method creates an Azure AI Search vector store, not a Qdrant one.
    /// It is kept so that existing callers continue to compile and simply forwards to
    /// <see cref="CreateAzureAISearchLangchainInteropVectorStore(SearchIndexClient)"/>.
    /// </remarks>
    /// <param name="searchIndexClient">Azure AI Search client that can be used to manage the list of indices in an Azure AI Search Service.</param>
    /// <returns>The <see cref="IVectorStore"/>.</returns>
    public static IVectorStore CreateQdrantLangchainInteropVectorStore(SearchIndexClient searchIndexClient)
    {
        return CreateAzureAISearchLangchainInteropVectorStore(searchIndexClient);
    }

    /// <summary>
    /// Factory that is used to inject the appropriate <see cref="VectorStoreRecordDefinition"/> and mapper for Langchain interoperability.
    /// </summary>
    private sealed class AzureAISearchVectorStoreRecordCollectionFactory : IAzureAISearchVectorStoreRecordCollectionFactory
    {
        /// <summary>
        /// Creates an Azure AI Search collection configured with the Langchain compatible record definition and mapper.
        /// </summary>
        /// <typeparam name="TKey">The key type; only <see cref="string"/> is supported.</typeparam>
        /// <typeparam name="TRecord">The record type; only <see cref="LangchainDocument{TKey}"/> with a string key is supported.</typeparam>
        /// <param name="searchIndexClient">The client to pass to the created collection.</param>
        /// <param name="name">The name of the collection.</param>
        /// <param name="vectorStoreRecordDefinition">Ignored; the Langchain compatible definition is always used instead.</param>
        /// <returns>The created collection.</returns>
        /// <exception cref="NotSupportedException">Thrown when an unsupported key or record type is requested.</exception>
        public IVectorStoreRecordCollection<TKey, TRecord> CreateVectorStoreRecordCollection<TKey, TRecord>(SearchIndexClient searchIndexClient, string name, VectorStoreRecordDefinition? vectorStoreRecordDefinition) where TKey : notnull
        {
            if (typeof(TKey) != typeof(string) || typeof(TRecord) != typeof(LangchainDocument<string>))
            {
                throw new NotSupportedException("This VectorStore is only usable with string keys and LangchainDocument<string> record types");
            }

            // Create an Azure AI Search collection. To be compatible with Langchain
            // we need to use a custom record definition that matches the
            // schema used by Langchain. We also need to use a custom mapper
            // since the Langchain schema includes a metadata field that is
            // a JSON string containing the source property. Parsing this
            // string and extracting the source is not supported by the default mapper.
            // The 'as' casts below cannot yield null because of the type check above.
            return (new AzureAISearchVectorStoreRecordCollection<TRecord>(
                searchIndexClient,
                name,
                new()
                {
                    VectorStoreRecordDefinition = s_recordDefinition,
                    JsonObjectCustomMapper = new LangchainInteropMapper() as IVectorStoreRecordMapper<TRecord, JsonObject>
                }) as IVectorStoreRecordCollection<TKey, TRecord>)!;
        }
    }

    /// <summary>
    /// Custom mapper to map the metadata string field, since it contains JSON as a string and this is not supported
    /// automatically by the built in mapper.
    /// </summary>
    private sealed class LangchainInteropMapper : IVectorStoreRecordMapper<LangchainDocument<string>, JsonObject>
    {
        /// <summary>
        /// Converts a <see cref="LangchainDocument{TKey}"/> into the JSON document layout used by Langchain.
        /// </summary>
        /// <param name="dataModel">The record to convert.</param>
        /// <returns>The storage document with the source wrapped in the metadata JSON string.</returns>
        public JsonObject MapFromDataToStorageModel(LangchainDocument<string> dataModel)
        {
            // Run the source through the JSON serializer so that quotes, backslashes and control
            // characters are escaped. Plain string interpolation would produce invalid JSON for
            // such values, which could then no longer be read back by MapFromStorageToDataModel.
            // A null source is written as an empty string, which is what interpolation produced.
            var sourceJson = JsonSerializer.Serialize(dataModel.Source ?? string.Empty);

            var storageDocument = new AzureAISearchLangchainDocument()
            {
                Key = dataModel.Key,
                Content = dataModel.Content,
                Metadata = $"{{\"source\": {sourceJson}}}",
                Embedding = dataModel.Embedding
            };

            return JsonSerializer.SerializeToNode(storageDocument)!.AsObject();
        }

        /// <summary>
        /// Converts a stored JSON document back into a <see cref="LangchainDocument{TKey}"/>.
        /// </summary>
        /// <param name="storageModel">The document as retrieved from storage.</param>
        /// <param name="options">Mapping options; not used by this mapper.</param>
        /// <returns>The record, with the source extracted from the metadata JSON string when present.</returns>
        public LangchainDocument<string> MapFromStorageToDataModel(JsonObject storageModel, StorageToDataModelMapperOptions options)
        {
            var storageDocument = JsonSerializer.Deserialize<AzureAISearchLangchainDocument>(storageModel)!;

            // A document without metadata has no JSON to parse, so treat it as having no source
            // instead of failing inside the deserializer.
            var metadataDocument = string.IsNullOrWhiteSpace(storageDocument.Metadata)
                ? null
                : JsonSerializer.Deserialize<JsonObject>(storageDocument.Metadata);
            var source = metadataDocument?["source"]?.AsValue().ToString();

            return new LangchainDocument<string>()
            {
                Key = storageDocument.Key,
                Content = storageDocument.Content,
                Source = source!,
                Embedding = storageDocument.Embedding
            };
        }
    }

    /// <summary>
    /// Model class that matches the storage format used by Langchain for Azure AI Search.
    /// </summary>
    private sealed class AzureAISearchLangchainDocument
    {
        /// <summary>
        /// The record key, stored in the "id" field.
        /// </summary>
        [JsonPropertyName("id")]
        public string Key { get; set; }

        /// <summary>
        /// The text content, stored in the "content" field.
        /// </summary>
        [JsonPropertyName("content")]
        public string Content { get; set; }

        /// <summary>
        /// The storage format used by Langchain stores the source information
        /// in the metadata field as a JSON string.
        /// E.g. {"source": "my-doc"}
        /// </summary>
        [JsonPropertyName("metadata")]
        public string Metadata { get; set; }

        /// <summary>
        /// The embedding, stored in the "content_vector" field.
        /// </summary>
        [JsonPropertyName("content_vector")]
        public ReadOnlyMemory<float> Embedding { get; set; }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) Microsoft. All rights reserved.

namespace Memory.VectorStoreLangchainInterop;

/// <summary>
/// Data model class that matches the data model used by Langchain.
/// This data model is not decorated with vector store attributes; instead,
/// a different record definition is used with each vector store implementation.
/// </summary>
/// <remarks>
/// This class is used with the <see cref="VectorStore_Langchain_Interop"/> sample.
/// Record definitions can refer to these properties by name (the Pinecone record definition in this
/// sample uses "Key", "Content", "Source" and "Embedding"), so keep them in sync when renaming a property.
/// </remarks>
/// <typeparam name="TKey">The type of the record key.</typeparam>
public class LangchainDocument<TKey>
{
/// <summary>
/// The unique identifier of the record.
/// </summary>
public TKey Key { get; set; }

/// <summary>
/// The text content for which embeddings have been generated.
/// </summary>
public string Content { get; set; }

/// <summary>
/// The source of the content, e.g. where to find the original content.
/// </summary>
public string Source { get; set; }

/// <summary>
/// The embedding for the <see cref="Content"/>.
/// </summary>
public ReadOnlyMemory<float> Embedding { get; set; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Runtime.CompilerServices;
using Microsoft.Extensions.VectorData;

namespace Memory.VectorStoreLangchainInterop;

/// <summary>
/// Decorator that wraps a record collection and translates keys and records between the
/// representation exposed to the caller and the one used by the wrapped collection.
/// </summary>
/// <remarks>
/// Use this when the wrapped vector store exposes keys or records in a shape that is inconvenient
/// for the consumer. For example, if the store works with Guid keys but the consumer wants to use
/// strings containing Guids, the key mappers can convert between the two on every call.
/// </remarks>
/// <typeparam name="TPublicKey">The type of the key that the user of this class will use.</typeparam>
/// <typeparam name="TInternalKey">The type of the key that the internal collection exposes.</typeparam>
/// <typeparam name="TPublicRecord">The type of the record that the user of this class will use.</typeparam>
/// <typeparam name="TInternalRecord">The type of the record that the internal collection exposes.</typeparam>
internal sealed class MappingVectorStoreRecordCollection<TPublicKey, TInternalKey, TPublicRecord, TInternalRecord> : IVectorStoreRecordCollection<TPublicKey, TPublicRecord>
    where TPublicKey : notnull
    where TInternalKey : notnull
{
    private readonly IVectorStoreRecordCollection<TInternalKey, TInternalRecord> _collection;
    private readonly Func<TPublicKey, TInternalKey> _publicToInternalKeyMapper;
    private readonly Func<TInternalKey, TPublicKey> _internalToPublicKeyMapper;
    private readonly Func<TPublicRecord, TInternalRecord> _publicToInternalRecordMapper;
    private readonly Func<TInternalRecord, TPublicRecord> _internalToPublicRecordMapper;

    /// <summary>
    /// Initializes a new instance of the decorator.
    /// </summary>
    /// <param name="collection">The wrapped collection that performs the actual storage operations.</param>
    /// <param name="publicToInternalKeyMapper">Converts a caller supplied key into a key for the wrapped collection.</param>
    /// <param name="internalToPublicKeyMapper">Converts a key returned by the wrapped collection into a caller facing key.</param>
    /// <param name="publicToInternalRecordMapper">Converts a caller supplied record into a record for the wrapped collection.</param>
    /// <param name="internalToPublicRecordMapper">Converts a record returned by the wrapped collection into a caller facing record.</param>
    public MappingVectorStoreRecordCollection(
        IVectorStoreRecordCollection<TInternalKey, TInternalRecord> collection,
        Func<TPublicKey, TInternalKey> publicToInternalKeyMapper,
        Func<TInternalKey, TPublicKey> internalToPublicKeyMapper,
        Func<TPublicRecord, TInternalRecord> publicToInternalRecordMapper,
        Func<TInternalRecord, TPublicRecord> internalToPublicRecordMapper)
    {
        this._collection = collection;
        this._publicToInternalKeyMapper = publicToInternalKeyMapper;
        this._internalToPublicKeyMapper = internalToPublicKeyMapper;
        this._publicToInternalRecordMapper = publicToInternalRecordMapper;
        this._internalToPublicRecordMapper = internalToPublicRecordMapper;
    }

    /// <inheritdoc />
    public string CollectionName => this._collection.CollectionName;

    /// <inheritdoc />
    public Task<bool> CollectionExistsAsync(CancellationToken cancellationToken = default)
        => this._collection.CollectionExistsAsync(cancellationToken);

    /// <inheritdoc />
    public Task CreateCollectionAsync(CancellationToken cancellationToken = default)
        => this._collection.CreateCollectionAsync(cancellationToken);

    /// <inheritdoc />
    public Task CreateCollectionIfNotExistsAsync(CancellationToken cancellationToken = default)
        => this._collection.CreateCollectionIfNotExistsAsync(cancellationToken);

    /// <inheritdoc />
    public Task DeleteAsync(TPublicKey key, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
        => this._collection.DeleteAsync(this._publicToInternalKeyMapper(key), options, cancellationToken);

    /// <inheritdoc />
    public Task DeleteBatchAsync(IEnumerable<TPublicKey> keys, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
    {
        // The projection is lazy; keys are converted as the wrapped collection enumerates them.
        var mappedKeys = keys.Select(this._publicToInternalKeyMapper);
        return this._collection.DeleteBatchAsync(mappedKeys, options, cancellationToken);
    }

    /// <inheritdoc />
    public Task DeleteCollectionAsync(CancellationToken cancellationToken = default)
        => this._collection.DeleteCollectionAsync(cancellationToken);

    /// <inheritdoc />
    public async Task<TPublicRecord?> GetAsync(TPublicKey key, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
    {
        var mappedKey = this._publicToInternalKeyMapper(key);
        var stored = await this._collection.GetAsync(mappedKey, options, cancellationToken).ConfigureAwait(false);

        // Only invoke the record mapper when the wrapped collection actually returned something.
        if (stored != null)
        {
            return this._internalToPublicRecordMapper(stored);
        }

        return default;
    }

    /// <inheritdoc />
    public IAsyncEnumerable<TPublicRecord> GetBatchAsync(IEnumerable<TPublicKey> keys, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
    {
        var mappedKeys = keys.Select(this._publicToInternalKeyMapper);
        return this._collection
            .GetBatchAsync(mappedKeys, options, cancellationToken)
            .Select(this._internalToPublicRecordMapper);
    }

    /// <inheritdoc />
    public async Task<TPublicKey> UpsertAsync(TPublicRecord record, UpsertRecordOptions? options = null, CancellationToken cancellationToken = default)
    {
        var storedKey = await this._collection
            .UpsertAsync(this._publicToInternalRecordMapper(record), options, cancellationToken)
            .ConfigureAwait(false);

        return this._internalToPublicKeyMapper(storedKey);
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<TPublicKey> UpsertBatchAsync(IEnumerable<TPublicRecord> records, UpsertRecordOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        var mappedRecords = records.Select(this._publicToInternalRecordMapper);

        // Stream the keys back one at a time, converting each as it arrives.
        await foreach (var storedKey in this._collection.UpsertBatchAsync(mappedRecords, options, cancellationToken).ConfigureAwait(false))
        {
            yield return this._internalToPublicKeyMapper(storedKey);
        }
    }

    /// <inheritdoc />
    public async Task<VectorSearchResults<TPublicRecord>> VectorizedSearchAsync<TVector>(TVector vector, VectorSearchOptions? options = null, CancellationToken cancellationToken = default)
    {
        var innerResults = await this._collection.VectorizedSearchAsync(vector, options, cancellationToken).ConfigureAwait(false);

        // Re-wrap every hit with the converted record while keeping its score.
        var mappedHits = innerResults.Results.Select(
            hit => new VectorSearchResult<TPublicRecord>(this._internalToPublicRecordMapper(hit.Record), hit.Score));

        return new VectorSearchResults<TPublicRecord>(mappedHits)
        {
            TotalCount = innerResults.TotalCount,
            Metadata = innerResults.Metadata,
        };
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.VectorData;
using Microsoft.SemanticKernel.Connectors.Pinecone;
using Sdk = Pinecone;

namespace Memory.VectorStoreLangchainInterop;

/// <summary>
/// Provides a factory method for building a Pinecone vector store that can work with datasets ingested using Langchain.
/// </summary>
/// <remarks>
/// This class is used with the <see cref="VectorStore_Langchain_Interop"/> sample.
/// </remarks>
public static class PineconeFactory
{
    /// <summary>
    /// Record definition that mirrors the storage format Langchain uses for Pinecone.
    /// Each data model property is mapped to its Langchain storage name.
    /// </summary>
    private static readonly VectorStoreRecordDefinition s_recordDefinition = new()
    {
        Properties = new List<VectorStoreRecordProperty>
        {
            new VectorStoreRecordKeyProperty("Key", typeof(string)),
            new VectorStoreRecordDataProperty("Content", typeof(string))
            {
                StoragePropertyName = "text"
            },
            new VectorStoreRecordDataProperty("Source", typeof(string))
            {
                StoragePropertyName = "source"
            },
            new VectorStoreRecordVectorProperty("Embedding", typeof(ReadOnlyMemory<float>))
            {
                StoragePropertyName = "embedding",
                Dimensions = 1536
            }
        }
    };

    /// <summary>
    /// Create a new Pinecone-backed <see cref="IVectorStore"/> that can be used to read data that was ingested using Langchain.
    /// </summary>
    /// <param name="pineconeClient">Pinecone client that can be used to manage the collections and points in a Pinecone store.</param>
    /// <returns>The <see cref="IVectorStore"/>.</returns>
    public static IVectorStore CreatePineconeLangchainInteropVectorStore(Sdk.PineconeClient pineconeClient)
    {
        // Plug in our own collection factory so that every collection the store
        // hands out is configured with the Langchain compatible record definition.
        var collectionFactory = new PineconeVectorStoreRecordCollectionFactory();

        return new PineconeVectorStore(
            pineconeClient,
            new()
            {
                VectorStoreCollectionFactory = collectionFactory
            });
    }

    /// <summary>
    /// Collection factory that supplies the Langchain compatible <see cref="VectorStoreRecordDefinition"/>.
    /// </summary>
    private sealed class PineconeVectorStoreRecordCollectionFactory : IPineconeVectorStoreRecordCollectionFactory
    {
        /// <summary>
        /// Creates a Pinecone collection configured with the Langchain compatible record definition.
        /// </summary>
        /// <typeparam name="TKey">The key type; only <see cref="string"/> is supported.</typeparam>
        /// <typeparam name="TRecord">The record type; only <see cref="LangchainDocument{TKey}"/> with a string key is supported.</typeparam>
        /// <param name="pineconeClient">The client to pass to the created collection.</param>
        /// <param name="name">The name of the collection.</param>
        /// <param name="vectorStoreRecordDefinition">Ignored; the Langchain compatible definition is always used instead.</param>
        /// <returns>The created collection.</returns>
        /// <exception cref="NotSupportedException">Thrown when an unsupported key or record type is requested.</exception>
        public IVectorStoreRecordCollection<TKey, TRecord> CreateVectorStoreRecordCollection<TKey, TRecord>(Sdk.PineconeClient pineconeClient, string name, VectorStoreRecordDefinition? vectorStoreRecordDefinition) where TKey : notnull
        {
            var isSupported = typeof(TKey) == typeof(string) && typeof(TRecord) == typeof(LangchainDocument<string>);
            if (!isSupported)
            {
                throw new NotSupportedException("This VectorStore is only usable with string keys and LangchainDocument<string> record types");
            }

            // Hand our Langchain shaped record definition to the collection so that the
            // default mapper picks up the storage names from it when mapping records.
            var collection = new PineconeVectorStoreRecordCollection<TRecord>(
                pineconeClient,
                name,
                new()
                {
                    VectorStoreRecordDefinition = s_recordDefinition
                });

            return (collection as IVectorStoreRecordCollection<TKey, TRecord>)!;
        }
    }
}
Loading
Loading