From f7c2bd7f268ec7994ca21be0236bee20abb1204a Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Mon, 12 Feb 2024 19:35:56 -0800 Subject: [PATCH] Upgrade to KM Abstractions 0.28 --- Directory.Packages.props | 2 +- .../Program.cs | 106 +++++++----------- extensions/Redis/Redis/RedisConfig.cs | 2 - service/Core/DataFormats/FileSection.cs | 54 --------- .../Handlers/GenerateEmbeddingsHandler.cs | 2 - service/Core/Handlers/SaveRecordsHandler.cs | 12 -- .../Core/Handlers/TextExtractionHandler.cs | 4 - .../Core/Handlers/TextPartitioningHandler.cs | 4 - .../MemoryStorage/MemoryRecordExtensions.cs | 8 -- service/Core/Search/SearchClient.cs | 2 - .../InteractiveSetup/InteractiveSetup.csproj | 2 +- 11 files changed, 41 insertions(+), 157 deletions(-) delete mode 100644 service/Core/DataFormats/FileSection.cs diff --git a/Directory.Packages.props b/Directory.Packages.props index c07a747b3..7c560380f 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -37,7 +37,7 @@ - + diff --git a/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs index ddee300e9..fa34454f3 100644 --- a/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs +++ b/examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs @@ -81,108 +81,80 @@ public static async Task Main() SearchResult relevant = await memory.SearchAsync(query: Query, minRelevance: MinRelevance, limit: Limit); Console.WriteLine($"Relevant documents: {relevant.Results.Count}"); -#if KernelMemoryDev - var relevantDocuments = new Dictionary>(); foreach (Citation result in relevant.Results) { // Store the document IDs so we can load all their records later - relevantDocuments.Add(result.DocumentId, new List()); Console.WriteLine($"Document ID: {result.DocumentId}"); Console.WriteLine($"Relevant partitions: {result.Partitions.Count}"); foreach (Citation.Partition partition in result.Partitions) { - Console.WriteLine("--------------------------"); - Console.WriteLine($"Partition number: {partition.PartitionNumber}"); - Console.WriteLine($"Relevance: {partition.Relevance}\n"); - Console.WriteLine(partition.Text); - - relevantDocuments[result.DocumentId].Add(partition.PartitionNumber); + Console.WriteLine($" * Partition {partition.PartitionNumber}, relevance: {partition.Relevance}"); } - Console.WriteLine(); - } - - // For each relevant document - // Note: loops can be optimized for better perf, this code is only a demo - const int HowManyToAdd = 1; - Console.WriteLine("Fetching all document partitions..."); - foreach (KeyValuePair> relevantPartitionNumbers in relevantDocuments) - { - var docId = relevantPartitionNumbers.Key; - Console.WriteLine($"\nDocument ID: {docId}"); - - // Load all partitions. Note: the list might be out of order. - SearchResult all = await memory.SearchAsync("", filters: new[] { MemoryFilters.ByDocument(docId) }, limit: int.MaxValue); - List allPartitionsContent = all.Results.FirstOrDefault()?.Partitions ?? new(); + Console.WriteLine("--------------------------"); - // Loop through the relevant partitions - foreach (int relevantPartitionNumber in relevantPartitionNumbers.Value) + // For each relevant partition fetch the partition before and one after + foreach (Citation.Partition partition in result.Partitions) { - Console.WriteLine("--------------------------"); + // Collect partitions in a sorted collection + var partitions = new SortedDictionary { [partition.PartitionNumber] = partition }; - // Use a data structure to order partitions by number - var result = new SortedDictionary(); + // Filters to fetch adjacent partitions + var filters = new List + { + MemoryFilters.ByDocument(result.DocumentId).ByTag(Constants.ReservedFilePartitionNumberTag, $"{partition.PartitionNumber - 1}"), + MemoryFilters.ByDocument(result.DocumentId).ByTag(Constants.ReservedFilePartitionNumberTag, $"{partition.PartitionNumber + 1}") + }; - // Loop all partitions, include before and after the relevant ones - foreach (Citation.Partition p in allPartitionsContent) + // Fetch adjacent partitions and add them to the sorted collection + SearchResult adjacentList = await memory.SearchAsync("", filters: filters, limit: 2); + foreach (Citation.Partition adjacent in adjacentList.Results.First().Partitions) { - if (Math.Abs(p.PartitionNumber - relevantPartitionNumber) <= HowManyToAdd) - { - result.Add(p.PartitionNumber, p.Text); - } + partitions[adjacent.PartitionNumber] = adjacent; } - // Show partition and adjacent ones in order - foreach (var p in result) + // Print partitions in order + foreach (var p in partitions) { - Console.WriteLine($"Partition: {p.Key}"); - Console.WriteLine(p.Value); + Console.WriteLine($"# Partition {p.Value.PartitionNumber}"); + Console.WriteLine(p.Value.Text); + Console.WriteLine(); } - Console.WriteLine(); + Console.WriteLine("--------------------------"); } + + Console.WriteLine(); } -#endif } } /* Result: -Token count: 2510 Importing memories... Searching memories... Relevant documents: 1 Document ID: example207 Relevant partitions: 2 +* Partition 27, relevance: 0.8557962 +* Partition 13, relevance: 0.85513425 -------------------------- -Partition number: 27 -Relevance: 0.8557962 - -As scientific interest in [...] or ancient microbial life. --------------------------- -Partition number: 13 -Relevance: 0.85513425 +# Partition 26 +Dr. Mei Lin, a renowned ... -Gerald Marshall, the Chief [...] in astrobiological research." +# Partition 27 +As scientific interest in ... -Fetching all document partitions... - -Document ID: example207 +# Partition 28 +Meanwhile, back on Earth, the ... -------------------------- -Partition: 26 -Dr. Mei Lin, a renowned [...] of life in the universe." -Partition: 27 -As scientific interest [...] ancient microbial life. -Partition: 28 -Meanwhile, back on Earth, [...] meaning in the universe. +# Partition 12 +Appearing as a glowing, translucent ... --------------------------- -Partition: 12 -Appearing as a glowing, [...] including its high CO2 levels. -Partition: 13 -Gerald Marshall, the [...] in astrobiological research." -Partition: 14 -While further studies [...] alien at the same time. +# Partition 13 +Gerald Marshall, the Chief ... +# Partition 14 +While further studies are ... +-------------------------- */ - diff --git a/extensions/Redis/Redis/RedisConfig.cs b/extensions/Redis/Redis/RedisConfig.cs index 4719a8588..52e142d22 100644 --- a/extensions/Redis/Redis/RedisConfig.cs +++ b/extensions/Redis/Redis/RedisConfig.cs @@ -43,9 +43,7 @@ public class RedisConfig { Constants.ReservedDocumentIdTag, '|' }, { Constants.ReservedFileIdTag, '|' }, { Constants.ReservedFilePartitionTag, '|' }, -#if KernelMemoryDev { Constants.ReservedFileSectionNumberTag, '|' }, -#endif { Constants.ReservedFileTypeTag, '|' }, }; diff --git a/service/Core/DataFormats/FileSection.cs b/service/Core/DataFormats/FileSection.cs deleted file mode 100644 index 0e83b922e..000000000 --- a/service/Core/DataFormats/FileSection.cs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -#if KernelMemoryDev -// See Abstractions -#else -using System.Collections.Generic; -using System.Text.Json.Serialization; - -namespace Microsoft.KernelMemory.DataFormats; - -public class FileContent -{ - [JsonPropertyOrder(0)] - [JsonPropertyName("sections")] - public List Sections { get; set; } = new(); -} - -public class FileSection -{ - /// - /// Text page number/Audio segment number/Video scene number - /// - [JsonPropertyOrder(0)] - [JsonPropertyName("number")] - public int Number { get; } - - /// - /// Whether the first/last sentence may continue from the previous/into - /// the next section (e.g. like PDF docs). - /// true: the first/last sentence do not cross over, the first doesn't - /// continue from the previous section, and the last sentence ends - /// where the section ends (e.g. Powerpoint, Excel). - /// false: the first sentence may be a continuation from the previous section, - /// and the last sentence may continue into the next section. - /// - [JsonPropertyOrder(1)] - [JsonPropertyName("complete")] - public bool SentencesAreComplete { get; } - - /// - /// Page text content - /// - [JsonPropertyOrder(2)] - [JsonPropertyName("content")] - public string Content { get; } - - public FileSection(int number, string? content, bool sentencesAreComplete) - { - this.Number = number; - this.SentencesAreComplete = sentencesAreComplete; - this.Content = content ?? string.Empty; - } -} -#endif diff --git a/service/Core/Handlers/GenerateEmbeddingsHandler.cs b/service/Core/Handlers/GenerateEmbeddingsHandler.cs index 0351c8b08..0f9dddf76 100644 --- a/service/Core/Handlers/GenerateEmbeddingsHandler.cs +++ b/service/Core/Handlers/GenerateEmbeddingsHandler.cs @@ -157,10 +157,8 @@ public GenerateEmbeddingsHandler( Size = text.Length, MimeType = MimeTypes.TextEmbeddingVector, ArtifactType = DataPipeline.ArtifactTypes.TextEmbeddingVector, -#if KernelMemoryDev PartitionNumber = partitionFile.PartitionNumber, SectionNumber = partitionFile.SectionNumber, -#endif Tags = partitionFile.Tags, }; embeddingFileNameDetails.MarkProcessedBy(this); diff --git a/service/Core/Handlers/SaveRecordsHandler.cs b/service/Core/Handlers/SaveRecordsHandler.cs index 6c3874636..77c934f4a 100644 --- a/service/Core/Handlers/SaveRecordsHandler.cs +++ b/service/Core/Handlers/SaveRecordsHandler.cs @@ -133,13 +133,8 @@ public SaveRecordsHandler( fileId: embeddingFile.File.ParentId, partitionFileId: embeddingFile.File.SourcePartitionId, partitionContent: partitionContent, -#if KernelMemoryDev partitionNumber: embeddingFile.File.PartitionNumber, sectionNumber: embeddingFile.File.SectionNumber, -#else - partitionNumber: 0, - sectionNumber: 0, -#endif partitionEmbedding: embeddingData.Vector, embeddingGeneratorProvider: embeddingData.GeneratorProvider, embeddingGeneratorName: embeddingData.GeneratorName, @@ -202,13 +197,8 @@ public SaveRecordsHandler( fileId: file.File.ParentId, partitionFileId: file.File.Id, partitionContent: partitionContent, -#if KernelMemoryDev partitionNumber: partitionFileDetails.PartitionNumber, sectionNumber: partitionFileDetails.SectionNumber, -#else - partitionNumber: 0, - sectionNumber: 0, -#endif partitionEmbedding: new Embedding(), embeddingGeneratorProvider: "", embeddingGeneratorName: "", @@ -368,11 +358,9 @@ private static MemoryRecord PrepareRecord( // Partition ID. Filtering used for purge. record.Tags.Add(Constants.ReservedFilePartitionTag, partitionFileId); -#if KernelMemoryDev // Partition number (starting from 0) and Page number (provided by text extractor) record.Tags.Add(Constants.ReservedFilePartitionNumberTag, $"{partitionNumber}"); record.Tags.Add(Constants.ReservedFileSectionNumberTag, $"{sectionNumber}"); -#endif /* * TIMESTAMP and USER TAGS diff --git a/service/Core/Handlers/TextExtractionHandler.cs b/service/Core/Handlers/TextExtractionHandler.cs index c0cf6aecb..13ecbc1a4 100644 --- a/service/Core/Handlers/TextExtractionHandler.cs +++ b/service/Core/Handlers/TextExtractionHandler.cs @@ -67,9 +67,7 @@ public TextExtractionHandler( var sourceFile = uploadedFile.Name; var destFile = $"{uploadedFile.Name}.extract.txt"; -#if KernelMemoryDev var destFile2 = $"{uploadedFile.Name}.extract.json"; -#endif BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, sourceFile, cancellationToken).ConfigureAwait(false); string text = string.Empty; @@ -104,7 +102,6 @@ public TextExtractionHandler( destFileDetails.MarkProcessedBy(this); uploadedFile.GeneratedFiles.Add(destFile, destFileDetails); -#if KernelMemoryDev // Structured content (pages) this._log.LogDebug("Saving extracted content {0}", destFile2); await this._orchestrator.WriteFileAsync(pipeline, destFile2, new BinaryData(content), cancellationToken).ConfigureAwait(false); @@ -120,7 +117,6 @@ public TextExtractionHandler( }; destFile2Details.MarkProcessedBy(this); uploadedFile.GeneratedFiles.Add(destFile2, destFile2Details); -#endif } uploadedFile.MarkProcessedBy(this); diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs index 77133bfb9..ad67726a1 100644 --- a/service/Core/Handlers/TextPartitioningHandler.cs +++ b/service/Core/Handlers/TextPartitioningHandler.cs @@ -150,9 +150,7 @@ public TextPartitioningHandler( { // TODO: turn partitions in objects with more details, e.g. page number string text = partitions[partitionNumber]; -#if KernelMemoryDev int sectionNumber = 0; // TODO: use this to store the page number (if any) -#endif BinaryData textData = new(text); int tokenCount = this._tokenCounter(text); @@ -169,10 +167,8 @@ public TextPartitioningHandler( Size = text.Length, MimeType = MimeTypes.PlainText, ArtifactType = DataPipeline.ArtifactTypes.TextPartition, -#if KernelMemoryDev PartitionNumber = partitionNumber, SectionNumber = sectionNumber, -#endif Tags = pipeline.Tags, ContentSHA256 = textData.CalculateSHA256(), }; diff --git a/service/Core/MemoryStorage/MemoryRecordExtensions.cs b/service/Core/MemoryStorage/MemoryRecordExtensions.cs index d229800b4..b561837ab 100644 --- a/service/Core/MemoryStorage/MemoryRecordExtensions.cs +++ b/service/Core/MemoryStorage/MemoryRecordExtensions.cs @@ -34,15 +34,11 @@ public static string GetFileId(this MemoryRecord record, ILogger? log = null) /// public static int GetPartitionNumber(this MemoryRecord record, ILogger? log = null) { -#if KernelMemoryDev var value = record.GetTagValue(Constants.ReservedFilePartitionNumberTag, log); if (string.IsNullOrEmpty(value)) { return 0; } -#else - var value = "0"; -#endif return int.TryParse(value, out int number) ? number : 0; } @@ -52,15 +48,11 @@ public static int GetPartitionNumber(this MemoryRecord record, ILogger? log = nu /// public static int GetSectionNumber(this MemoryRecord record, ILogger? log = null) { -#if KernelMemoryDev var value = record.GetTagValue(Constants.ReservedFileSectionNumberTag, log); if (string.IsNullOrEmpty(value)) { return 0; } -#else - var value = "0"; -#endif return int.TryParse(value, out int number) ? number : 0; } diff --git a/service/Core/Search/SearchClient.cs b/service/Core/Search/SearchClient.cs index 56b53e0b4..d22c25c93 100644 --- a/service/Core/Search/SearchClient.cs +++ b/service/Core/Search/SearchClient.cs @@ -157,10 +157,8 @@ public async Task SearchAsync( { Text = partitionText, Relevance = (float)relevance, -#if KernelMemoryDev PartitionNumber = memory.GetPartitionNumber(this._log), SectionNumber = memory.GetSectionNumber(), -#endif LastUpdate = memory.GetLastUpdate(), Tags = memory.Tags, }); diff --git a/tools/InteractiveSetup/InteractiveSetup.csproj b/tools/InteractiveSetup/InteractiveSetup.csproj index ed8eaafde..bd9e04fb8 100644 --- a/tools/InteractiveSetup/InteractiveSetup.csproj +++ b/tools/InteractiveSetup/InteractiveSetup.csproj @@ -9,7 +9,7 @@ - +