Skip to content

Commit

Permalink
Add Image OCR Support (#29)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: Devis Lucato <[email protected]>
  • Loading branch information
crickman and dluc authored Aug 28, 2023
1 parent 0c9d0be commit 50c369e
Show file tree
Hide file tree
Showing 17 changed files with 310 additions and 49 deletions.
2 changes: 1 addition & 1 deletion dotnet/ClientLib/ClientLib.csproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
Expand Down
50 changes: 46 additions & 4 deletions dotnet/CoreLib/AppBuilders/MemoryClientBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
using Microsoft.SemanticMemory.ContentStorage;
using Microsoft.SemanticMemory.ContentStorage.AzureBlobs;
using Microsoft.SemanticMemory.ContentStorage.DevTools;
using Microsoft.SemanticMemory.DataFormats.Image;
using Microsoft.SemanticMemory.DataFormats.Image.AzureFormRecognizer;
using Microsoft.SemanticMemory.MemoryStorage;
using Microsoft.SemanticMemory.MemoryStorage.DevTools;
using Microsoft.SemanticMemory.MemoryStorage.Qdrant;
Expand Down Expand Up @@ -61,7 +63,7 @@ public MemoryClientBuilder(IServiceCollection? sharedServiceCollection = null)
this.AddSingleton<List<ISemanticMemoryVectorDb>>(this._vectorDbs);

// Default configuration for tests and demos
this.WithCustomMimeTypeDetection(new MimeTypesDetection());
this.WithDefaultMimeTypeDetection();
this.WithSimpleFileStorage(new SimpleFileStorageConfig { Directory = "tmp-memory-files" });
this.WithSimpleVectorDb(new SimpleVectorDbConfig { Directory = "tmp-memory-vectors" });
}
Expand All @@ -77,7 +79,7 @@ public MemoryClientBuilder(WebApplicationBuilder appBuilder)
this.AddSingleton<List<ISemanticMemoryVectorDb>>(this._vectorDbs);

// Default configuration for tests and demos
this.WithCustomMimeTypeDetection(new MimeTypesDetection());
this.WithDefaultMimeTypeDetection();
this.WithSimpleFileStorage(new SimpleFileStorageConfig { Directory = Path.Join(Path.GetTempPath(), "content") });
}

Expand All @@ -95,6 +97,13 @@ public MemoryClientBuilder WithCustomStorage(IContentStorage service)
return this;
}

/// <summary>
/// Configures the builder to use the built-in <see cref="MimeTypesDetection"/> class
/// as the <see cref="IMimeTypeDetection"/> service.
/// </summary>
/// <returns>This builder instance, so that calls can be chained.</returns>
public MemoryClientBuilder WithDefaultMimeTypeDetection()
{
// Register by type (no instance is created here), through the builder's AddSingleton helper
this.AddSingleton<IMimeTypeDetection, MimeTypesDetection>();

return this;
}

public MemoryClientBuilder WithCustomMimeTypeDetection(IMimeTypeDetection service)
{
service = service ?? throw new ConfigurationException("The MIME type detection instance is NULL");
Expand Down Expand Up @@ -143,12 +152,19 @@ public MemoryClientBuilder WithCustomTextGeneration(ITextGeneration service)
return this;
}

/// <summary>
/// Registers a caller-provided OCR engine instance as the <see cref="IOcrEngine"/> service.
/// </summary>
/// <param name="service">The OCR engine instance to register. Must not be null.</param>
/// <returns>This builder instance, so that calls can be chained.</returns>
/// <exception cref="ConfigurationException">Thrown when <paramref name="service"/> is null.</exception>
public MemoryClientBuilder WithCustomImageOcr(IOcrEngine service)
{
    // Reject a missing engine up front, before touching the service registrations
    if (service == null)
    {
        throw new ConfigurationException("The OCR engine instance is NULL");
    }

    this.AddSingleton<IOcrEngine>(service);

    return this;
}

public MemoryClientBuilder FromAppSettings()
{
var config = this._appBuilder.Configuration.GetSection(ConfigRoot).Get<SemanticMemoryConfig>();
if (config == null) { throw new ConfigurationException("Unable to parse configuration files"); }

this.WithCustomMimeTypeDetection(new MimeTypesDetection());
this.WithDefaultMimeTypeDetection();

// Ingestion queue
if (string.Equals(config.DataIngestion.OrchestrationType, "Distributed", StringComparison.OrdinalIgnoreCase))
Expand Down Expand Up @@ -314,6 +330,23 @@ public MemoryClientBuilder FromAppSettings()
break;
}

// Image OCR
switch (config.ImageOcrType)
{
case string y when string.IsNullOrWhiteSpace(y):
case string x when x.Equals("None", StringComparison.OrdinalIgnoreCase):
break;

case string x when x.Equals("AzureFormRecognizer", StringComparison.OrdinalIgnoreCase):
this._appBuilder.Services.AddAzureFormRecognizer(this.GetServiceConfig<AzureFormRecognizerConfig>(config, "AzureFormRecognizer"));
this._sharedServiceCollection?.AddAzureFormRecognizer(this.GetServiceConfig<AzureFormRecognizerConfig>(config, "AzureFormRecognizer"));
break;

default:
// NOOP - allow custom implementations, via WithCustomImageOcr()
break;
}

return this;
}

Expand Down Expand Up @@ -394,8 +427,9 @@ public Memory BuildServerlessClient()

var orchestrator = this._app.Services.GetService<InProcessPipelineOrchestrator>() ?? throw new ConfigurationException("Unable to build orchestrator");
var searchClient = this._app.Services.GetService<SearchClient>() ?? throw new ConfigurationException("Unable to build search client");
var ocrEngine = this._app.Services.GetService<IOcrEngine>();

return new Memory(orchestrator, searchClient);
return new Memory(orchestrator, searchClient, ocrEngine);
}
catch (Exception e)
{
Expand Down Expand Up @@ -439,6 +473,14 @@ private MemoryClientBuilder CompleteAsyncClient()
return this;
}

/// <summary>
/// Registers a singleton of type <typeparamref name="TService"/>, created by the given factory,
/// in the app builder's service collection and, when one is set, in the shared service collection.
/// </summary>
/// <typeparam name="TService">The service type being registered.</typeparam>
/// <param name="serviceFactory">Factory that creates the service from an <see cref="IServiceProvider"/>.</param>
/// <returns>This builder instance, so that calls can be chained.</returns>
private MemoryClientBuilder AddSingleton<TService>(Func<IServiceProvider, TService> serviceFactory)
    where TService : class
{
    this._appBuilder.Services.AddSingleton<TService>(serviceFactory);

    // The shared collection is optional: mirror the registration only when it is present
    var sharedServices = this._sharedServiceCollection;
    if (sharedServices != null)
    {
        sharedServices.AddSingleton<TService>(serviceFactory);
    }

    return this;
}

private MemoryClientBuilder AddSingleton<TService>(TService implementationInstance)
where TService : class
{
Expand Down
5 changes: 5 additions & 0 deletions dotnet/CoreLib/Configuration/SemanticMemoryConfig.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ public class RetrievalConfig
/// </summary>
public string TextGeneratorType { get; set; } = string.Empty;

/// <summary>
/// The type of OCR engine used to extract text from images, e.g. "AzureFormRecognizer".
/// When empty or "None", no built-in OCR engine is registered.
/// </summary>
public string ImageOcrType { get; set; } = string.Empty;

/// <summary>
/// Settings for the upload of documents and memory creation/update.
/// </summary>
Expand Down
1 change: 1 addition & 0 deletions dotnet/CoreLib/CoreLib.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
</ItemGroup>

<ItemGroup>
<PackageReference Include="Azure.AI.FormRecognizer" Version="4.1.0" />
<PackageReference Include="Azure.Identity" Version="1.10.0" />
<PackageReference Include="Azure.Search.Documents" Version="11.5.0-beta.4" />
<PackageReference Include="Azure.Storage.Blobs" Version="12.17.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Text.Json.Serialization;

namespace Microsoft.SemanticMemory.DataFormats.Image.AzureFormRecognizer;

/// <summary>
/// Settings for the Azure Form Recognizer based OCR engine.
/// </summary>
public class AzureFormRecognizerConfig
{
    /// <summary>
    /// Supported ways of authenticating against Azure Form Recognizer.
    /// The converter attribute makes System.Text.Json read/write the values by name.
    /// </summary>
    [JsonConverter(typeof(JsonStringEnumConverter))]
    public enum AuthTypes
    {
        /// <summary>No authentication type selected (the default).</summary>
        Unknown = -1,

        /// <summary>Authenticate with an Azure identity credential.</summary>
        AzureIdentity = 0,

        /// <summary>Authenticate with the key stored in <see cref="AzureFormRecognizerConfig.APIKey"/>.</summary>
        APIKey = 1,
    }

    /// <summary>
    /// Authentication type to use. Defaults to <see cref="AuthTypes.Unknown"/>.
    /// </summary>
    public AuthTypes Auth { get; set; } = AuthTypes.Unknown;

    /// <summary>
    /// Endpoint of the Azure Form Recognizer instance. Defaults to an empty string.
    /// </summary>
    public string Endpoint { get; set; } = string.Empty;

    /// <summary>
    /// API key, used with <see cref="AuthTypes.APIKey"/> authentication. Defaults to an empty string.
    /// </summary>
    public string APIKey { get; set; } = string.Empty;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Azure;
using Azure.AI.FormRecognizer.DocumentAnalysis;
using Azure.Identity;
using Microsoft.Extensions.Logging;
using Microsoft.SemanticMemory.Configuration;

namespace Microsoft.SemanticMemory.DataFormats.Image.AzureFormRecognizer;

/// <summary>
/// OCR engine based on Azure.AI.FormRecognizer.
/// </summary>
public class AzureFormRecognizerEngine : IOcrEngine
{
    private readonly DocumentAnalysisClient _recognizerClient;
    private readonly ILogger<AzureFormRecognizerEngine> _log;

    /// <summary>
    /// Creates a new instance of the AzureFormRecognizerEngine, connecting to the given
    /// Form Recognizer endpoint with the authentication type selected in the configuration.
    /// </summary>
    /// <param name="endpoint">
    /// The endpoint for accessing a provisioned Azure Form Recognizer instance.
    /// When null or blank, <see cref="AzureFormRecognizerConfig.Endpoint"/> is used instead.
    /// </param>
    /// <param name="config">The AzureFormRecognizerConfig config for this service.</param>
    /// <param name="log">Logger used to report configuration errors.</param>
    /// <exception cref="ConfigurationException">
    /// Thrown when the configuration is null, no endpoint is available, the API key is empty
    /// while API key authentication is selected, or the authentication type is not supported.
    /// </exception>
    public AzureFormRecognizerEngine(string endpoint, AzureFormRecognizerConfig config, ILogger<AzureFormRecognizerEngine> log)
    {
        this._log = log;

        if (config == null)
        {
            this._log.LogCritical("Azure Form Recognizer configuration is NULL");
            throw new ConfigurationException("Azure Form Recognizer configuration is NULL");
        }

        // Prefer the explicit argument; fall back to the configured endpoint so that
        // a fully populated config is enough to create the engine.
        string serviceEndpoint = string.IsNullOrWhiteSpace(endpoint) ? config.Endpoint : endpoint;
        if (string.IsNullOrWhiteSpace(serviceEndpoint))
        {
            this._log.LogCritical("Azure Form Recognizer endpoint is empty");
            throw new ConfigurationException("Azure Form Recognizer endpoint is empty");
        }

        var serviceUri = new Uri(serviceEndpoint);

        switch (config.Auth)
        {
            case AzureFormRecognizerConfig.AuthTypes.AzureIdentity:
                this._recognizerClient = new DocumentAnalysisClient(serviceUri, new DefaultAzureCredential());
                break;

            case AzureFormRecognizerConfig.AuthTypes.APIKey:
                if (string.IsNullOrEmpty(config.APIKey))
                {
                    this._log.LogCritical("Azure Form Recognizer API key is empty");
                    throw new ConfigurationException("Azure Form Recognizer API key is empty");
                }

                this._recognizerClient = new DocumentAnalysisClient(serviceUri, new AzureKeyCredential(config.APIKey));
                break;

            default:
                // Named placeholder (not a numeric one) so structured loggers capture the value under a meaningful key
                this._log.LogCritical("Azure Form Recognizer authentication type '{AuthType}' undefined or not supported", config.Auth);
                throw new ConfigurationException($"Azure Form Recognizer authentication type '{config.Auth}' undefined or not supported");
        }
    }

    /// <summary>
    /// Sends the image to Azure Form Recognizer, using the "prebuilt-read" model,
    /// and returns the text content of the analysis result.
    /// </summary>
    /// <param name="imageContent">The image content stream.</param>
    /// <param name="cancellationToken">Token used to cancel the request.</param>
    /// <returns>The text content reported by the analysis result.</returns>
    public async Task<string> ExtractTextFromImageAsync(Stream imageContent, CancellationToken cancellationToken = default)
    {
        // WaitUntil.Completed makes this call return only once the analysis has finished,
        // so the result is available without waiting on the operation a second time.
        var operation = await this._recognizerClient
            .AnalyzeDocumentAsync(WaitUntil.Completed, "prebuilt-read", imageContent, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        return operation.Value.Content;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.DependencyInjection;
using Microsoft.SemanticMemory.DataFormats.Image;
using Microsoft.SemanticMemory.DataFormats.Image.AzureFormRecognizer;

// ReSharper disable once CheckNamespace
namespace Microsoft.SemanticMemory;

public static partial class MemoryClientBuilderExtensions
{
    /// <summary>
    /// Adds the Azure Form Recognizer OCR services to the builder's service collection.
    /// </summary>
    /// <param name="builder">The memory client builder to configure.</param>
    /// <param name="config">Azure Form Recognizer settings.</param>
    /// <returns>The same builder instance, so that calls can be chained.</returns>
    public static MemoryClientBuilder WithAzureFormRecognizer(this MemoryClientBuilder builder, AzureFormRecognizerConfig config)
    {
        var services = builder.Services;
        services.AddAzureFormRecognizer(config);
        return builder;
    }
}

public static partial class DependencyInjection
{
    /// <summary>
    /// Registers the Azure Form Recognizer configuration (singleton) and the
    /// Azure Form Recognizer engine as the <see cref="IOcrEngine"/> implementation (transient).
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="config">Azure Form Recognizer settings, including the endpoint passed to the engine.</param>
    /// <returns>The same service collection, so that calls can be chained.</returns>
    public static IServiceCollection AddAzureFormRecognizer(this IServiceCollection services, AzureFormRecognizerConfig config)
    {
        // The engine constructor takes a plain string endpoint, which the container cannot
        // resolve by itself. Build the engine through a factory that supplies the endpoint
        // and config explicitly; remaining constructor dependencies come from the container.
        return services
            .AddSingleton<AzureFormRecognizerConfig>(config)
            .AddTransient<IOcrEngine>(serviceProvider =>
                ActivatorUtilities.CreateInstance<AzureFormRecognizerEngine>(serviceProvider, config.Endpoint, config));
    }

    /// <summary>
    /// Registers the Azure Form Recognizer OCR services using API key authentication.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="endpoint">Endpoint of the Azure Form Recognizer instance.</param>
    /// <param name="apiKey">API key used to authenticate.</param>
    /// <returns>The same service collection, so that calls can be chained.</returns>
    public static IServiceCollection AddAzureFormRecognizer(this IServiceCollection services, string endpoint, string apiKey)
    {
        var config = new AzureFormRecognizerConfig { Endpoint = endpoint, APIKey = apiKey, Auth = AzureFormRecognizerConfig.AuthTypes.APIKey };
        return services.AddAzureFormRecognizer(config);
    }
}
19 changes: 19 additions & 0 deletions dotnet/CoreLib/DataFormats/Image/IOcrEngine.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace Microsoft.SemanticMemory.DataFormats.Image;

/// <summary>
/// An OCR engine that can read in text from image files.
/// </summary>
public interface IOcrEngine
{
/// <summary>
/// Reads all text from the image.
/// </summary>
/// <param name="imageContent">The image content stream.</param>
/// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
/// <returns>A task whose result is the text extracted from the image.</returns>
Task<string> ExtractTextFromImageAsync(Stream imageContent, CancellationToken cancellationToken = default);
}
28 changes: 28 additions & 0 deletions dotnet/CoreLib/DataFormats/Image/ImageDecoder.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace Microsoft.SemanticMemory.DataFormats.Image;

/// <summary>
/// Extracts text from images by delegating to an <see cref="IOcrEngine"/>.
/// </summary>
public class ImageDecoder
{
    /// <summary>
    /// Extracts the text from an image file.
    /// </summary>
    /// <param name="engine">The OCR engine used to read the image.</param>
    /// <param name="filename">Path of the image file to read.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The text returned by the OCR engine.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="engine"/> is null.</exception>
    public async Task<string> ImageToTextAsync(IOcrEngine engine, string filename, CancellationToken cancellationToken = default)
    {
        // Fail before opening the file when no OCR engine is available
        if (engine == null) { throw new ArgumentNullException(nameof(engine)); }

        using var stream = File.OpenRead(filename);
        return await this.ImageToTextAsync(engine, stream, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Extracts the text from image data held in memory.
    /// </summary>
    /// <param name="engine">The OCR engine used to read the image.</param>
    /// <param name="data">The image content.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The text returned by the OCR engine.</returns>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="engine"/> or <paramref name="data"/> is null.
    /// </exception>
    public async Task<string> ImageToTextAsync(IOcrEngine engine, BinaryData data, CancellationToken cancellationToken = default)
    {
        if (engine == null) { throw new ArgumentNullException(nameof(engine)); }

        if (data == null) { throw new ArgumentNullException(nameof(data)); }

        using var stream = data.ToStream();
        return await this.ImageToTextAsync(engine, stream, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Extracts the text from an image stream. The stream is passed to the engine as is
    /// and is not disposed by this method.
    /// </summary>
    /// <param name="engine">The OCR engine used to read the image.</param>
    /// <param name="data">The image content stream.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The text returned by the OCR engine.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="engine"/> is null.</exception>
    public Task<string> ImageToTextAsync(IOcrEngine engine, Stream data, CancellationToken cancellationToken = default)
    {
        if (engine == null) { throw new ArgumentNullException(nameof(engine)); }

        return engine.ExtractTextFromImageAsync(data, cancellationToken);
    }
}
6 changes: 4 additions & 2 deletions dotnet/CoreLib/DataFormats/Office/MsWordDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ public class MsWordDecoder
{
public string DocToText(string filename)
{
return this.DocToText(File.OpenRead(filename));
using var stream = File.OpenRead(filename);
return this.DocToText(stream);
}

public string DocToText(BinaryData data)
{
return this.DocToText(data.ToStream());
using var stream = data.ToStream();
return this.DocToText(stream);
}

public string DocToText(Stream data)
Expand Down
6 changes: 4 additions & 2 deletions dotnet/CoreLib/DataFormats/Pdf/PdfDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ public class PdfDecoder
{
public string DocToText(string filename)
{
return this.DocToText(File.OpenRead(filename));
using var stream = File.OpenRead(filename);
return this.DocToText(stream);
}

public string DocToText(BinaryData data)
{
return this.DocToText(data.ToStream());
using var stream = data.ToStream();
return this.DocToText(stream);
}

public string DocToText(Stream data)
Expand Down
Loading

0 comments on commit 50c369e

Please sign in to comment.