Skip to content

Commit

Permalink
🚧 Improving FileSystemIngestor
Browse files Browse the repository at this point in the history
  • Loading branch information
amengus87 committed May 31, 2024
1 parent 228c5be commit cb1ab43
Show file tree
Hide file tree
Showing 14 changed files with 115 additions and 279 deletions.

This file was deleted.

46 changes: 0 additions & 46 deletions backend/src/main/java/ai/dragon/entity/ProviderEntity.java

This file was deleted.

6 changes: 6 additions & 0 deletions backend/src/main/java/ai/dragon/entity/SiloEntity.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ public class SiloEntity implements AbstractEntity {
@Schema(description = "Cron Expression for the Silo's Ingestor Job", example = "Launch the Silo ingestor every 15 minutes : */15 * * * *")
private String ingestorSchedule;

@Schema(description = "Settings to be linked to the Silo's Vector Store in the form of `key = value` pairs.")
private List<String> vectorStoreSettings;

@Schema(description = "Settings to be linked to the Silo's Embedding Model in the form of `key = value` pairs.")
private List<String> embeddingModelSettings;

@Schema(description = "Settings to be linked to the Silo's Ingestor in the form of `key = value` pairs.")
private List<String> ingestorSettings;

Expand Down
34 changes: 21 additions & 13 deletions backend/src/main/java/ai/dragon/enumeration/EmbeddingModelType.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

import java.util.List;

import ai.dragon.job.silo.embedding.EmbeddingModelDefinition;
import dev.langchain4j.model.embedding.bge.small.en.v15.BgeSmallEnV15QuantizedEmbeddingModel;
import dev.langchain4j.model.openai.OpenAiEmbeddingModel;
import dev.langchain4j.model.openai.OpenAiEmbeddingModelName;
import lombok.Builder;
import lombok.Data;

public enum EmbeddingModelType {
BgeSmallEnV15QuantizedEmbeddingModel("BgeSmallEnV15QuantizedEmbeddingModel"),
Expand Down Expand Up @@ -36,6 +37,9 @@ public EmbeddingModelDefinition getModelDefinition() throws ClassNotFoundExcepti
.embeddingModelClassName(
"dev.langchain4j.model.embedding.bge.small.en.v15.BgeSmallEnV15QuantizedEmbeddingModel")
.embeddingModelName("BgeSmallEnV15QuantizedEmbeddingModel")
.embeddingModelWithSettings(parameters -> {
return new BgeSmallEnV15QuantizedEmbeddingModel();
})
.providerType(ProviderType.ONNX)
.dimensions(384)
.maxTokens(512)
Expand All @@ -46,6 +50,11 @@ public EmbeddingModelDefinition getModelDefinition() throws ClassNotFoundExcepti
.languages(List.of("en"))
.embeddingModelClassName("dev.langchain4j.model.openai.OpenAiEmbeddingModel")
.embeddingModelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_ADA_002.toString())
.embeddingModelWithSettings(parameters -> {
return OpenAiEmbeddingModel.builder()
.modelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_ADA_002)
.apiKey(parameters.getApiKey()).build();
})
.providerType(ProviderType.OpenAI)
.dimensions(1536)
.maxTokens(8191)
Expand All @@ -56,6 +65,11 @@ public EmbeddingModelDefinition getModelDefinition() throws ClassNotFoundExcepti
.languages(List.of("en"))
.embeddingModelClassName("dev.langchain4j.model.openai.OpenAiEmbeddingModel")
.embeddingModelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_3_SMALL.toString())
.embeddingModelWithSettings(parameters -> {
return OpenAiEmbeddingModel.builder()
.modelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_3_SMALL)
.apiKey(parameters.getApiKey()).build();
})
.providerType(ProviderType.OpenAI)
.dimensions(1536)
.maxTokens(8191)
Expand All @@ -66,6 +80,11 @@ public EmbeddingModelDefinition getModelDefinition() throws ClassNotFoundExcepti
.languages(List.of("en"))
.embeddingModelClassName("dev.langchain4j.model.openai.OpenAiEmbeddingModel")
.embeddingModelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_3_LARGE.toString())
.embeddingModelWithSettings(parameters -> {
return OpenAiEmbeddingModel.builder()
.modelName(OpenAiEmbeddingModelName.TEXT_EMBEDDING_3_LARGE)
.apiKey(parameters.getApiKey()).build();
})
.providerType(ProviderType.OpenAI)
.dimensions(3072)
.maxTokens(8191)
Expand All @@ -80,14 +99,3 @@ public String toString() {
return value;
}
}

@Data
@Builder
class EmbeddingModelDefinition {
private String embeddingModelClassName;
private String embeddingModelName;
private List<String> languages;
private ProviderType providerType;
private int dimensions;
private int maxTokens;
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package ai.dragon.enumeration;

public enum VectorStoreType {
// TODO : Add more VectorStoreType
// PGVector : PgVectorEmbeddingStore
InMemoryEmbeddingStore("InMemoryEmbeddingStore");

private String value;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package ai.dragon.job.silo.embedding;

import java.util.List;
import java.util.function.Function;

import ai.dragon.enumeration.ProviderType;
import dev.langchain4j.model.embedding.EmbeddingModel;
import lombok.Builder;
import lombok.Data;

@Data
@Builder
public class EmbeddingModelDefinition {
private String embeddingModelClassName;
private String embeddingModelName;
private Function<EmbeddingModelSettings, EmbeddingModel> embeddingModelWithSettings;
private List<String> languages;
private ProviderType providerType;
private int dimensions;
private int maxTokens;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package ai.dragon.job.silo.embedding;

import lombok.Data;

@Data
public class EmbeddingModelSettings {
private String apiKey;
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,19 @@ public List<Document> listDocuments() throws Exception {
PathMatcher pathMatcher = fileSystemIngestorSettings.getPathMatcher() != null
? FileSystems.getDefault().getPathMatcher(fileSystemIngestorSettings.getPathMatcher())
: FileSystems.getDefault().getPathMatcher(FileSystemIngestorSettings.DEFAULT_PATH_MATCHER);
return fileSystemIngestorSettings.isRecursive()
? FileSystemDocumentLoader.loadDocumentsRecursively(ingestorPathFile.toPath(), pathMatcher, new ApacheTikaDocumentParser())
: FileSystemDocumentLoader.loadDocuments(ingestorPathFile.toPath(), pathMatcher, new ApacheTikaDocumentParser());
List<Document> documents = fileSystemIngestorSettings.isRecursive()
? FileSystemDocumentLoader.loadDocumentsRecursively(ingestorPathFile.toPath(), pathMatcher,
new ApacheTikaDocumentParser())
: FileSystemDocumentLoader.loadDocuments(ingestorPathFile.toPath(), pathMatcher,
new ApacheTikaDocumentParser());
for (Document document : documents) {
File file = new File(document.metadata().getString("absolute_directory_path"),
document.metadata().getString("file_name"));
document.metadata().put("silo_uuid", entity.getUuid().toString());
document.metadata().put("file_date", file.lastModified());
document.metadata().put("file_size", file.length());
}
return documents;
}

public void checkIngestorSettings() throws Exception {
Expand Down
10 changes: 0 additions & 10 deletions backend/src/main/java/ai/dragon/repository/ProviderRepository.java

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ public void clearEmbeddingStore(UUID siloUuid) {
}

private EmbeddingStore<TextSegment> buildEmbeddingStore(SiloEntity siloEntity) {
// TODO
EmbeddingStore<TextSegment> embeddingStore = null;// new InMemoryEmbeddingStore<>();
EmbeddingStore<TextSegment> embeddingStore = null;

switch (siloEntity.getVectorStoreType()) {
case InMemoryEmbeddingStore:
Expand Down
66 changes: 33 additions & 33 deletions backend/src/main/java/ai/dragon/service/IngestorService.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
import org.springframework.stereotype.Service;

import ai.dragon.entity.SiloEntity;
import ai.dragon.job.silo.embedding.EmbeddingModelSettings;
import ai.dragon.job.silo.ingestor.AbstractSiloIngestor;
import ai.dragon.job.silo.ingestor.FileSystemIngestor;
import ai.dragon.job.silo.ingestor.dto.SiloIngestLogMessage;
import ai.dragon.util.IniSettingUtil;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.EmbeddingStoreIngestor;
import jakarta.activation.UnsupportedDataTypeException;
Expand Down Expand Up @@ -45,6 +49,8 @@ private void ingestDocumentsToSilo(List<Document> documents, SiloEntity siloEnti
Consumer<Integer> progressCallback, Consumer<SiloIngestLogMessage> logCallback)
throws Exception {
EmbeddingStore<TextSegment> embeddingStore = embeddingStoreService.retrieveEmbeddingStore(siloEntity.getUuid());
EmbeddingModel embeddingModel = modelForEntity(siloEntity);
EmbeddingStoreIngestor ingestor = buildIngestor(embeddingStore, embeddingModel);
for (int i = 0; i < documents.size(); i++) {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException();
Expand All @@ -53,44 +59,38 @@ private void ingestDocumentsToSilo(List<Document> documents, SiloEntity siloEnti
Document document = documents.get(i);
logCallback.accept(SiloIngestLogMessage.builder()
.message(document.metadata().toString()).build());
/*
* TODO
* public EmbeddingStoreIngestor(DocumentTransformer documentTransformer,
* DocumentSplitter documentSplitter,
* TextSegmentTransformer textSegmentTransformer,
* EmbeddingModel embeddingModel,
* EmbeddingStore<TextSegment> embeddingStore) {
*/
EmbeddingStoreIngestor.ingest(documents, embeddingStore);
ingestor.ingest(documents);
progressCallback.accept(progress);
}
logCallback.accept(SiloIngestLogMessage.builder()
.message("End.").build());
progressCallback.accept(100);

// TODO
/*
* EmbeddingModel embeddingModel = new BgeSmallEnV15QuantizedEmbeddingModel();
* String query = "How to construct a RAG system?";
* Embedding queryEmbedding = embeddingModel.embed(query).content();
*
* Filter onlyForUser1 = metadataKey("userId").isEqualTo("1");
*
* EmbeddingSearchRequest embeddingSearchRequest1 =
* EmbeddingSearchRequest.builder()
* .queryEmbedding(queryEmbedding)
* // .filter(onlyForUser1)
* .build();
*
* EmbeddingSearchResult<TextSegment> embeddingSearchResult1 =
* embeddingStore.search(embeddingSearchRequest1);
* for (EmbeddingMatch<TextSegment> embeddingMatch :
* embeddingSearchResult1.matches()) {
* jobContext().logger().info("=> " + embeddingMatch.score() + " : " +
* embeddingMatch.embedded().metadata());
* jobContext().logger().info(embeddingMatch.embedded().text());
* jobContext().logger().info("=====");
* }
*/
}

private EmbeddingStoreIngestor buildIngestor(EmbeddingStore<TextSegment> embeddingStore,
EmbeddingModel embeddingModel) {
return EmbeddingStoreIngestor.builder()
.documentTransformer(document -> {
document.metadata().put("index_date", System.currentTimeMillis());
return document;
})
// TODO .documentSplitter(DocumentSplitters.recursive(1000, 200, new
// OpenAiTokenizer()))
.documentSplitter(DocumentSplitters.recursive(300, 50))
.textSegmentTransformer(textSegment -> {
textSegment.metadata().put("index_date", System.currentTimeMillis());
return textSegment;
})
.embeddingModel(embeddingModel)
.embeddingStore(embeddingStore)
.build();
}

private EmbeddingModel modelForEntity(SiloEntity siloEntity) throws Exception {
EmbeddingModelSettings embeddingModelSettings = IniSettingUtil.convertIniSettingsToObject(
siloEntity.getEmbeddingModelSettings(), EmbeddingModelSettings.class);
return siloEntity.getEmbeddingModelType().getModelDefinition().getEmbeddingModelWithSettings()
.apply(embeddingModelSettings);
}
}
Loading

0 comments on commit cb1ab43

Please sign in to comment.