Skip to content

Commit

Permalink
feat(chat): sync with master
Browse files Browse the repository at this point in the history
  • Loading branch information
ronal2do committed Dec 21, 2024
2 parents e3b0e55 + 5e4c3de commit 4cf4a53
Show file tree
Hide file tree
Showing 7 changed files with 344 additions and 24 deletions.
1 change: 1 addition & 0 deletions src/context-service/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ export class ContextService {
similarityThreshold: optionsWithDefault.similarityThreshold,
topK: optionsWithDefault.topK,
namespace: optionsWithDefault.namespace,
contextFilter: optionsWithDefault.contextFilter,
});

// Log the result, which will be captured by the outer traceable
Expand Down
3 changes: 3 additions & 0 deletions src/database.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ export type VectorPayload = {
similarityThreshold?: number;
topK?: number;
namespace?: string;
contextFilter?: string;
};

export type ResetOptions = {
Expand Down Expand Up @@ -106,6 +107,7 @@ export class Database {
similarityThreshold = DEFAULT_SIMILARITY_THRESHOLD,
topK = DEFAULT_TOP_K,
namespace,
contextFilter,
}: VectorPayload): Promise<{ data: string; id: string; metadata: TMetadata }[]> {
const index = this.index;
const result = await index.query<Record<string, string>>(
Expand All @@ -114,6 +116,7 @@ export class Database {
topK,
includeData: true,
includeMetadata: true,
...(typeof contextFilter === "string" && { filter: contextFilter }),
},
{ namespace }
);
Expand Down
248 changes: 248 additions & 0 deletions src/file-loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-explicit-any */

import { describe, test, expect } from "bun:test";
import { FileDataLoader } from "./file-loader";
import type { DatasWithFileSource } from "./database";

describe("FileDataLoader Integration Tests", () => {
describe("PDF Loading", () => {
  // End-to-end check: read the sample PDF from disk, split it into chunks and
  // confirm both the payload shape and that recognisable book text is present.
  test("should load and transform Wizard of Oz PDF", async () => {
    const pdfConfig: DatasWithFileSource = {
      type: "pdf",
      fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
      options: {
        metadata: {
          book: "The Wonderful Wizard of Oz",
          type: "classic literature",
        },
      },
    };

    const transform = await new FileDataLoader(pdfConfig).loadFile({});
    const chunks = await transform({ chunkSize: 1000, chunkOverlap: 200 });

    expect(chunks.length).toBeGreaterThan(0);

    // The first chunk must carry the caller-supplied metadata alongside the
    // source / timestamp / paragraphNumber fields.
    const expectedMetadata = expect.objectContaining({
      book: "The Wonderful Wizard of Oz",
      type: "classic literature",
      source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
      timestamp: expect.any(String),
      paragraphNumber: expect.any(Number),
    });
    const [firstChunk] = chunks;
    expect(firstChunk).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expectedMetadata,
    });

    // Sanity check on the extracted text itself.
    const combinedText = chunks.map((chunk) => chunk.data).join(" ");
    expect(combinedText).toContain("Dorothy");
  });
});

describe("CSV Loading", () => {
  // Loads the sample CSV with default transform arguments and verifies that
  // every produced entry has non-empty string data and the caller's metadata.
  test("should load and transform user info CSV", async () => {
    const csvConfig: DatasWithFileSource = {
      type: "csv",
      fileSource: "./data/list_of_user_info.csv",
      options: {
        metadata: {
          dataType: "user_info",
          version: "1.0",
        },
      },
    };

    const transform = await new FileDataLoader(csvConfig).loadFile({});
    const entries = await transform({});

    expect(entries.length).toBeGreaterThan(0);

    const [firstEntry] = entries;
    expect(firstEntry).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({
        dataType: "user_info",
        version: "1.0",
      }),
    });

    // Every entry, not just the first, must expose non-empty string data.
    entries.forEach((entry) => {
      expect(entry.data).toBeTruthy();
      expect(typeof entry.data).toBe("string");
    });
  });
});

describe("Text File Loading", () => {
  // Loads the plain-text summary, splits it with explicit chunk settings and
  // verifies metadata propagation plus an upper bound on chunk length.
  test("should load and transform Wizard of Oz summary text", async () => {
    const chunkSize = 500;
    const chunkOverlap = 50;

    const textConfig: DatasWithFileSource = {
      type: "text-file",
      fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
      options: {
        metadata: {
          contentType: "summary",
          subject: "The Wonderful Wizard of Oz",
        },
      },
    };

    const transform = await new FileDataLoader(textConfig).loadFile({});
    const chunks = await transform({ chunkSize, chunkOverlap });

    expect(chunks.length).toBeGreaterThan(0);

    const [firstChunk] = chunks;
    expect(firstChunk).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({
        contentType: "summary",
        subject: "The Wonderful Wizard of Oz",
      }),
    });

    // No chunk may be longer than the configured size plus the overlap.
    const maxChunkLength = chunkSize + chunkOverlap;
    chunks.forEach((chunk) => {
      expect(chunk.data.length).toBeLessThanOrEqual(maxChunkLength);
    });
  });
});

describe("HTML Loading", () => {
  // Loads the HTML summary and verifies that the first resulting document
  // carries the caller's metadata and contains no leftover markup.
  test("should load and transform Wizard of Oz summary HTML", async () => {
    const htmlConfig: DatasWithFileSource = {
      type: "html",
      source: "./data/the_wonderful_wizard_of_oz_summary.html",
      options: {
        metadata: {
          format: "html",
          subject: "The Wonderful Wizard of Oz Summary",
        },
      },
    };

    const transform = await new FileDataLoader(htmlConfig).loadFile({});
    const documents = await transform({});

    expect(documents.length).toBeGreaterThan(0);

    const [firstDocument] = documents;
    expect(firstDocument).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({
        format: "html",
        subject: "The Wonderful Wizard of Oz Summary",
      }),
    });

    // Checked in the same order as before: specific tags first, then any "<".
    const firstDocumentText = documents[0].data;
    for (const markupFragment of ["<html>", "<body>", "<"]) {
      expect(firstDocumentText).not.toContain(markupFragment);
    }
  });
});

describe("Multiple File Types", () => {
  // Runs the PDF, text and HTML loaders concurrently with identical metadata
  // and verifies that each result keeps that metadata and the expected text.
  test("should handle loading different formats with consistent metadata", async () => {
    const commonMetadata = {
      project: "Wizard of Oz Analysis",
      timestamp: new Date().toISOString(),
    };

    const configs: DatasWithFileSource[] = [
      {
        type: "pdf",
        fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
        options: { metadata: commonMetadata },
      },
      {
        type: "text-file",
        fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
        options: { metadata: commonMetadata },
      },
      {
        type: "html",
        source: "./data/the_wonderful_wizard_of_oz_summary.html",
        options: { metadata: commonMetadata },
      },
    ];

    // Load and transform a single config using default arguments.
    const loadWithDefaults = async (config: DatasWithFileSource) => {
      const transform = await new FileDataLoader(config).loadFile({});
      return transform({});
    };

    const results = await Promise.all(configs.map((config) => loadWithDefaults(config)));

    for (const documents of results) {
      expect(documents.length).toBeGreaterThan(0);
      expect(documents[0].metadata).toMatchObject(commonMetadata);
    }

    // Results come back in the same order as `configs`: pdf, text, html.
    const joinedContents = results.map((documents) =>
      documents.map((document) => document.data).join(" ")
    );
    const [pdfContent, txtContent, htmlContent] = joinedContents;

    expect(pdfContent).toContain("Dorothy");
    expect(txtContent).toContain("Dorothy");
    expect(htmlContent).toContain("Dorothy");
  });
});

describe("FileDataLoader Error Handling", () => {
  // Every test here asserts on a rejected promise. `expect(...).rejects`
  // returns a promise of its own, so each test callback is `async` and awaits
  // the assertion; without the `await` the test can finish before the
  // rejection is ever checked.
  describe("Missing Files", () => {
    test("should handle non-existent files", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "./data/does_not_exist.pdf",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
    });
  });

  describe("Invalid Configurations", () => {
    test("should error with invalid file type", async () => {
      // `as any` is deliberate: the type system would otherwise reject the
      // invalid `type` value this test needs to pass in.
      const config: DatasWithFileSource = {
        type: "invalid" as any,
        fileSource: "./data/some_file.txt",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
    });

    test("should error with missing required options for processors", async () => {
      // Processor config with an empty `options` object; cast because the
      // shape is intentionally incomplete.
      const config: DatasWithFileSource = {
        fileSource: "test.doc",
        processor: {
          options: {},
        },
      } as any;

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });

    test("should error with invalid file path", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });
  });
});
});
53 changes: 29 additions & 24 deletions src/file-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,27 +131,29 @@ export class FileDataLoader {
case "pdf": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);

return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
}));
return this.mapDocumentsIntoInsertPayload(
splittedDocuments,
(metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
})
);
}

case "csv": {
return mapDocumentsIntoInsertPayload(documents);
return this.mapDocumentsIntoInsertPayload(documents);
}

case "text-file": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);
return mapDocumentsIntoInsertPayload(splittedDocuments);
return this.mapDocumentsIntoInsertPayload(splittedDocuments);
}

case "html": {
Expand All @@ -162,7 +164,7 @@ export class FileDataLoader {

const newDocuments = await sequence.invoke(documents);

return mapDocumentsIntoInsertPayload(newDocuments);
return this.mapDocumentsIntoInsertPayload(newDocuments);
}

// Processors will be handled here. E.g. "unstructured", "llama-parse"
Expand All @@ -182,17 +184,20 @@ export class FileDataLoader {
throw new Error(`Unsupported data type: ${this.config.type}`);
}
}
}

function mapDocumentsIntoInsertPayload(
splittedDocuments: Document[],
metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
return splittedDocuments.map((document, index) => ({
data: document.pageContent,
id: nanoid(),
...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}),
}));
}
/**
 * Converts documents into insert payloads of the form `{ data, id, metadata }`.
 *
 * `data` is the document's `pageContent` and `id` is a fresh `nanoid()` value
 * generated per document. `metadata` is always present: it merges the result
 * of `metadataMapper` (when supplied) with `this.config.options?.metadata`.
 * The config metadata is spread last, so its keys win on collision.
 *
 * @param splittedDocuments - documents to convert
 * @param metadataMapper - optional callback that receives each document's
 *   `metadata` and its index and returns the derived metadata fields
 * @returns one payload object per input document, in input order
 */
private mapDocumentsIntoInsertPayload(
splittedDocuments: Document[],
metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
return splittedDocuments.map((document, index) => ({
data: document.pageContent,
id: nanoid(),
metadata: {
// Mapper-derived fields first, then caller-configured metadata overrides them.
...(metadataMapper ? metadataMapper(document.metadata, index) : {}),
...this.config.options?.metadata,
},
}));
}
}

Expand Down
Loading

0 comments on commit 4cf4a53

Please sign in to comment.