Skip to content

Commit

Permalink
Add custom metadata to uploaded files (#95)
Browse files Browse the repository at this point in the history
* feat(context): allow user to pass a custom metadata to the context

* test(file-loader): add tests

* fix: use bun in tests

---------

Co-authored-by: Ronaldo Lima <[email protected]>
  • Loading branch information
CahidArda and ronal2do authored Nov 19, 2024
1 parent 0a78037 commit 5e4c3de
Show file tree
Hide file tree
Showing 2 changed files with 277 additions and 24 deletions.
248 changes: 248 additions & 0 deletions src/file-loader.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-explicit-any */

import { describe, test, expect } from "bun:test";
import { FileDataLoader } from "./file-loader";
import type { DatasWithFileSource } from "./database";

describe("FileDataLoader Integration Tests", () => {
// PDF fixture: verifies that custom metadata ends up next to the loader-derived
// PDF fields (source, timestamp, paragraphNumber) on the produced chunks.
describe("PDF Loading", () => {
  test("should load and transform Wizard of Oz PDF", async () => {
    const customMetadata = {
      book: "The Wonderful Wizard of Oz",
      type: "classic literature",
    };
    const pdfConfig: DatasWithFileSource = {
      type: "pdf",
      fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
      options: { metadata: customMetadata },
    };

    const pdfLoader = new FileDataLoader(pdfConfig);
    const load = await pdfLoader.loadFile({});
    const chunks = await load({ chunkSize: 1000, chunkOverlap: 200 });

    expect(chunks.length).toBeGreaterThan(0);

    // Only the first chunk is inspected for shape; the rest are checked via content below.
    const expectedMetadata = expect.objectContaining({
      ...customMetadata,
      source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
      timestamp: expect.any(String),
      paragraphNumber: expect.any(Number),
    });
    expect(chunks[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expectedMetadata,
    });

    // The joined text of all chunks must mention the protagonist.
    const fullText = chunks.map((chunk) => chunk.data).join(" ");
    expect(fullText).toContain("Dorothy");
  });
});

// CSV fixture: loaded without chunking options; every produced entry must carry
// the custom metadata and a non-empty string payload.
describe("CSV Loading", () => {
  test("should load and transform user info CSV", async () => {
    const customMetadata = {
      dataType: "user_info",
      version: "1.0",
    };
    const csvConfig: DatasWithFileSource = {
      type: "csv",
      fileSource: "./data/list_of_user_info.csv",
      options: { metadata: customMetadata },
    };

    const csvLoader = new FileDataLoader(csvConfig);
    const load = await csvLoader.loadFile({});
    const rows = await load({});

    expect(rows.length).toBeGreaterThan(0);
    expect(rows[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...customMetadata }),
    });

    // Each row must be a truthy (non-empty) string.
    for (const row of rows) {
      const payload = row.data;
      expect(payload).toBeTruthy();
      expect(typeof payload).toBe("string");
    }
  });
});

// Plain-text fixture: checks custom metadata and that no chunk exceeds the
// configured chunk size plus overlap.
describe("Text File Loading", () => {
  test("should load and transform Wizard of Oz summary text", async () => {
    const chunkSize = 500;
    const chunkOverlap = 50;
    const customMetadata = {
      contentType: "summary",
      subject: "The Wonderful Wizard of Oz",
    };
    const textConfig: DatasWithFileSource = {
      type: "text-file",
      fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
      options: { metadata: customMetadata },
    };

    const textLoader = new FileDataLoader(textConfig);
    const load = await textLoader.loadFile({});
    const chunks = await load({ chunkSize, chunkOverlap });

    expect(chunks.length).toBeGreaterThan(0);
    expect(chunks[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...customMetadata }),
    });

    // Upper bound used by this test for the length of any single chunk.
    const maxChunkLength = chunkSize + chunkOverlap;
    for (const chunk of chunks) {
      expect(chunk.data.length).toBeLessThanOrEqual(maxChunkLength);
    }
  });
});

// HTML fixture: note that the config uses `source` (not `fileSource`); the first
// produced entry must carry the custom metadata and contain no markup.
describe("HTML Loading", () => {
  test("should load and transform Wizard of Oz summary HTML", async () => {
    const customMetadata = {
      format: "html",
      subject: "The Wonderful Wizard of Oz Summary",
    };
    const htmlConfig: DatasWithFileSource = {
      type: "html",
      source: "./data/the_wonderful_wizard_of_oz_summary.html",
      options: { metadata: customMetadata },
    };

    const htmlLoader = new FileDataLoader(htmlConfig);
    const load = await htmlLoader.loadFile({});
    const entries = await load({});

    expect(entries.length).toBeGreaterThan(0);
    expect(entries[0]).toEqual({
      data: expect.any(String),
      id: expect.any(String),
      metadata: expect.objectContaining({ ...customMetadata }),
    });

    // Tags must have been stripped from the first entry's text.
    const firstText = entries[0].data;
    for (const markup of ["<html>", "<body>", "<"]) {
      expect(firstText).not.toContain(markup);
    }
  });
});

// Cross-format check: the same metadata object is attached to PDF, text and HTML
// sources, and must be present on the first entry produced for each of them.
describe("Multiple File Types", () => {
  test("should handle loading different formats with consistent metadata", async () => {
    const commonMetadata = {
      project: "Wizard of Oz Analysis",
      timestamp: new Date().toISOString(),
    };
    const sharedOptions = { metadata: commonMetadata };

    const configs: DatasWithFileSource[] = [
      {
        type: "pdf",
        fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
        options: sharedOptions,
      },
      {
        type: "text-file",
        fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
        options: sharedOptions,
      },
      {
        type: "html",
        source: "./data/the_wonderful_wizard_of_oz_summary.html",
        options: sharedOptions,
      },
    ];

    // Load all three sources concurrently; results keep the order of `configs`.
    const pendingLoads = configs.map(async (config) => {
      const load = await new FileDataLoader(config).loadFile({});
      return load({});
    });
    const results = await Promise.all(pendingLoads);

    for (const entries of results) {
      expect(entries.length).toBeGreaterThan(0);
      expect(entries[0].metadata).toMatchObject(commonMetadata);
    }

    // Flatten each source into a single string for the content checks.
    const joinedTexts = results.map((entries) => entries.map((entry) => entry.data).join(" "));
    const [pdfContent, txtContent, htmlContent] = joinedTexts;

    expect(pdfContent).toContain("Dorothy");
    expect(txtContent).toContain("Dorothy");
    expect(htmlContent).toContain("Dorothy");
  });
});

// Failure-path coverage for FileDataLoader.
//
// `expect(promise).rejects...` is itself asynchronous: it returns a promise that
// settles once the rejection has been inspected. Each test therefore is `async`
// and awaits the assertion; without the await the test callback returns before
// the check runs and the test passes regardless of what `loadFile` does.
describe("FileDataLoader Error Handling", () => {
  describe("Missing Files", () => {
    test("should handle non-existent files", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "./data/does_not_exist.pdf",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
    });
  });

  describe("Invalid Configurations", () => {
    test("should error with invalid file type", async () => {
      const config: DatasWithFileSource = {
        // Deliberately bypass the type checker to simulate a bad runtime config.
        type: "invalid" as any,
        fileSource: "./data/some_file.txt",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
    });

    test("should error with missing required options for processors", async () => {
      // No `type` and an empty processor options object: cast needed to compile.
      const config: DatasWithFileSource = {
        fileSource: "test.doc",
        processor: {
          options: {},
        },
      } as any;

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });

    test("should error with invalid file path", async () => {
      const config: DatasWithFileSource = {
        type: "pdf",
        fileSource: "",
      };

      const loader = new FileDataLoader(config);
      await expect(loader.loadFile({})).rejects.toThrow();
    });
  });
});
});
53 changes: 29 additions & 24 deletions src/file-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -131,27 +131,29 @@ export class FileDataLoader {
case "pdf": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);

return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
}));
return this.mapDocumentsIntoInsertPayload(
splittedDocuments,
(metadata: any, index: number) => ({
source: metadata.source,
timestamp: new Date().toISOString(),
paragraphNumber: index + 1,
pageNumber: metadata.loc?.pageNumber || undefined,
author: metadata.pdf?.info?.Author || undefined,
title: metadata.pdf?.info?.Title || undefined,
totalPages: metadata.pdf?.totalPages || undefined,
language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
})
);
}

case "csv": {
return mapDocumentsIntoInsertPayload(documents);
return this.mapDocumentsIntoInsertPayload(documents);
}

case "text-file": {
const splitter = new RecursiveCharacterTextSplitter(args);
const splittedDocuments = await splitter.splitDocuments(documents);
return mapDocumentsIntoInsertPayload(splittedDocuments);
return this.mapDocumentsIntoInsertPayload(splittedDocuments);
}

case "html": {
Expand All @@ -162,7 +164,7 @@ export class FileDataLoader {

const newDocuments = await sequence.invoke(documents);

return mapDocumentsIntoInsertPayload(newDocuments);
return this.mapDocumentsIntoInsertPayload(newDocuments);
}

// Processors will be handled here. E.g. "unstructured", "llama-parse"
Expand All @@ -182,17 +184,20 @@ export class FileDataLoader {
throw new Error(`Unsupported data type: ${this.config.type}`);
}
}
}

/**
 * Turns documents into insert payloads of the form `{ data, id }`, where `data`
 * is the document's `pageContent` and `id` is a freshly generated `nanoid()`.
 *
 * @param splittedDocuments Documents to convert, one payload per document.
 * @param metadataMapper Optional callback receiving the document's metadata and
 *   its index; when given, its result is attached as `metadata`. When omitted,
 *   the payload has no `metadata` key at all.
 * @returns The payloads, in the same order as the input documents.
 */
function mapDocumentsIntoInsertPayload(
  splittedDocuments: Document[],
  metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
  return splittedDocuments.map((document, index) => {
    const data = document.pageContent;
    const id = nanoid();
    // Empty object when no mapper is supplied, so the spread below adds nothing.
    const optionalMetadata = metadataMapper
      ? { metadata: metadataMapper(document.metadata, index) }
      : {};

    return { data, id, ...optionalMetadata };
  });
}
/**
 * Turns documents into insert payloads of the form `{ data, id, metadata }`.
 *
 * `metadata` is always present. It merges the result of `metadataMapper` (if
 * provided) with `this.config.options?.metadata`; the configured metadata is
 * spread last, so on a key collision it replaces the mapper's value.
 *
 * @param splittedDocuments Documents to convert, one payload per document.
 * @param metadataMapper Optional callback receiving the document's metadata and
 *   its index, returning metadata derived from the document.
 * @returns The payloads, in the same order as the input documents.
 */
private mapDocumentsIntoInsertPayload(
  splittedDocuments: Document[],
  metadataMapper?: (metadata: any, index: number) => Record<string, any>
) {
  return splittedDocuments.map((document, index) => {
    const data = document.pageContent;
    const id = nanoid();
    const derivedMetadata = metadataMapper ? metadataMapper(document.metadata, index) : {};

    // Spread order is significant: configured metadata overrides derived keys.
    const metadata = {
      ...derivedMetadata,
      ...this.config.options?.metadata,
    };

    return { data, id, metadata };
  });
}
}

Expand Down

0 comments on commit 5e4c3de

Please sign in to comment.