diff --git a/src/file-loader.test.ts b/src/file-loader.test.ts
new file mode 100644
index 0000000..9668844
--- /dev/null
+++ b/src/file-loader.test.ts
@@ -0,0 +1,258 @@
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
+/* eslint-disable @typescript-eslint/no-explicit-any */
+
+import { describe, test, expect } from "bun:test";
+import { FileDataLoader } from "./file-loader";
+import type { DatasWithFileSource } from "./database";
+
+/**
+ * Integration tests for FileDataLoader. Each test loads a fixture from ./data,
+ * calls loadFile() to obtain the transform function, and checks the
+ * { data, id, metadata } payloads that function returns.
+ */
+describe("FileDataLoader Integration Tests", () => {
+  describe("PDF Loading", () => {
+    test("should load and transform Wizard of Oz PDF", async () => {
+      const config: DatasWithFileSource = {
+        type: "pdf",
+        fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
+        options: {
+          metadata: {
+            book: "The Wonderful Wizard of Oz",
+            type: "classic literature",
+          },
+        },
+      };
+
+      const loader = new FileDataLoader(config);
+      const loadFunction = await loader.loadFile({});
+      const result = await loadFunction({
+        chunkSize: 1000,
+        chunkOverlap: 200,
+      });
+
+      expect(result.length).toBeGreaterThan(0);
+      expect(result[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining({
+          book: "The Wonderful Wizard of Oz",
+          type: "classic literature",
+          source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
+          timestamp: expect.any(String),
+          paragraphNumber: expect.any(Number),
+        }),
+      });
+
+      const allContent = result.map((document) => document.data).join(" ");
+      expect(allContent).toContain("Dorothy");
+    });
+  });
+
+  describe("CSV Loading", () => {
+    test("should load and transform user info CSV", async () => {
+      const config: DatasWithFileSource = {
+        type: "csv",
+        fileSource: "./data/list_of_user_info.csv",
+        options: {
+          metadata: {
+            dataType: "user_info",
+            version: "1.0",
+          },
+        },
+      };
+
+      const loader = new FileDataLoader(config);
+      const loadFunction = await loader.loadFile({});
+      const result = await loadFunction({});
+
+      expect(result.length).toBeGreaterThan(0);
+      expect(result[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining({
+          dataType: "user_info",
+          version: "1.0",
+        }),
+      });
+
+      for (const document of result) {
+        expect(document.data).toBeTruthy();
+        expect(typeof document.data).toBe("string");
+      }
+    });
+  });
+
+  describe("Text File Loading", () => {
+    test("should load and transform Wizard of Oz summary text", async () => {
+      const chunkSize = 500;
+      const chunkOverlap = 50;
+      const config: DatasWithFileSource = {
+        type: "text-file",
+        fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
+        options: {
+          metadata: {
+            contentType: "summary",
+            subject: "The Wonderful Wizard of Oz",
+          },
+        },
+      };
+
+      const loader = new FileDataLoader(config);
+      const loadFunction = await loader.loadFile({});
+      const result = await loadFunction({
+        chunkSize: chunkSize,
+        chunkOverlap: chunkOverlap,
+      });
+
+      expect(result.length).toBeGreaterThan(0);
+      expect(result[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining({
+          contentType: "summary",
+          subject: "The Wonderful Wizard of Oz",
+        }),
+      });
+
+      for (const document of result) {
+        expect(document.data.length).toBeLessThanOrEqual(chunkSize + chunkOverlap);
+      }
+    });
+  });
+
+  describe("HTML Loading", () => {
+    test("should load and transform Wizard of Oz summary HTML", async () => {
+      const config: DatasWithFileSource = {
+        type: "html",
+        source: "./data/the_wonderful_wizard_of_oz_summary.html",
+        options: {
+          metadata: {
+            format: "html",
+            subject: "The Wonderful Wizard of Oz Summary",
+          },
+        },
+      };
+
+      const loader = new FileDataLoader(config);
+      const loadFunction = await loader.loadFile({});
+      const result = await loadFunction({});
+
+      expect(result.length).toBeGreaterThan(0);
+      expect(result[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining({
+          format: "html",
+          subject: "The Wonderful Wizard of Oz Summary",
+        }),
+      });
+
+      const content = result[0].data;
+      // The extracted text must contain no markup. NOTE(review): the first two
+      // needles were empty strings in the reviewed patch (which .not.toContain can
+      // never satisfy); "<html>" and "<body>" are assumed — confirm.
+      expect(content).not.toContain("<html>");
+      expect(content).not.toContain("<body>");
+      expect(content).not.toContain("<");
+    });
+  });
+
+  describe("Multiple File Types", () => {
+    test("should handle loading different formats with consistent metadata", async () => {
+      const commonMetadata = {
+        project: "Wizard of Oz Analysis",
+        timestamp: new Date().toISOString(),
+      };
+
+      const configs: DatasWithFileSource[] = [
+        {
+          type: "pdf",
+          fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
+          options: { metadata: commonMetadata },
+        },
+        {
+          type: "text-file",
+          fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
+          options: { metadata: commonMetadata },
+        },
+        {
+          type: "html",
+          source: "./data/the_wonderful_wizard_of_oz_summary.html",
+          options: { metadata: commonMetadata },
+        },
+      ];
+
+      const results = await Promise.all(
+        configs.map(async (config) => {
+          const loader = new FileDataLoader(config);
+          const loadFunction = await loader.loadFile({});
+          return loadFunction({});
+        })
+      );
+
+      for (const result of results) {
+        expect(result.length).toBeGreaterThan(0);
+        expect(result[0].metadata).toMatchObject(commonMetadata);
+      }
+
+      const [pdfContent, txtContent, htmlContent] = results.map((r) =>
+        r.map((document) => document.data).join(" ")
+      );
+
+      expect(pdfContent).toContain("Dorothy");
+      expect(txtContent).toContain("Dorothy");
+      expect(htmlContent).toContain("Dorothy");
+    });
+  });
+
+  // Each .rejects assertion below is awaited: without the await the test
+  // callback returns before the assertion has settled.
+  describe("FileDataLoader Error Handling", () => {
+    describe("Missing Files", () => {
+      test("should handle non-existent files", async () => {
+        const config: DatasWithFileSource = {
+          type: "pdf",
+          fileSource: "./data/does_not_exist.pdf",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
+      });
+    });
+
+    describe("Invalid Configurations", () => {
+      test("should error with invalid file type", async () => {
+        const config: DatasWithFileSource = {
+          type: "invalid" as any,
+          fileSource: "./data/some_file.txt",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
+      });
+
+      test("should error with missing required options for processors", async () => {
+        const config: DatasWithFileSource = {
+          fileSource: "test.doc",
+          processor: {
+            options: {},
+          },
+        } as any;
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow();
+      });
+
+      test("should error with invalid file path", async () => {
+        const config: DatasWithFileSource = {
+          type: "pdf",
+          fileSource: "",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow();
+      });
+    });
+  });
+});
diff --git a/src/file-loader.ts b/src/file-loader.ts
index 04772f3..bfdf012 100644
--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -131,27 +131,29 @@ export class FileDataLoader {
       case "pdf": {
         const splitter = new RecursiveCharacterTextSplitter(args);
         const splittedDocuments = await splitter.splitDocuments(documents);
-
-        return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
-          source: metadata.source,
-          timestamp: new Date().toISOString(),
-          paragraphNumber: index + 1,
-          pageNumber: metadata.loc?.pageNumber || undefined,
-          author: metadata.pdf?.info?.Author || undefined,
-          title: metadata.pdf?.info?.Title || undefined,
-          totalPages: metadata.pdf?.totalPages || undefined,
-          language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
-        }));
+        return this.mapDocumentsIntoInsertPayload(
+          splittedDocuments,
+          (metadata: any, index: number) => ({
+            source: metadata.source,
+            timestamp: new Date().toISOString(),
+            paragraphNumber: index + 1,
+            pageNumber: metadata.loc?.pageNumber || undefined,
+            author: metadata.pdf?.info?.Author || undefined,
+            title: metadata.pdf?.info?.Title || undefined,
+            totalPages: metadata.pdf?.totalPages || undefined,
+            language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
+          })
+        );
       }
 
       case "csv": {
-        return mapDocumentsIntoInsertPayload(documents);
+        return this.mapDocumentsIntoInsertPayload(documents);
       }
 
       case "text-file": {
         const splitter = new RecursiveCharacterTextSplitter(args);
         const splittedDocuments = await splitter.splitDocuments(documents);
-        return mapDocumentsIntoInsertPayload(splittedDocuments);
+        return this.mapDocumentsIntoInsertPayload(splittedDocuments);
       }
 
       case "html": {
@@ -162,7 +164,7 @@ export class FileDataLoader {
 
         const newDocuments = await sequence.invoke(documents);
 
-        return mapDocumentsIntoInsertPayload(newDocuments);
+        return this.mapDocumentsIntoInsertPayload(newDocuments);
       }
 
       // Processors will be handled here. E.g. "unstructured", "llama-parse"
@@ -182,16 +184,29 @@ export class FileDataLoader {
         throw new Error(`Unsupported data type: ${this.config.type}`);
       }
     }
+  }
 
-    function mapDocumentsIntoInsertPayload(
-      splittedDocuments: Document[],
-      metadataMapper?: (metadata: any, index: number) => Record<string, any>
-    ) {
-      return splittedDocuments.map((document, index) => ({
-        data: document.pageContent,
-        id: nanoid(),
-        ...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}),
-      }));
-    }
+  /**
+   * Maps documents to insert payloads of the form { data, id, metadata }.
+   * `metadata` is always present: the mapper's fields (if any) come first and
+   * `config.options.metadata` is spread last, so its keys take precedence.
+   *
+   * @param splittedDocuments Documents whose `pageContent` becomes `data`.
+   * @param metadataMapper Optional metadata builder, called with each
+   *   document's metadata and its index in `splittedDocuments`.
+   * @returns One payload per document, each with a fresh `nanoid()` id.
+   */
+  private mapDocumentsIntoInsertPayload(
+    splittedDocuments: Document[],
+    metadataMapper?: (metadata: any, index: number) => Record<string, any>
+  ) {
+    return splittedDocuments.map((document, index) => ({
+      data: document.pageContent,
+      id: nanoid(),
+      metadata: {
+        ...(metadataMapper ? metadataMapper(document.metadata, index) : {}),
+        ...this.config.options?.metadata,
+      },
+    }));
   }
 }