Add custom metadata to uploaded files (#95)

* feat(context): allow the user to pass custom metadata to the context
* test(file-loader): add tests
* fix: use bun in tests

---------

Co-authored-by: Ronaldo Lima <[email protected]>
upstash · Nov 19, 2024 · 5e4c3de · 5e4c3de
1 parent 0a78037
commit 5e4c3de
Show file tree

Hide file tree

Showing 2 changed files with 277 additions and 24 deletions.
diff --git a/src/file-loader.test.ts b/src/file-loader.test.ts
@@ -0,0 +1,248 @@
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
+/* eslint-disable @typescript-eslint/no-explicit-any */
+
+import { describe, test, expect } from "bun:test";
+import { FileDataLoader } from "./file-loader";
+import type { DatasWithFileSource } from "./database";
+
+describe("FileDataLoader Integration Tests", () => {
+  describe("PDF Loading", () => {
+    // Loads the PDF at ./data and checks that the caller-supplied metadata ends
+    // up next to the source / timestamp / paragraphNumber fields on the chunk.
+    test("should load and transform Wizard of Oz PDF", async () => {
+      const customMetadata = {
+        book: "The Wonderful Wizard of Oz",
+        type: "classic literature",
+      };
+      const pdfConfig: DatasWithFileSource = {
+        type: "pdf",
+        fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
+        options: { metadata: customMetadata },
+      };
+
+      const transform = await new FileDataLoader(pdfConfig).loadFile({});
+      const chunks = await transform({ chunkSize: 1000, chunkOverlap: 200 });
+
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(chunks[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining({
+          ...customMetadata,
+          source: expect.stringContaining("the_wonderful_wizard_of_oz.pdf"),
+          timestamp: expect.any(String),
+          paragraphNumber: expect.any(Number),
+        }),
+      });
+
+      // The protagonist's name must appear somewhere in the extracted text.
+      const combinedText = chunks.map(({ data }) => data).join(" ");
+      expect(combinedText).toContain("Dorothy");
+    });
+  });
+
+  describe("CSV Loading", () => {
+    // Loads the CSV at ./data and checks that the caller-supplied metadata is
+    // attached and that every returned document has non-empty string data.
+    test("should load and transform user info CSV", async () => {
+      const customMetadata = {
+        dataType: "user_info",
+        version: "1.0",
+      };
+      const csvConfig: DatasWithFileSource = {
+        type: "csv",
+        fileSource: "./data/list_of_user_info.csv",
+        options: { metadata: customMetadata },
+      };
+
+      const transform = await new FileDataLoader(csvConfig).loadFile({});
+      const rows = await transform({});
+
+      expect(rows.length).toBeGreaterThan(0);
+      expect(rows[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining(customMetadata),
+      });
+
+      for (const { data } of rows) {
+        expect(data).toBeTruthy();
+        expect(typeof data).toBe("string");
+      }
+    });
+  });
+
+  describe("Text File Loading", () => {
+    // Splits the summary text file into chunks and checks both the attached
+    // custom metadata and an upper bound on each chunk's length.
+    test("should load and transform Wizard of Oz summary text", async () => {
+      const splitterArgs = { chunkSize: 500, chunkOverlap: 50 };
+      const customMetadata = {
+        contentType: "summary",
+        subject: "The Wonderful Wizard of Oz",
+      };
+      const textConfig: DatasWithFileSource = {
+        type: "text-file",
+        fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
+        options: { metadata: customMetadata },
+      };
+
+      const transform = await new FileDataLoader(textConfig).loadFile({});
+      const chunks = await transform(splitterArgs);
+
+      expect(chunks.length).toBeGreaterThan(0);
+      expect(chunks[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining(customMetadata),
+      });
+
+      // The bound used here is chunkSize plus chunkOverlap.
+      const maxChunkLength = splitterArgs.chunkSize + splitterArgs.chunkOverlap;
+      for (const { data } of chunks) {
+        expect(data.length).toBeLessThanOrEqual(maxChunkLength);
+      }
+    });
+  });
+
+  describe("HTML Loading", () => {
+    // Checks that the HTML summary comes back with the custom metadata attached
+    // and that the first document's text contains no markup.
+    test("should load and transform Wizard of Oz summary HTML", async () => {
+      const customMetadata = {
+        format: "html",
+        subject: "The Wonderful Wizard of Oz Summary",
+      };
+      const htmlConfig: DatasWithFileSource = {
+        type: "html",
+        source: "./data/the_wonderful_wizard_of_oz_summary.html",
+        options: { metadata: customMetadata },
+      };
+
+      const transform = await new FileDataLoader(htmlConfig).loadFile({});
+      const pages = await transform({});
+
+      expect(pages.length).toBeGreaterThan(0);
+      expect(pages[0]).toEqual({
+        data: expect.any(String),
+        id: expect.any(String),
+        metadata: expect.objectContaining(customMetadata),
+      });
+
+      // Same three checks as before, in the same order.
+      const text = pages[0].data;
+      for (const markup of ["<html>", "<body>", "<"]) {
+        expect(text).not.toContain(markup);
+      }
+    });
+  });
+
+  describe("Multiple File Types", () => {
+    // Loads a PDF, a text file and an HTML file with one shared metadata object
+    // and checks each result carries that metadata and mentions "Dorothy".
+    test("should handle loading different formats with consistent metadata", async () => {
+      const sharedMetadata = {
+        project: "Wizard of Oz Analysis",
+        timestamp: new Date().toISOString(),
+      };
+
+      const configs: DatasWithFileSource[] = [
+        {
+          type: "pdf",
+          fileSource: "./data/the_wonderful_wizard_of_oz.pdf",
+          options: { metadata: sharedMetadata },
+        },
+        {
+          type: "text-file",
+          fileSource: "./data/the_wonderful_wizard_of_oz_summary.txt",
+          options: { metadata: sharedMetadata },
+        },
+        {
+          type: "html",
+          source: "./data/the_wonderful_wizard_of_oz_summary.html",
+          options: { metadata: sharedMetadata },
+        },
+      ];
+
+      // Runs one config through the loader with default transform arguments.
+      const loadDocuments = async (config: DatasWithFileSource) => {
+        const transform = await new FileDataLoader(config).loadFile({});
+        return transform({});
+      };
+      const loaded = await Promise.all(configs.map((config) => loadDocuments(config)));
+
+      for (const documents of loaded) {
+        expect(documents.length).toBeGreaterThan(0);
+        expect(documents[0].metadata).toMatchObject(sharedMetadata);
+      }
+
+      // One joined string per format, in the same order as `configs`.
+      const joinedTexts = loaded.map((documents) =>
+        documents.map(({ data }) => data).join(" ")
+      );
+      for (const text of joinedTexts) {
+        expect(text).toContain("Dorothy");
+      }
+    });
+  });
+
+  describe("FileDataLoader Error Handling", () => {
+    // Every `.rejects` assertion in this group is awaited inside an async test.
+    // Without the await, the test callback returns before the promise settles,
+    // so the assertion is a floating promise and the test can pass without the
+    // expectation ever being checked against it.
+    describe("Missing Files", () => {
+      test("should handle non-existent files", async () => {
+        const config: DatasWithFileSource = {
+          type: "pdf",
+          fileSource: "./data/does_not_exist.pdf",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow(/no such file/i);
+      });
+    });
+
+    describe("Invalid Configurations", () => {
+      test("should error with invalid file type", async () => {
+        const config: DatasWithFileSource = {
+          type: "invalid" as any,
+          fileSource: "./data/some_file.txt",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow(/unsupported data type/i);
+      });
+
+      test("should error with missing required options for processors", async () => {
+        // Deliberately malformed config (no `type`, empty processor options),
+        // hence the cast.
+        const config: DatasWithFileSource = {
+          fileSource: "test.doc",
+          processor: {
+            options: {},
+          },
+        } as any;
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow();
+      });
+
+      test("should error with invalid file path", async () => {
+        const config: DatasWithFileSource = {
+          type: "pdf",
+          fileSource: "",
+        };
+
+        const loader = new FileDataLoader(config);
+        await expect(loader.loadFile({})).rejects.toThrow();
+      });
+    });
+  });
+});
diff --git a/src/file-loader.ts b/src/file-loader.ts
@@ -131,27 +131,29 @@ export class FileDataLoader {
       case "pdf": {
         const splitter = new RecursiveCharacterTextSplitter(args);
         const splittedDocuments = await splitter.splitDocuments(documents);
-
-        return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
-          source: metadata.source,
-          timestamp: new Date().toISOString(),
-          paragraphNumber: index + 1,
-          pageNumber: metadata.loc?.pageNumber || undefined,
-          author: metadata.pdf?.info?.Author || undefined,
-          title: metadata.pdf?.info?.Title || undefined,
-          totalPages: metadata.pdf?.totalPages || undefined,
-          language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
-        }));
+        return this.mapDocumentsIntoInsertPayload(
+          splittedDocuments,
+          (metadata: any, index: number) => ({
+            source: metadata.source,
+            timestamp: new Date().toISOString(),
+            paragraphNumber: index + 1,
+            pageNumber: metadata.loc?.pageNumber || undefined,
+            author: metadata.pdf?.info?.Author || undefined,
+            title: metadata.pdf?.info?.Title || undefined,
+            totalPages: metadata.pdf?.totalPages || undefined,
+            language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
+          })
+        );
       }
 
       case "csv": {
-        return mapDocumentsIntoInsertPayload(documents);
+        return this.mapDocumentsIntoInsertPayload(documents);
       }
 
       case "text-file": {
         const splitter = new RecursiveCharacterTextSplitter(args);
         const splittedDocuments = await splitter.splitDocuments(documents);
-        return mapDocumentsIntoInsertPayload(splittedDocuments);
+        return this.mapDocumentsIntoInsertPayload(splittedDocuments);
       }
 
       case "html": {
@@ -162,7 +164,7 @@ export class FileDataLoader {
 
         const newDocuments = await sequence.invoke(documents);
 
-        return mapDocumentsIntoInsertPayload(newDocuments);
+        return this.mapDocumentsIntoInsertPayload(newDocuments);
       }
 
       // Processors will be handled here. E.g. "unstructured", "llama-parse"
@@ -182,17 +184,20 @@ export class FileDataLoader {
         throw new Error(`Unsupported data type: ${this.config.type}`);
       }
     }
+  }
 
-    function mapDocumentsIntoInsertPayload(
-      splittedDocuments: Document[],
-      metadataMapper?: (metadata: any, index: number) => Record<string, any>
-    ) {
-      return splittedDocuments.map((document, index) => ({
-        data: document.pageContent,
-        id: nanoid(),
-        ...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}),
-      }));
-    }
+  /**
+   * Turns documents into insert payloads of the shape `{ data, id, metadata }`.
+   *
+   * `data` is the document's `pageContent` and `id` is a fresh `nanoid()`.
+   * `metadata` is the mapper's output (or an empty object when no mapper is
+   * given) with `this.config.options?.metadata` spread on top, so custom
+   * metadata overrides mapper-generated keys of the same name.
+   *
+   * @param splittedDocuments Documents to convert, one payload per document.
+   * @param metadataMapper Optional function deriving metadata from a document's
+   *   own metadata and its index in `splittedDocuments`.
+   * @returns One payload per input document, in the same order.
+   */
+  private mapDocumentsIntoInsertPayload(
+    splittedDocuments: Document[],
+    metadataMapper?: (metadata: any, index: number) => Record<string, any>
+  ) {
+    return splittedDocuments.map((document, index) => {
+      const data = document.pageContent;
+      const id = nanoid();
+      const mappedMetadata = metadataMapper ? metadataMapper(document.metadata, index) : {};
+      const metadata = { ...mappedMetadata, ...this.config.options?.metadata };
+
+      return { data, id, metadata };
+    });
   }
 }