Skip to content

Commit

Permalink
Merge pull request #61 from upstash/add-unstructured-loader
Browse files Browse the repository at this point in the history
feat: add unstructured loader
  • Loading branch information
ogzhanolguncu authored Aug 26, 2024
2 parents 2800e0d + 365b72b commit abd1c56
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 9 deletions.
Binary file modified bun.lockb
Binary file not shown.
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
},
"dependencies": {
"@ai-sdk/openai": "^0.0.44",
"@langchain/community": "^0.2.13",
"@langchain/community": "^0.2.28",
"@langchain/core": "^0.2.9",
"@upstash/vector": "^1.1.3",
"ai": "^3.1.1",
Expand All @@ -69,7 +69,8 @@
"html-to-text": "^9.0.5",
"langchain": "^0.2.0",
"nanoid": "^5.0.7",
"pdf-parse": "^1.1.1"
"pdf-parse": "^1.1.1",
"unstructured-client": "^0.15.1"
},
"peerDependencies": {
"@upstash/redis": "^1.31.3",
Expand Down
14 changes: 13 additions & 1 deletion src/database.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,23 @@ import { nanoid } from "nanoid";
import { DEFAULT_SIMILARITY_THRESHOLD, DEFAULT_TOP_K } from "./constants";
import { FileDataLoader } from "./file-loader";
import type { AddContextOptions } from "./types";
import type { UnstructuredLoaderOptions } from "@langchain/community/document_loaders/fs/unstructured";

export type FilePath = string;
export type URL = string;

/**
 * Describes an external document processor attached to a file source,
 * together with the settings passed along to it.
 */
export type ProcessorType = {
/** Literal tag naming the processor; "unstructured" is the only value declared here. */
name: "unstructured";
/** Processor settings, typed as `UnstructuredLoaderOptions` from `@langchain/community`. */
options: UnstructuredLoaderOptions;
};

export type DatasWithFileSource =
| {
type?: "pdf" | "csv" | "text-file" | "html";
fileSource: FilePath;
options?: AddContextOptions;
processor: ProcessorType;
}
| {
type: "pdf";
fileSource: FilePath | Blob;
Expand Down Expand Up @@ -162,7 +174,7 @@ export class Database {
} else {
try {
const fileArgs =
"pdfOpts" in input ? input.pdfOpts : "csvOpts" in input ? input.csvOpts : {};
"pdfConfig" in input ? input.pdfConfig : "csvConfig" in input ? input.csvConfig : {};

const transformOrSplit = await new FileDataLoader(input).loadFile(fileArgs);

Expand Down
84 changes: 78 additions & 6 deletions src/file-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,19 @@ import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio";
import { HtmlToTextTransformer } from "@langchain/community/document_transformers/html_to_text";
import type { Document } from "@langchain/core/documents";
import { Document } from "@langchain/core/documents";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { nanoid } from "nanoid";
import type { DatasWithFileSource, FilePath, URL } from "./database";
import { UnstructuredClient } from "unstructured-client";
import type { DatasWithFileSource, FilePath, ProcessorType, URL } from "./database";

/**
 * Local shape used for the entries of `response.elements` returned by
 * `client.general.partition`, after they have been filtered down to entries
 * whose `text` is a string.
 */
type Element = {
/** Element kind; copied into each produced document's metadata as `category`. */
type: string;
/** Text of the element; becomes the `pageContent` of the produced document. */
text: string;
// this is purposefully loosely typed
/** Arbitrary metadata; spread into the produced document's metadata unchanged. */
metadata: Record<string, unknown>;
};

export class FileDataLoader {
private config: DatasWithFileSource;
Expand All @@ -21,12 +29,60 @@ export class FileDataLoader {

/**
 * Builds a loader for the configured source, loads its documents, and
 * returns a function that transforms those documents on demand.
 *
 * @param args - Loader arguments, forwarded as-is to `createLoader`.
 * @returns A function that takes transform arguments and calls
 *   `transformDocument` with the already-loaded documents.
 */
async loadFile(args: any) {
const loader = this.createLoader(args);
// NOTE(review): this listing appears to be a rendered diff without +/- markers;
// the next statement looks like the pre-change line that the two statements
// after it replace (both declare `documents`) — confirm against the real file.
const documents = await loader.load();
// `createLoader` is declared `async` below, so its result is awaited before `.load()` is called.
const _loader = await loader;
const documents = await _loader.load();

// Loading happens once here; the returned closure only transforms the loaded documents.
return (args: any) => this.transformDocument(documents, args);
}

private createLoader(args: any) {
private async createLoader(args: any) {
if (hasProcessor(this.config)) {
const client = new UnstructuredClient({
serverURL: "https://api.unstructuredapp.io",
security: {
apiKeyAuth: this.config.processor.options.apiKey,
},
});

//@ts-expect-error TS can't pick up the correct type due to complex union
const fileData = await Bun.file(this.config.fileSource).text();
const response = await client.general.partition({
//@ts-expect-error Will be fixed soon
partitionParameters: {
files: {
content: fileData,
//@ts-expect-error TS can't pick up the correct type due to complex union
fileName: this.config.fileSource,
},
...this.config.processor.options,
},
});
const elements = response.elements?.filter(
(element) => typeof element.text === "string"
) as Element[];

return {
// eslint-disable-next-line @typescript-eslint/require-await
load: async (): Promise<Document[]> => {
const documents: Document[] = [];
for (const element of elements) {
const { metadata, text } = element;
if (typeof text === "string" && text !== "") {
documents.push(
new Document({
pageContent: text,
metadata: {
...metadata,
category: element.type,
},
})
);
}
}
return documents;
},
};
}
switch (this.config.type) {
case "pdf": {
return new PDFLoader(
Expand All @@ -53,7 +109,7 @@ export class FileDataLoader {
}

default: {
// @ts-expect-error config type is set as never
//@ts-expect-error TS can't pick up the correct type due to complex union
throw new Error(`Unsupported data type: ${this.config.type}`);
}
}
Expand Down Expand Up @@ -87,7 +143,6 @@ export class FileDataLoader {

case "text-file": {
const splitter = new RecursiveCharacterTextSplitter(args);

const splittedDocuments = await splitter.splitDocuments(documents);
return mapDocumentsIntoInsertPayload(splittedDocuments);
}
Expand All @@ -103,6 +158,17 @@ export class FileDataLoader {
return mapDocumentsIntoInsertPayload(newDocuments);
}

case undefined: {
const documents_ = documents.map(
(item) => new Document({ pageContent: item.pageContent, metadata: item.metadata })
);
return documents_.map((document) => ({
data: document.pageContent,
metadata: document.metadata,
id: nanoid(),
}));
}

default: {
// @ts-expect-error config type is set as never
throw new Error(`Unsupported data type: ${this.config.type}`);
Expand All @@ -121,3 +187,9 @@ export class FileDataLoader {
}
}
}

/**
 * Type guard reporting whether a file-source config carries a usable
 * `processor` entry.
 *
 * A config qualifies only when it has a `processor` property whose value is
 * a non-null object containing an `options` key.
 *
 * @param data - The file-source configuration to inspect.
 * @returns `true` when `data.processor` is a non-null object with an
 *   `options` key; `false` otherwise. Never throws for a `null` processor.
 */
function hasProcessor(
  data: DatasWithFileSource
): data is DatasWithFileSource & { processor: ProcessorType } {
  return (
    "processor" in data &&
    typeof data.processor === "object" &&
    // `typeof null` is "object" and `"options" in null` throws a TypeError,
    // so null has to be ruled out before the `in` check below.
    data.processor !== null &&
    "options" in data.processor
  );
}

0 comments on commit abd1c56

Please sign in to comment.