diff --git a/README.md b/README.md index 5fbc4b37..6e290b8e 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ The author(s) are looking to add core maintainers for this opensource project. R - [Sitemap](#sitemap) - [Text](#text) - [Json](#json) + - [Csv](#csv) - [Add a custom loader](#add-a-custom-loader) - [More loaders coming soon](#more-loaders-coming-soon) - [LLMs](#llms) @@ -407,6 +408,16 @@ To add a parsed Javascript object to your embeddings, use `JsonLoader`. The libr **Note:** if you want to restrict the keys that get added to the vectorDb in a dynamically obtained object, you can use the `pickKeysForEmbedding` optional parameter in the `JsonLoader` constructor. +## Csv + +To add a Csv file (or URL) to your embeddings, use `CsvLoader`. The library will parse the Csv and add each row to its vector database. + +```TS +.addLoader(new CsvLoader({ filePathOrUrl: '...' })) +``` + +**Note:** You can control how the `CsvLoader` parses the file in great detail by passing in the optional `csvParseOptions` constructor parameter. + ## Add a custom loader You can pass along a custom loader to the `addLoader` method by extending and implementing the abstract class `BaseLoader`. 
Here's how that would look like - diff --git a/package-lock.json b/package-lock.json index d938e96f..048493f9 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,29 +1,31 @@ { "name": "@llm-tools/embedjs", - "version": "0.0.81", + "version": "0.0.82", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@llm-tools/embedjs", - "version": "0.0.81", + "version": "0.0.82", "license": "Apache-2.0", "dependencies": { "@huggingface/inference": "^2.7.0", "@langchain/anthropic": "^0.1.21", "@langchain/cohere": "^0.0.10", "@langchain/community": "^0.2.4", - "@langchain/core": "^0.2.4", + "@langchain/core": "^0.2.5", "@langchain/google-vertexai": "^0.0.17", - "@langchain/mistralai": "^0.0.22", - "@langchain/openai": "^0.0.34", + "@langchain/mistralai": "^0.0.23", + "@langchain/openai": "^0.1.0", "axios": "^1.7.2", "compute-cosine-similarity": "^1.1.0", "confluence.js": "^1.7.4", - "debug": "^4.3.4", + "csv-parse": "^5.5.6", + "debug": "^4.3.5", "html-to-text": "^9.0.5", - "langchain": "^0.2.3", + "langchain": "^0.2.4", "md5": "^2.3.0", + "mime": "^4.0.3", "office-text-extractor": "^3.0.3", "sitemapper": "^3.1.8", "stream-mmmagic": "^2.3.0", @@ -39,7 +41,7 @@ "@types/html-to-text": "^9.0.4", "@types/jest": "29.5.12", "@types/md5": "^2.3.5", - "@types/node": "^20.12.13", + "@types/node": "^20.13.0", "@types/usetube": "^2.1.2", "@typescript-eslint/eslint-plugin": "^7.11.0", "@typescript-eslint/parser": "^7.11.0", @@ -1201,6 +1203,17 @@ "node": ">=14.0.0" } }, + "node_modules/@aws-sdk/signature-v4/node_modules/@smithy/eventstream-codec": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@smithy/eventstream-codec/-/eventstream-codec-1.1.0.tgz", + "integrity": "sha512-3tEbUb8t8an226jKB6V/Q2XU/J53lCwCzULuBPEaF4JjSh+FlCMp7TmogE/Aij5J9DwlsZ4VAD/IRDuQ/0ZtMw==", + "dependencies": { + "@aws-crypto/crc32": "3.0.0", + "@smithy/types": "^1.2.0", + "@smithy/util-hex-encoding": "^1.1.0", + "tslib": "^2.5.0" + } + }, 
"node_modules/@aws-sdk/signature-v4/node_modules/@smithy/is-array-buffer": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/@smithy/is-array-buffer/-/is-array-buffer-1.1.0.tgz", @@ -3341,10 +3354,254 @@ } } }, + "node_modules/@langchain/community/node_modules/@langchain/openai": { + "version": "0.0.34", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.0.34.tgz", + "integrity": "sha512-M+CW4oXle5fdoz2T2SwdOef8pl3/1XmUx1vjn2mXUVM/128aO0l23FMF0SNBsAbRV6P+p/TuzjodchJbi0Ht/A==", + "dependencies": { + "@langchain/core": ">0.1.56 <0.3.0", + "js-tiktoken": "^1.0.12", + "openai": "^4.41.1", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@langchain/community/node_modules/langchain": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.2.3.tgz", + "integrity": "sha512-T9xR7zd+Nj0oXy6WoYKmZLy0DlQiDLFPGYWdOXDxy+AvqlujoPdVQgDSpdqiOHvAjezrByAoKxoHCz5XMwTP/Q==", + "dependencies": { + "@langchain/core": "~0.2.0", + "@langchain/openai": "~0.0.28", + "@langchain/textsplitters": "~0.0.0", + "binary-extensions": "^2.2.0", + "js-tiktoken": "^1.0.12", + "js-yaml": "^4.1.0", + "jsonpointer": "^5.0.1", + "langchainhub": "~0.0.8", + "langsmith": "~0.1.7", + "ml-distance": "^4.0.0", + "openapi-types": "^12.1.3", + "p-retry": "4", + "uuid": "^9.0.0", + "yaml": "^2.2.1", + "zod": "^3.22.4", + "zod-to-json-schema": "^3.22.3" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@aws-sdk/client-s3": "^3.310.0", + "@aws-sdk/client-sagemaker-runtime": "^3.310.0", + "@aws-sdk/client-sfn": "^3.310.0", + "@aws-sdk/credential-provider-node": "^3.388.0", + "@azure/storage-blob": "^12.15.0", + "@browserbasehq/sdk": "*", + "@gomomento/sdk": "^1.51.1", + "@gomomento/sdk-core": "^1.51.1", + "@gomomento/sdk-web": "^1.51.1", + "@mendable/firecrawl-js": "^0.0.13", + "@notionhq/client": "^2.2.10", + "@pinecone-database/pinecone": "*", + 
"@supabase/supabase-js": "^2.10.0", + "@vercel/kv": "^0.2.3", + "@xata.io/client": "^0.28.0", + "apify-client": "^2.7.1", + "assemblyai": "^4.0.0", + "axios": "*", + "cheerio": "^1.0.0-rc.12", + "chromadb": "*", + "convex": "^1.3.1", + "couchbase": "^4.3.0", + "d3-dsv": "^2.0.0", + "epub2": "^3.0.1", + "fast-xml-parser": "*", + "handlebars": "^4.7.8", + "html-to-text": "^9.0.5", + "ignore": "^5.2.0", + "ioredis": "^5.3.2", + "jsdom": "*", + "mammoth": "^1.6.0", + "mongodb": ">=5.2.0", + "node-llama-cpp": "*", + "notion-to-md": "^3.1.0", + "officeparser": "^4.0.4", + "pdf-parse": "1.1.1", + "peggy": "^3.0.2", + "playwright": "^1.32.1", + "puppeteer": "^19.7.2", + "pyodide": "^0.24.1", + "redis": "^4.6.4", + "sonix-speech-recognition": "^2.1.1", + "srt-parser-2": "^1.2.3", + "typeorm": "^0.3.12", + "weaviate-ts-client": "*", + "web-auth-library": "^1.0.3", + "ws": "^8.14.2", + "youtube-transcript": "^1.0.6", + "youtubei.js": "^9.1.0" + }, + "peerDependenciesMeta": { + "@aws-sdk/client-s3": { + "optional": true + }, + "@aws-sdk/client-sagemaker-runtime": { + "optional": true + }, + "@aws-sdk/client-sfn": { + "optional": true + }, + "@aws-sdk/credential-provider-node": { + "optional": true + }, + "@azure/storage-blob": { + "optional": true + }, + "@browserbasehq/sdk": { + "optional": true + }, + "@gomomento/sdk": { + "optional": true + }, + "@gomomento/sdk-core": { + "optional": true + }, + "@gomomento/sdk-web": { + "optional": true + }, + "@mendable/firecrawl-js": { + "optional": true + }, + "@notionhq/client": { + "optional": true + }, + "@pinecone-database/pinecone": { + "optional": true + }, + "@supabase/supabase-js": { + "optional": true + }, + "@vercel/kv": { + "optional": true + }, + "@xata.io/client": { + "optional": true + }, + "apify-client": { + "optional": true + }, + "assemblyai": { + "optional": true + }, + "axios": { + "optional": true + }, + "cheerio": { + "optional": true + }, + "chromadb": { + "optional": true + }, + "convex": { + "optional": true + 
}, + "couchbase": { + "optional": true + }, + "d3-dsv": { + "optional": true + }, + "epub2": { + "optional": true + }, + "faiss-node": { + "optional": true + }, + "fast-xml-parser": { + "optional": true + }, + "handlebars": { + "optional": true + }, + "html-to-text": { + "optional": true + }, + "ignore": { + "optional": true + }, + "ioredis": { + "optional": true + }, + "jsdom": { + "optional": true + }, + "mammoth": { + "optional": true + }, + "mongodb": { + "optional": true + }, + "node-llama-cpp": { + "optional": true + }, + "notion-to-md": { + "optional": true + }, + "officeparser": { + "optional": true + }, + "pdf-parse": { + "optional": true + }, + "peggy": { + "optional": true + }, + "playwright": { + "optional": true + }, + "puppeteer": { + "optional": true + }, + "pyodide": { + "optional": true + }, + "redis": { + "optional": true + }, + "sonix-speech-recognition": { + "optional": true + }, + "srt-parser-2": { + "optional": true + }, + "typeorm": { + "optional": true + }, + "weaviate-ts-client": { + "optional": true + }, + "web-auth-library": { + "optional": true + }, + "ws": { + "optional": true + }, + "youtube-transcript": { + "optional": true + }, + "youtubei.js": { + "optional": true + } + } + }, "node_modules/@langchain/core": { - "version": "0.2.4", - "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.4.tgz", - "integrity": "sha512-xsXMcZ5Sj9amlXFNFNb2cYxnjDrw9nWATiTkquFAWzrw96uHxzoapBZeCfkbnrWXiUBxg9rLWO8Bs1wTJDwVPQ==", + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/@langchain/core/-/core-0.2.5.tgz", + "integrity": "sha512-tMaKRFVewFn8crQwlbXGjT7hlMdX1yXHap1ebBx7Bb2C3C9AeZ+sXbX11m27yamypNlVVegwUcisw3YCaDkZJA==", "dependencies": { "ansi-styles": "^5.0.0", "camelcase": "6", @@ -3483,12 +3740,12 @@ } }, "node_modules/@langchain/mistralai": { - "version": "0.0.22", - "resolved": "https://registry.npmjs.org/@langchain/mistralai/-/mistralai-0.0.22.tgz", - "integrity": 
"sha512-cZ+HBz32Gq2zjkXcDs5hDnr8W0QxlRXhWAXu9zepwzLNe3aXxAD/lhz9l654o2fo89bqnuhtYgbGx8ZVYcDvCw==", + "version": "0.0.23", + "resolved": "https://registry.npmjs.org/@langchain/mistralai/-/mistralai-0.0.23.tgz", + "integrity": "sha512-0huhT3KXqrD1u20e2NWpBlprBoNidw4Q4hjBI9DySCb1Gx4wSIV6y8kswsWRfM1VDBZvJcCVLeVuOSAyShoFug==", "dependencies": { "@langchain/core": ">0.1.56 <0.3.0", - "@mistralai/mistralai": "^0.1.3", + "@mistralai/mistralai": "^0.4.0", "uuid": "^9.0.0", "zod": "^3.22.4", "zod-to-json-schema": "^3.22.4" @@ -3498,11 +3755,11 @@ } }, "node_modules/@langchain/openai": { - "version": "0.0.34", - "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.0.34.tgz", - "integrity": "sha512-M+CW4oXle5fdoz2T2SwdOef8pl3/1XmUx1vjn2mXUVM/128aO0l23FMF0SNBsAbRV6P+p/TuzjodchJbi0Ht/A==", + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/@langchain/openai/-/openai-0.1.0.tgz", + "integrity": "sha512-jm7U9oxXQ2N03q3+S9CzEAmMJaL2FqdAi4bOYdEBS0aAWAU29so35ZOs5i2uu4W29mK9oV9XS/4A5ggR1gOLEA==", "dependencies": { - "@langchain/core": ">0.1.56 <0.3.0", + "@langchain/core": ">=0.2.5 <0.3.0", "js-tiktoken": "^1.0.12", "openai": "^4.41.1", "zod": "^3.22.4", @@ -3609,9 +3866,9 @@ ] }, "node_modules/@mistralai/mistralai": { - "version": "0.1.3", - "resolved": "https://registry.npmjs.org/@mistralai/mistralai/-/mistralai-0.1.3.tgz", - "integrity": "sha512-WUHxC2xdeqX9PTXJEqdiNY54vT2ir72WSJrZTTBKRnkfhX6zIfCYA24faRlWjUB5WTpn+wfdGsTMl3ArijlXFA==", + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/@mistralai/mistralai/-/mistralai-0.4.0.tgz", + "integrity": "sha512-KmFzNro1RKxIFh19J3osmUQhucefBBauMXN5fa9doG6dT9OHR/moBvvn+riVlR7c0AVfuxO8Dfa03AyLYYzbyg==", "dependencies": { "node-fetch": "^2.6.7" } @@ -3983,33 +4240,26 @@ } }, "node_modules/@smithy/eventstream-codec": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@smithy/eventstream-codec/-/eventstream-codec-1.1.0.tgz", - "integrity": 
"sha512-3tEbUb8t8an226jKB6V/Q2XU/J53lCwCzULuBPEaF4JjSh+FlCMp7TmogE/Aij5J9DwlsZ4VAD/IRDuQ/0ZtMw==", + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/@smithy/eventstream-codec/-/eventstream-codec-2.2.0.tgz", + "integrity": "sha512-8janZoJw85nJmQZc4L8TuePp2pk1nxLgkxIR0TUjKJ5Dkj5oelB9WtiSSGXCQvNsJl0VSTvK/2ueMXxvpa9GVw==", + "optional": true, + "peer": true, "dependencies": { "@aws-crypto/crc32": "3.0.0", - "@smithy/types": "^1.2.0", - "@smithy/util-hex-encoding": "^1.1.0", - "tslib": "^2.5.0" + "@smithy/types": "^2.12.0", + "@smithy/util-hex-encoding": "^2.2.0", + "tslib": "^2.6.2" } }, "node_modules/@smithy/eventstream-codec/node_modules/@smithy/types": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@smithy/types/-/types-1.2.0.tgz", - "integrity": "sha512-z1r00TvBqF3dh4aHhya7nz1HhvCg4TRmw51fjMrh5do3h+ngSstt/yKlNbHeb9QxJmFbmN8KEVSWgb1bRvfEoA==", - "dependencies": { - "tslib": "^2.5.0" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@smithy/eventstream-codec/node_modules/@smithy/util-hex-encoding": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@smithy/util-hex-encoding/-/util-hex-encoding-1.1.0.tgz", - "integrity": "sha512-7UtIE9eH0u41zpB60Jzr0oNCQ3hMJUabMcKRUVjmyHTXiWDE4vjSqN6qlih7rCNeKGbioS7f/y2Jgym4QZcKFg==", + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/@smithy/types/-/types-2.12.0.tgz", + "integrity": "sha512-QwYgloJ0sVNBeBuBs65cIkTbfzV/Q6ZNPCJ99EICFEdJYG50nGIY/uYXp+TbsdJReIuPr0a0kXmCvren3MbRRw==", + "optional": true, + "peer": true, "dependencies": { - "tslib": "^2.5.0" + "tslib": "^2.6.2" }, "engines": { "node": ">=14.0.0" @@ -4864,9 +5114,9 @@ "dev": true }, "node_modules/@types/node": { - "version": "20.12.13", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.13.tgz", - "integrity": "sha512-gBGeanV41c1L171rR7wjbMiEpEI/l5XFQdLLfhr/REwpgDy/4U8y89+i8kRiLzDyZdOkXh+cRaTetUnCYutoXA==", + "version": "20.13.0", + "resolved": 
"https://registry.npmjs.org/@types/node/-/node-20.13.0.tgz", + "integrity": "sha512-FM6AOb3khNkNIXPnHFDYaHerSv8uN22C91z098AnGccVu+Pcdhi+pNUFDi0iLmPIsVE0JBD0KVS7mzUYt4nRzQ==", "dependencies": { "undici-types": "~5.26.4" } @@ -6320,15 +6570,20 @@ "node": "*" } }, + "node_modules/csv-parse": { + "version": "5.5.6", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-5.5.6.tgz", + "integrity": "sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==" + }, "node_modules/dayjs": { "version": "1.11.10", "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.10.tgz", "integrity": "sha512-vjAczensTgRcqDERK0SR2XMwsF/tSvnvlv6VcF2GIhg6Sx4yOIt/irsr1RDJsKiIyBzJDpCoXiWWq28MqH2cnQ==" }, "node_modules/debug": { - "version": "4.3.4", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", - "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "version": "4.3.5", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.5.tgz", + "integrity": "sha512-pt0bNEmneDIvdL1Xsd9oDQ/wrQRkXDT4AUWlNZNPKvW5x/jyO9VFXkJUP07vQ2upmw5PlaITaPKc31jK13V+jg==", "dependencies": { "ms": "2.1.2" }, @@ -9052,19 +9307,19 @@ } }, "node_modules/langchain": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.2.3.tgz", - "integrity": "sha512-T9xR7zd+Nj0oXy6WoYKmZLy0DlQiDLFPGYWdOXDxy+AvqlujoPdVQgDSpdqiOHvAjezrByAoKxoHCz5XMwTP/Q==", + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/langchain/-/langchain-0.2.4.tgz", + "integrity": "sha512-zBsBuNREn/3IlWvIQqhQ2iqf6JJhyjjsB1Db/keDkcgThPI3EcblC1pqAXU2BIKHmpNUkHBR2bAUok5+xtgOcw==", "dependencies": { "@langchain/core": "~0.2.0", - "@langchain/openai": "~0.0.28", + "@langchain/openai": "~0.1.0", "@langchain/textsplitters": "~0.0.0", "binary-extensions": "^2.2.0", "js-tiktoken": "^1.0.12", "js-yaml": "^4.1.0", "jsonpointer": "^5.0.1", "langchainhub": "~0.0.8", - "langsmith": "~0.1.7", + 
"langsmith": "~0.1.30", "ml-distance": "^4.0.0", "openapi-types": "^12.1.3", "p-retry": "4", @@ -9592,6 +9847,20 @@ "node": ">=8.6" } }, + "node_modules/mime": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/mime/-/mime-4.0.3.tgz", + "integrity": "sha512-KgUb15Oorc0NEKPbvfa0wRU+PItIEZmiv+pyAO2i0oTIVTJhlzMclU7w4RXWQrSOVH5ax/p/CkIO7KI4OyFJTQ==", + "funding": [ + "https://github.com/sponsors/broofa" + ], + "bin": { + "mime": "bin/cli.js" + }, + "engines": { + "node": ">=16" + } + }, "node_modules/mime-db": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", diff --git a/package.json b/package.json index 4d1862bc..8f49e846 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@llm-tools/embedjs", - "version": "0.0.81", + "version": "0.0.82", "description": "A NodeJS RAG framework to easily work with LLMs and custom datasets", "main": "dist/index.js", "types": "dist/index.d.ts", @@ -59,17 +59,19 @@ "@langchain/anthropic": "^0.1.21", "@langchain/cohere": "^0.0.10", "@langchain/community": "^0.2.4", - "@langchain/core": "^0.2.4", + "@langchain/core": "^0.2.5", "@langchain/google-vertexai": "^0.0.17", - "@langchain/mistralai": "^0.0.22", - "@langchain/openai": "^0.0.34", + "@langchain/mistralai": "^0.0.23", + "@langchain/openai": "^0.1.0", "axios": "^1.7.2", "compute-cosine-similarity": "^1.1.0", "confluence.js": "^1.7.4", - "debug": "^4.3.4", + "csv-parse": "^5.5.6", + "debug": "^4.3.5", "html-to-text": "^9.0.5", - "langchain": "^0.2.3", + "langchain": "^0.2.4", "md5": "^2.3.0", + "mime": "^4.0.3", "office-text-extractor": "^3.0.3", "sitemapper": "^3.1.8", "stream-mmmagic": "^2.3.0", @@ -85,7 +87,7 @@ "@types/html-to-text": "^9.0.4", "@types/jest": "29.5.12", "@types/md5": "^2.3.5", - "@types/node": "^20.12.13", + "@types/node": "^20.13.0", "@types/usetube": "^2.1.2", "@typescript-eslint/eslint-plugin": "^7.11.0", "@typescript-eslint/parser": "^7.11.0", diff --git 
a/src/core/dynamic-loader-selector.ts b/src/core/dynamic-loader-selector.ts index 8d772caf..6e2acd92 100644 --- a/src/core/dynamic-loader-selector.ts +++ b/src/core/dynamic-loader-selector.ts @@ -18,6 +18,7 @@ import { BaseLoader } from '../interfaces/base-loader.js'; import { JsonLoader } from '../loaders/json-loader.js'; import { UrlLoader } from '../loaders/url-loader.js'; import { LocalPathLoader } from '../loaders/local-path-loader.js'; +import { CsvLoader } from '../loaders/csv-loader.js'; export type LoaderParam = | string @@ -35,7 +36,8 @@ export type LoaderParam = | ({ type: 'Youtube' } & ConstructorParameters[0]) | ({ type: 'YoutubeSearch' } & ConstructorParameters[0]) | ({ type: 'LocalPath' } & ConstructorParameters[0]) - | ({ type: 'Url' } & ConstructorParameters[0]); + | ({ type: 'Url' } & ConstructorParameters[0]) + | ({ type: 'Csv' } & ConstructorParameters[0]); /** * This class generates different types of loaders based on a string input. @@ -123,6 +125,8 @@ export class DynamicLoader { return new LocalPathLoader(loader); case 'Url': return new UrlLoader(loader); + case 'Csv': + return new CsvLoader(loader); default: throw new SyntaxError(`Unknown loader type ${(loader).type}`); }
diff --git a/src/loaders/csv-loader.ts b/src/loaders/csv-loader.ts
new file mode 100644
index 00000000..16c7a682
--- /dev/null
+++ b/src/loaders/csv-loader.ts
@@ -0,0 +1,80 @@
+import { parse, Options as CsvParseOptions } from 'csv-parse';
+import createDebugMessages from 'debug';
+import axios from 'axios';
+import fs from 'node:fs';
+import md5 from 'md5';
+
+import { BaseLoader } from '../interfaces/base-loader.js';
+import { cleanString, isValidURL } from '../util/strings.js';
+
+/**
+ * Loads a CSV document from a local file path or a URL and yields one chunk per parsed record.
+ *
+ * The raw byte stream of either source is piped through `csv-parse`; `csvParseOptions` is
+ * handed to the parser unchanged.
+ */
+export class CsvLoader extends BaseLoader<{ type: 'CsvLoader' }> {
+    private readonly debug = createDebugMessages('embedjs:loader:CsvLoader');
+    private readonly csvParseOptions?: CsvParseOptions;
+    private readonly filePathOrUrl: string;
+    private readonly isUrl: boolean;
+
+    /**
+     * @param options.filePathOrUrl - Local path or URL of the CSV document.
+     * @param options.csvParseOptions - Optional settings handed to the `csv-parse` parser.
+     * @param options.chunkSize - Passed to the `BaseLoader` constructor; defaults to 1000.
+     * @param options.chunkOverlap - Passed to the `BaseLoader` constructor; defaults to 0.
+     */
+    constructor({
+        filePathOrUrl,
+        csvParseOptions,
+        chunkOverlap,
+        chunkSize,
+    }: {
+        filePathOrUrl: string;
+        csvParseOptions?: CsvParseOptions;
+        chunkSize?: number;
+        chunkOverlap?: number;
+    }) {
+        super(`CsvLoader_${md5(filePathOrUrl)}`, { filePathOrUrl }, chunkSize ?? 1000, chunkOverlap ?? 0);
+
+        this.filePathOrUrl = filePathOrUrl;
+        this.isUrl = isValidURL(filePathOrUrl) ? true : false;
+        this.csvParseOptions = csvParseOptions;
+    }
+
+    /**
+     * Streams the source through the CSV parser and yields one chunk per record.
+     *
+     * @returns An async generator of `{ pageContent, metadata }` chunks, one per record.
+     */
+    override async *getUnfilteredChunks() {
+        // Both sources deliver raw bytes, so both must go through the CSV parser below.
+        const source: NodeJS.ReadableStream = this.isUrl
+            ? (await axios.get(this.filePathOrUrl, { responseType: 'stream' })).data
+            : fs.createReadStream(this.filePathOrUrl);
+
+        const parser = parse(this.csvParseOptions);
+        // `pipe` does not forward source errors (missing file, dropped connection). Destroying
+        // the parser with that error makes the `for await` below reject instead.
+        source.once('error', (error: Error) => parser.destroy(error));
+        source.pipe(parser);
+        this.debug('CsvParser stream created');
+
+        let i = 0;
+        for await (const record of parser) {
+            // Records are arrays by default but objects when options such as `columns` are set.
+            const fields: unknown[] = Array.isArray(record) ? record : Object.values(record);
+            yield {
+                pageContent: cleanString(fields.join(',')),
+                metadata: {
+                    type: 'CsvLoader' as const,
+                    source: this.filePathOrUrl,
+                },
+            };
+            i++;
+        }
+
+        this.debug(`CsvParser for filePathOrUrl '${this.filePathOrUrl}' resulted in ${i} entries`);
+    }
+}
diff --git a/src/util/mime.ts b/src/util/mime.ts index 7105aab8..dd53b94a 100644 --- a/src/util/mime.ts +++ b/src/util/mime.ts @@ -1,3 +1,6 @@ +import mime from 'mime'; +import createDebugMessages from 'debug'; + import { BaseLoader } from '../interfaces/base-loader.js'; import { DocxLoader } from '../loaders/docx-loader.js'; import { ExcelLoader } from '../loaders/excel-loader.js'; @@ -6,6 +9,7 @@ import { PptLoader } from '../loaders/ppt-loader.js'; import { SitemapLoader } from '../loaders/sitemap-loader.js'; import { TextLoader } from '../loaders/text-loader.js'; import { WebLoader } from '../loaders/web-loader.js'; +import { CsvLoader } from '../loaders/csv-loader.js'; export async function createLoaderFromMimeType(loader: string, mimeType: string): Promise { switch (mimeType) { @@ -20,7 +24,10 @@ export async function createLoaderFromMimeType(loader: string, mimeType: string) case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': return new
PptLoader({ filePathOrUrl: loader }); case 'text/plain': - return new TextLoader({ text: loader }); + const fineType = mime.getType(loader); + createDebugMessages('embedjs:createLoaderFromMimeType')(`Fine type for '${loader}' is '${fineType}'`); + if (fineType === 'text/csv') return new CsvLoader({ filePathOrUrl: loader }); + else return new TextLoader({ text: loader }); case 'text/html': return new WebLoader({ urlOrContent: loader }); case 'text/xml':