Skip to content

Commit

Permalink
Merge pull request #143 from llm-tools/loaders
Browse files Browse the repository at this point in the history
Dynamic loader fix
  • Loading branch information
adhityan authored Oct 27, 2024
2 parents f5964cb + 9f39bfe commit 3f2a1e5
Show file tree
Hide file tree
Showing 76 changed files with 1,253 additions and 1,347 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ jobs:
- name: Install dependencies
run: npm ci
- name: Test build
run: npm run build:esm
run: npm run build:ci
4 changes: 4 additions & 0 deletions core/embedjs-interfaces/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-interfaces to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-interfaces to align it with other projects, there were no code changes.
Expand Down
2 changes: 1 addition & 1 deletion core/embedjs-interfaces/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@llm-tools/embedjs-interfaces",
"version": "0.1.10",
"version": "0.1.11",
"description": "Interfaces for extending the embedjs ecosystem",
"dependencies": {
"@langchain/core": "^0.3.15",
Expand Down
4 changes: 4 additions & 0 deletions core/embedjs-utils/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-utils to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-utils to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions core/embedjs-utils/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-utils",
"version": "0.1.10",
"version": "0.1.11",
"description": "Useful util functions when extending the embedjs ecosystem",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10"
"@llm-tools/embedjs-interfaces": "0.1.11"
},
"type": "module",
"main": "./src/index.js",
Expand Down
6 changes: 6 additions & 0 deletions core/embedjs-utils/src/util/stream.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@ export async function stream2buffer(stream: Stream): Promise<Buffer> {
stream.on('error', (err) => reject(`error converting stream - ${err}`));
});
}

/**
 * Extracts the bare media type from an HTTP `Content-Type` header value.
 *
 * Everything from the first `;` onwards (parameters such as `charset` or
 * `boundary`) is dropped. The remaining media type is trimmed and lower-cased
 * so that values like `Text/HTML ; charset=UTF-8` normalize to `text/html`
 * and can be compared against lower-case MIME type constants.
 *
 * @param contentType - Raw `Content-Type` header value.
 * @returns The normalized media type. A falsy input (for example an empty
 * string) is returned unchanged so callers can detect a missing header.
 */
export function contentTypeToMimeType(contentType: string): string {
    // Nothing to parse; hand the falsy value back untouched.
    if (!contentType) return contentType;

    const separatorIndex = contentType.indexOf(';');
    const mediaType = separatorIndex === -1 ? contentType : contentType.slice(0, separatorIndex);

    // Optional whitespace around the media type is legal in the header, and
    // type/subtype tokens are case-insensitive, so normalize both.
    return mediaType.trim().toLowerCase();
}
5 changes: 5 additions & 0 deletions core/embedjs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.1.11 (2024-10-27)

- Improved URL Loader to better detect dynamic content mime type
- URL and local path loaders no longer throw an exception when a loader is missing; the error is written to the debug log instead.

## 0.1.10 (2024-10-25)

- The library now shows more informative errors when embedding model is not set
Expand Down
10 changes: 5 additions & 5 deletions core/embedjs/package.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"type": "module",
"name": "@llm-tools/embedjs",
"version": "0.1.10",
"version": "0.1.11",
"description": "A NodeJS RAG framework to easily work with LLMs and custom datasets",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-utils": "0.1.10",
"axios": "^1.7.7",
"@langchain/textsplitters": "^0.1.0",
"@llm-tools/embedjs-interfaces": "0.1.11",
"@llm-tools/embedjs-utils": "0.1.11",
"debug": "^4.3.7",
"langchain": "^0.3.4",
"md5": "^2.3.0",
Expand All @@ -16,7 +16,7 @@
"devDependencies": {
"@types/debug": "^4.1.12",
"@types/md5": "^2.3.5",
"@types/node": "^22.8.0"
"@types/node": "^22.8.1"
},
"main": "./src/index.js",
"license": "Apache-2.0",
Expand Down
20 changes: 12 additions & 8 deletions core/embedjs/src/loaders/local-path-loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ export class LocalPathLoader extends BaseLoader<{ type: 'LocalPathLoader' }> {
this.debug(`File '${this.path}' has mime type '${mime}'`);
stream.destroy();

const loader = await createLoaderFromMimeType(currentPath, mime);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
metadata: {
source: currentPath,
},
};
try {
const loader = await createLoaderFromMimeType(currentPath, mime);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
metadata: {
source: currentPath,
},
};
}
} catch (err) {
this.debug(`Error creating loader for mime type '${mime}'`, err);
}
} else {
const files = fs.readdirSync(currentPath);
Expand Down
2 changes: 1 addition & 1 deletion core/embedjs/src/loaders/text-loader.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import md5 from 'md5';

import { BaseLoader } from '@llm-tools/embedjs-interfaces';
Expand Down
44 changes: 26 additions & 18 deletions core/embedjs/src/loaders/url-loader.ts
Original file line number Diff line number Diff line change
@@ -1,37 +1,45 @@
import { getMimeType } from 'stream-mime-type';
import createDebugMessages from 'debug';
import axios from 'axios';
import md5 from 'md5';

import { contentTypeToMimeType, truncateCenterString } from '@llm-tools/embedjs-utils';
import { BaseLoader } from '@llm-tools/embedjs-interfaces';
import { truncateCenterString } from '@llm-tools/embedjs-utils';
import { createLoaderFromMimeType } from '../util/mime.js';

export class UrlLoader extends BaseLoader<{ type: 'UrlLoader' }> {
private readonly debug = createDebugMessages('embedjs:loader:UrlLoader');
private readonly url: string;
private readonly url: URL;

constructor({ url }: { url: string }) {
super(`UrlLoader_${md5(url)}`, { url: truncateCenterString(url, 50) });
this.url = url;
this.url = new URL(url);
this.debug(`UrlLoader verified '${url}' is a valid URL!`);
}

override async *getUnfilteredChunks() {
this.debug('Loader is a valid URL!');
const stream = (await axios.get(this.url, { responseType: 'stream' })).data;
const { mime } = await getMimeType(stream);
this.debug(`Loader type detected as '${mime}'`);
stream.destroy();
const response = await fetch(this.url, { headers: { 'Accept-Encoding': '' } });
const stream = response.body as unknown as NodeJS.ReadableStream;
let { mime } = await getMimeType(stream, { strict: true });
this.debug(`Loader stream detected type '${mime}'`);

const loader = await createLoaderFromMimeType(this.url, mime);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
metadata: {
type: <const>'UrlLoader',
source: this.url,
},
};
if (!mime) {
mime = contentTypeToMimeType(response.headers.get('content-type'));
this.debug(`Using type '${mime}' from content-type header`);
}

try {
const loader = await createLoaderFromMimeType(this.url.href, mime);
for await (const result of await loader.getUnfilteredChunks()) {
yield {
pageContent: result.pageContent,
metadata: {
type: <const>'UrlLoader',
source: this.url.href,
},
};
}
} catch (err) {
this.debug(`Error creating loader for mime type '${mime}'`, err);
}
}
}
4 changes: 3 additions & 1 deletion core/embedjs/src/util/mime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,10 @@ export async function createLoaderFromMimeType(loader: string, mimeType: string)
if (await SitemapLoader.test(loader)) {
return new SitemapLoader({ url: loader });
}
throw new Error(`No processor found for generic xml`);
throw new Error(`No loader supported for generic xml`);
}
case undefined:
throw new Error(`MIME type could not be detected. Please file an issue if you think this is a bug.`);
default:
throw new Error(`Unknown mime type '${mimeType}'`);
}
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-astra/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-astra to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-astra to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-astra/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-astradb",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add AstraDB support to embedjs",
"dependencies": {
"@datastax/astra-db-ts": "^1.5.0",
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"debug": "^4.3.7"
},
"type": "module",
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-cosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-cosmos to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-cosmos to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-cosmos/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-cosmos",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add CosmosDB support to embedjs",
"dependencies": {
"@azure/cosmos": "^4.1.1",
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"debug": "^4.3.7"
},
"type": "module",
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-hnswlib/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-hnswlib to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-hnswlib to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-hnswlib/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-hnswlib",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add HNSWLib support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"debug": "^4.3.7",
"hnswlib-node": "^3.0.0"
},
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-lancedb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-lancedb to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-lancedb to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-lancedb/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
{
"name": "@llm-tools/embedjs-lancedb",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add LanceDb support to embedjs",
"dependencies": {
"@lancedb/lancedb": "^0.11.0",
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"compute-cosine-similarity": "^1.1.0"
},
"type": "module",
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-lmdb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-lmdb to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-lmdb to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-lmdb/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-lmdb",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add LMDB support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"lmdb": "^3.1.3"
},
"type": "module",
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-mongodb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-mongodb to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-mongodb to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-mongodb/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-mongodb",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add MongoDB support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"debug": "^4.3.7",
"mongodb": "^6.10.0"
},
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-pinecone/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-pinecone to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

- bumped pinecone library version to 4.x
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-pinecone/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-pinecone",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add Pinecone support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"@pinecone-database/pinecone": "^4.0.0",
"debug": "^4.3.7"
},
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-qdrant/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-qdrant to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-qdrant to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-qdrant/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-qdrant",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add Qdrant support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"@qdrant/js-client-rest": "^1.12.0",
"debug": "^4.3.7",
"uuid": "^10.0.0"
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-redis/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-redis to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-redis to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-redis/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-redis",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add Redis support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"ioredis": "^5.4.1"
},
"type": "module",
Expand Down
4 changes: 4 additions & 0 deletions databases/embedjs-weaviate/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.1.11 (2024-10-27)

This was a version bump only for embedjs-weaviate to align it with other projects, there were no code changes.

## 0.1.10 (2024-10-25)

This was a version bump only for embedjs-weaviate to align it with other projects, there were no code changes.
Expand Down
4 changes: 2 additions & 2 deletions databases/embedjs-weaviate/package.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{
"name": "@llm-tools/embedjs-weaviate",
"version": "0.1.10",
"version": "0.1.11",
"description": "Add Weaviate support to embedjs",
"dependencies": {
"@llm-tools/embedjs-interfaces": "0.1.10",
"@llm-tools/embedjs-interfaces": "0.1.11",
"compute-cosine-similarity": "^1.1.0",
"debug": "^4.3.7",
"weaviate-ts-client": "^2.2.0"
Expand Down
Loading

0 comments on commit 3f2a1e5

Please sign in to comment.