Skip to content

Commit

Permalink
Merge pull request #117 from llm-tools/monorepo
Browse files Browse the repository at this point in the history
Addition of new examples, re-add localPath and URL loaders, rename astra to astradb
  • Loading branch information
adhityan authored Oct 6, 2024
2 parents 2260734 + 7381cee commit 5a525f0
Show file tree
Hide file tree
Showing 18 changed files with 506 additions and 256 deletions.
1 change: 0 additions & 1 deletion core/embedjs-interfaces/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,5 @@ import { BaseModel } from './interfaces/base-model.js';
import { BaseConversation } from './interfaces/base-conversations.js';

export * from './types.js';
export * from './loaders.js';
export * from './constants.js';
export { BaseDb, BaseCache, BaseLoader, BaseEmbeddings, BaseModel, BaseConversation };
21 changes: 0 additions & 21 deletions core/embedjs-interfaces/src/loaders.ts

This file was deleted.

25 changes: 0 additions & 25 deletions core/embedjs-interfaces/src/util.ts

This file was deleted.

4 changes: 3 additions & 1 deletion core/embedjs/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { RAGApplication } from './core/rag-application.js';
import { RAGApplicationBuilder } from './core/rag-application-builder.js';
import { LocalPathLoader } from './loaders/local-path-loader.js';
import { TextLoader } from './loaders/text-loader.js';
import { JsonLoader } from './loaders/json-loader.js';
import { UrlLoader } from './loaders/url-loader.js';

export { RAGApplication, RAGApplicationBuilder, TextLoader, JsonLoader };
export { RAGApplication, RAGApplicationBuilder, TextLoader, JsonLoader, LocalPathLoader, UrlLoader };
62 changes: 62 additions & 0 deletions core/embedjs/src/loaders/local-path-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import { getMimeType } from 'stream-mime-type';
import createDebugMessages from 'debug';
import path from 'node:path';
import fs from 'node:fs';
import md5 from 'md5';

import { createLoaderFromMimeType } from '../util/mime.js';
import { BaseLoader, UnfilteredLoaderChunk } from '@llm-tools/embedjs-interfaces';

/**
 * Loader that ingests a single local file or, recursively, every file below a local directory.
 *
 * For each file the MIME type is detected with `getMimeType`, a matching loader is created via
 * `createLoaderFromMimeType`, and the chunks that loader yields are re-emitted with metadata
 * pointing at the file they came from and at the path this loader was constructed with.
 */
export class LocalPathLoader extends BaseLoader<{ type: 'LocalPathLoader' }> {
    private readonly debug = createDebugMessages('embedjs:loader:LocalPathLoader');
    private readonly path: string;

    /**
     * @param path - File or directory to load. The identifier passed to `BaseLoader` is derived
     * from the MD5 hash of this value.
     */
    constructor({ path }: { path: string }) {
        super(`LocalPathLoader_${md5(path)}`, { path });
        this.path = path;
    }

    /**
     * Yields every chunk found under the configured path, adding the loader type and the
     * originally configured path to each chunk's metadata.
     */
    override async *getUnfilteredChunks() {
        for await (const result of this.recursivelyAddPath(this.path)) {
            yield {
                ...result,
                metadata: {
                    ...result.metadata,
                    type: 'LocalPathLoader' as const,
                    originalPath: this.path,
                },
            };
        }
    }

    /**
     * Walks `currentPath` depth-first. Files are delegated to the loader matching their MIME
     * type; directories are expanded entry by entry.
     *
     * @param currentPath - File or directory currently being processed.
     */
    private async *recursivelyAddPath(currentPath: string): AsyncGenerator<UnfilteredLoaderChunk, void, void> {
        const isDir = fs.lstatSync(currentPath).isDirectory();
        this.debug(`Processing path '${currentPath}'. It is a ${isDir ? 'Directory!' : 'file...'}`);

        if (isDir) {
            const files = fs.readdirSync(currentPath);
            this.debug(`Dir '${currentPath}' has ${files.length} entries inside`, files);

            for (const file of files) {
                yield* this.recursivelyAddPath(path.resolve(currentPath, file));
            }
            return;
        }

        const mime = await this.detectMimeType(currentPath);
        // Report the file that was actually inspected, not the root path of the loader.
        this.debug(`File '${currentPath}' has mime type '${mime}'`);

        const loader = await createLoaderFromMimeType(currentPath, mime);
        for await (const result of await loader.getUnfilteredChunks()) {
            // Only the page content of the delegate loader is kept; its metadata is replaced
            // by the path of the file the chunk was read from.
            yield {
                pageContent: result.pageContent,
                metadata: {
                    source: currentPath,
                },
            };
        }
    }

    /**
     * Detects the MIME type of a file by handing a read stream of it to `getMimeType`.
     * The stream is destroyed in a `finally` block so it is released even when detection fails.
     *
     * @param filePath - Path of the file to inspect.
     * @returns The MIME type reported by `getMimeType`.
     */
    private async detectMimeType(filePath: string) {
        const stream = fs.createReadStream(filePath);
        try {
            const { mime } = await getMimeType(stream);
            return mime;
        } finally {
            stream.destroy();
        }
    }
}
37 changes: 37 additions & 0 deletions core/embedjs/src/loaders/url-loader.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { getMimeType } from 'stream-mime-type';
import createDebugMessages from 'debug';
import axios from 'axios';
import md5 from 'md5';

import { BaseLoader } from '@llm-tools/embedjs-interfaces';
import { truncateCenterString } from '@llm-tools/embedjs-utils';
import { createLoaderFromMimeType } from '../util/mime.js';

/**
 * Loader that ingests the document behind a URL.
 *
 * The URL is requested as a stream to detect its MIME type with `getMimeType`; a matching
 * loader is then created via `createLoaderFromMimeType` and its chunks are re-emitted with
 * metadata pointing back at the URL.
 */
export class UrlLoader extends BaseLoader<{ type: 'UrlLoader' }> {
    private readonly debug = createDebugMessages('embedjs:loader:UrlLoader');
    private readonly url: string;

    /**
     * @param url - Address of the document to load. The identifier passed to `BaseLoader` is
     * derived from the MD5 hash of the full URL; the second argument carries the URL shortened
     * with `truncateCenterString(url, 50)`.
     */
    constructor({ url }: { url: string }) {
        super(`UrlLoader_${md5(url)}`, { url: truncateCenterString(url, 50) });
        this.url = url;
    }

    /**
     * Yields the chunks produced by the loader matching the URL's MIME type. Only the page
     * content of the delegate loader is kept; metadata is replaced by the loader type and URL.
     */
    override async *getUnfilteredChunks() {
        this.debug('Loader is a valid URL!');
        const mime = await this.detectMimeType();
        this.debug(`Loader type detected as '${mime}'`);

        const loader = await createLoaderFromMimeType(this.url, mime);
        for await (const result of await loader.getUnfilteredChunks()) {
            yield {
                pageContent: result.pageContent,
                metadata: {
                    type: 'UrlLoader' as const,
                    source: this.url,
                },
            };
        }
    }

    /**
     * Requests the URL as a stream and detects its MIME type with `getMimeType`.
     * The response stream is destroyed in a `finally` block so it is released even when
     * detection fails.
     *
     * @returns The MIME type reported by `getMimeType`.
     */
    private async detectMimeType() {
        const stream = (await axios.get(this.url, { responseType: 'stream' })).data;
        try {
            const { mime } = await getMimeType(stream);
            return mime;
        } finally {
            stream.destroy();
        }
    }
}
74 changes: 74 additions & 0 deletions core/embedjs/src/util/mime.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import mime from 'mime';
import createDebugMessages from 'debug';
import { BaseLoader } from '@llm-tools/embedjs-interfaces';
import { TextLoader } from '../loaders/text-loader.js';

/**
 * Creates the loader responsible for a given MIME type.
 *
 * The loader packages are imported lazily so that only the packages for the formats actually
 * encountered need to be installed; a missing package is reported with an error naming it.
 *
 * @param loader - String handed to the created loader (used as `filePathOrUrl`, `urlOrContent`,
 * `url` or `text` depending on the loader selected).
 * @param mimeType - MIME type used to select the loader.
 * @returns The loader instance matching `mimeType`.
 * @throws Error when the package providing the required loader is not installed.
 * @throws SyntaxError when `mimeType` is not supported, or when it is `text/xml` and
 * `SitemapLoader.test` does not accept `loader`.
 */
export async function createLoaderFromMimeType(loader: string, mimeType: string): Promise<BaseLoader> {
    // Shared by every branch that ends up with CSV data.
    const createCsvLoader = async (): Promise<BaseLoader> => {
        const { CsvLoader } = await import('@llm-tools/embedjs-loader-csv').catch(() => {
            throw new Error('Package `@llm-tools/embedjs-loader-csv` needs to be installed to load csv files');
        });
        return new CsvLoader({ filePathOrUrl: loader });
    };

    switch (mimeType) {
        case 'application/msword':
        case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {
            const { DocxLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
                throw new Error(
                    'Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load docx files',
                );
            });
            return new DocxLoader({ filePathOrUrl: loader });
        }
        case 'application/vnd.ms-excel':
        case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': {
            const { ExcelLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
                throw new Error(
                    'Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load excel files',
                );
            });
            return new ExcelLoader({ filePathOrUrl: loader });
        }
        case 'application/pdf': {
            const { PdfLoader } = await import('@llm-tools/embedjs-loader-pdf').catch(() => {
                throw new Error('Package `@llm-tools/embedjs-loader-pdf` needs to be installed to load PDF files');
            });
            return new PdfLoader({ filePathOrUrl: loader });
        }
        case 'application/vnd.openxmlformats-officedocument.presentationml.presentation': {
            const { PptLoader } = await import('@llm-tools/embedjs-loader-msoffice').catch(() => {
                throw new Error(
                    'Package `@llm-tools/embedjs-loader-msoffice` needs to be installed to load pptx files',
                );
            });
            return new PptLoader({ filePathOrUrl: loader });
        }
        case 'text/plain': {
            // Plain text is too coarse to tell CSV apart, so refine using `mime.getType` on the
            // path/URL string itself.
            const fineType = mime.getType(loader);
            createDebugMessages('embedjs:createLoaderFromMimeType')(`Fine type for '${loader}' is '${fineType}'`);
            if (fineType === 'text/csv') {
                return createCsvLoader();
            }
            // NOTE(review): the `loader` string itself is passed as the text. If `loader` is a
            // file path or URL, its contents are not read here - confirm this is intended.
            return new TextLoader({ text: loader });
        }
        // `text/csv` is the registered media type for CSV; `application/csv` is kept for
        // backward compatibility.
        case 'text/csv':
        case 'application/csv':
            return createCsvLoader();
        case 'text/html': {
            const { WebLoader } = await import('@llm-tools/embedjs-loader-web').catch(() => {
                throw new Error('Package `@llm-tools/embedjs-loader-web` needs to be installed to load web documents');
            });
            return new WebLoader({ urlOrContent: loader });
        }
        case 'text/xml': {
            const { SitemapLoader } = await import('@llm-tools/embedjs-loader-sitemap').catch(() => {
                throw new Error('Package `@llm-tools/embedjs-loader-sitemap` needs to be installed to load sitemaps');
            });
            // XML is only supported when `SitemapLoader.test` accepts it.
            if (await SitemapLoader.test(loader)) {
                return new SitemapLoader({ url: loader });
            }
            throw new SyntaxError(`No processor found for generic xml`);
        }
        default:
            throw new SyntaxError(`Unknown mime type '${mimeType}'`);
    }
}
3 changes: 3 additions & 0 deletions examples/pinecone/eslint.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import baseConfig from '../../eslint.config.js';

// Export a copy of the configuration array imported from two directories up, without any
// additional entries for this example.
export default [...baseConfig];
27 changes: 27 additions & 0 deletions examples/pinecone/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions examples/pinecone/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "@llm-tools/embedjs-examples-pinecone",
"version": "0.1.1",
"type": "module",
"dependencies": {
"dotenv": "^16.4.5"
},
"scripts": {
"start": "nx run examples-pinecone:serve"
}
}
57 changes: 57 additions & 0 deletions examples/pinecone/project.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
{
"name": "examples-pinecone",
"$schema": "../../node_modules/nx/schemas/project-schema.json",
"sourceRoot": "examples/pinecone/src",
"projectType": "application",
"tags": [],
"targets": {
"build": {
"executor": "@nx/esbuild:esbuild",
"outputs": ["{options.outputPath}"],
"defaultConfiguration": "development",
"options": {
"platform": "node",
"outputPath": "dist/examples/pinecone",
"format": ["esm"],
"bundle": true,
"main": "examples/pinecone/src/main.ts",
"tsConfig": "examples/pinecone/tsconfig.app.json",
"generatePackageJson": false,
"esbuildOptions": {
"sourcemap": true,
"outExtension": {
".js": ".js"
}
}
},
"configurations": {
"development": {},
"production": {
"esbuildOptions": {
"sourcemap": false,
"outExtension": {
".js": ".js"
}
}
}
}
},
"serve": {
"executor": "@nx/js:node",
"defaultConfiguration": "development",
"dependsOn": ["build"],
"options": {
"buildTarget": "examples-pinecone:build",
"runBuildTargetDependencies": true
},
"configurations": {
"development": {
"buildTarget": "examples-pinecone:build:development"
},
"production": {
"buildTarget": "examples-pinecone:build:production"
}
}
}
}
}
22 changes: 22 additions & 0 deletions examples/pinecone/src/main.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { RAGApplicationBuilder } from '@llm-tools/embedjs';
import { PineconeDb } from '@llm-tools/embedjs-pinecone';
// Import the loader through its published package name rather than reaching into another
// package's source tree, consistent with the other `@llm-tools/*` imports.
import { WebLoader } from '@llm-tools/embedjs-loader-web';

// Build a RAG application that stores its vectors in Pinecone.
const llmApplication = await new RAGApplicationBuilder()
    .setVectorDb(
        new PineconeDb({
            projectName: 'test',
            namespace: 'dev',
            indexSpec: {
                pod: {
                    podType: 'p1.x1',
                    environment: 'us-east1-gcp',
                },
            },
        }),
    )
    .build();

// Add a web page to the application through the web loader.
await llmApplication.addLoader(new WebLoader({ urlOrContent: 'https://en.wikipedia.org/wiki/Tesla,_Inc.' }));

// Ask a question and print the result of the query.
console.log(await llmApplication.query('Who founded Tesla?'));
9 changes: 9 additions & 0 deletions examples/pinecone/tsconfig.app.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
"outDir": "../../dist/out-tsc",
"types": ["node"]
},
"exclude": ["jest.config.ts", "src/**/*.spec.ts", "src/**/*.test.ts"],
"include": ["src/**/*.ts"]
}
17 changes: 17 additions & 0 deletions examples/pinecone/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"extends": "../../tsconfig.base.json",
"files": [],
"include": [],
"references": [
{
"path": "./tsconfig.app.json"
}
],
"compilerOptions": {
"esModuleInterop": true,
"target": "ES2022",
"lib": ["ES2022", "ES2022.Object"],
"module": "NodeNext",
"moduleResolution": "nodenext"
}
}
Loading

0 comments on commit 5a525f0

Please sign in to comment.