forked from langchain-ai/langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
langchain[minor]: Firecrawl Document Loader (langchain-ai#5180)
* Nick: init * Update firecrawl.ts * Nick: * Nick: * Update package.json * Nick: fixes docs * Update yarn.lock * Update examples/src/document_loaders/firecrawl.ts Co-authored-by: Brace Sproul <[email protected]> * Update langchain/src/document_loaders/web/firecrawl.ts Co-authored-by: Brace Sproul <[email protected]> * Nick: fixes * Update yarn.lock * Fix yarn.lock * lint & format * Update firecrawl.ts * Add entrypoint --------- Co-authored-by: Brace Sproul <[email protected]> Co-authored-by: Jacob Lee <[email protected]>
- Loading branch information
1 parent
e77fcec
commit e0da231
Showing
9 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
38 changes: 38 additions & 0 deletions
38
docs/core_docs/docs/integrations/document_loaders/web_loaders/firecrawl.mdx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
--- | ||
hide_table_of_contents: true | ||
--- | ||
|
||
# Firecrawl | ||
|
||
This guide shows how to use [Firecrawl](https://firecrawl.dev) with LangChain to load web data into an LLM-ready format. | ||
|
||
## Overview | ||
|
||
[FireCrawl](https://firecrawl.dev) crawls and converts any website into LLM-ready data. It crawls all accessible subpages and gives you clean markdown and metadata for each. No sitemap required. | ||
|
||
FireCrawl handles complex tasks such as reverse proxies, caching, rate limits, and content blocked by JavaScript. Built by the [mendable.ai](https://mendable.ai) team. | ||
|
||
This guide shows how to scrape and crawl entire websites and load them using the `FireCrawlLoader` in LangChain. | ||
|
||
## Setup | ||
|
||
Sign up and get your free [FireCrawl API key](https://firecrawl.dev) to start. FireCrawl offers 100 free credits to get you started, and it's [open-source](https://github.com/mendableai/firecrawl) in case you want to self-host. | ||
|
||
## Usage | ||
|
||
Here's an example of how to use the `FireCrawlLoader` to load web pages as documents: | ||
|
||
Firecrawl offers 2 modes: `scrape` and `crawl`. In `scrape` mode, Firecrawl will only scrape the page you provide. In `crawl` mode, Firecrawl will crawl the entire website. | ||
|
||
import CodeBlock from "@theme/CodeBlock"; | ||
import Example from "@examples/document_loaders/firecrawl.ts"; | ||
|
||
```bash npm2yarn | ||
npm install @mendable/firecrawl-js | ||
``` | ||
|
||
<CodeBlock language="typescript">{Example}</CodeBlock> | ||
|
||
### Additional Parameters | ||
|
||
For `params` you can pass any of the params according to the [Firecrawl documentation](https://docs.firecrawl.dev). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import { FireCrawlLoader } from "langchain/document_loaders/web/firecrawl"; | ||
|
||
// Configure a loader for a single Firecrawl request.
const loader = new FireCrawlLoader({
  // The URL to scrape
  url: "https://firecrawl.dev",
  // Optional, defaults to `FIRECRAWL_API_KEY` in your env.
  apiKey: process.env.FIRECRAWL_API_KEY,
  // The mode to run the crawler in. Can be "scrape" for single urls or "crawl" for all accessible subpages
  mode: "scrape",
  params: {
    // optional parameters based on Firecrawl API docs
    // For API documentation, visit https://docs.firecrawl.dev
  },
});

// Run the request and collect the resulting documents.
const docs = await loader.load();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
34 changes: 34 additions & 0 deletions
34
langchain/src/document_loaders/tests/firecrawl.int.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* eslint-disable no-process-env */ | ||
/* eslint-disable @typescript-eslint/no-non-null-assertion */ | ||
import { test } from "@jest/globals"; | ||
import { Document } from "@langchain/core/documents"; | ||
import { FireCrawlLoader } from "../web/firecrawl.js"; | ||
|
||
// Integration test: scraping a single URL should yield exactly one populated Document.
test("Test FireCrawlLoader load method with scrape mode", async () => {
  const scrapeOptions = {
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "scrape" as const,
  };
  const loader = new FireCrawlLoader(scrapeOptions);

  const documents = await loader.load();
  expect(documents).toHaveLength(1);

  // Only one document is expected, so inspect the first (and only) entry.
  const [document] = documents;
  expect(document).toBeInstanceOf(Document);
  expect(document.pageContent).toBeTruthy();
  expect(document.metadata).toBeTruthy();
});
|
||
// Integration test: crawling a site should yield at least one populated Document.
test("Test FireCrawlLoader load method with crawl mode", async () => {
  const loader = new FireCrawlLoader({
    url: "https://firecrawl.dev",
    apiKey: process.env.FIRECRAWL_API_KEY,
    mode: "crawl",
  });

  const documents = await loader.load();
  // Assert the crawl produced something before indexing into the result, so
  // an empty crawl fails with a clear message instead of "undefined is not
  // an instance of Document".
  expect(documents.length).toBeGreaterThan(0);
  const document = documents[0];
  expect(document).toBeInstanceOf(Document);
  expect(document.pageContent).toBeTruthy();
  expect(document.metadata).toBeTruthy();
}, 15000);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import FirecrawlApp from "@mendable/firecrawl-js"; | ||
import { Document, type DocumentInterface } from "@langchain/core/documents"; | ||
import { getEnvironmentVariable } from "@langchain/core/utils/env"; | ||
import { BaseDocumentLoader } from "../base.js"; | ||
|
||
/**
 * Interface representing the parameters for the Firecrawl loader. It
 * includes the URL to scrape or crawl, the API key, the mode of operation,
 * and any extra request parameters.
 */
interface FirecrawlLoaderParameters {
  /**
   * URL to scrape or crawl
   */
  url: string;

  /**
   * API key for Firecrawl. If not provided, the default value is the value of the FIRECRAWL_API_KEY environment variable.
   */
  apiKey?: string;

  /**
   * Mode of operation. Can be either "crawl" or "scrape". If not provided, the default value is "crawl".
   */
  mode?: "crawl" | "scrape";

  /**
   * Additional parameters handed unchanged to the Firecrawl client's scrape
   * or crawl call. See https://docs.firecrawl.dev for the supported options.
   */
  params?: Record<string, unknown>;
}
/**
 * Shape of a single document as read from a Firecrawl response. Both fields
 * are optional because the loader substitutes an empty string / empty object
 * when either one is missing.
 */
interface FirecrawlDocument {
  /** Page content rendered as markdown. */
  markdown?: string;
  /** Metadata attached to the page. */
  metadata?: Record<string, unknown>;
}
|
||
/**
 * Class representing a document loader for loading data from
 * Firecrawl (firecrawl.dev). It extends the BaseDocumentLoader class.
 *
 * In "scrape" mode a single URL is fetched and one Document is returned; in
 * "crawl" mode (the default) one Document is returned per entry in the crawl
 * response.
 * @example
 * ```typescript
 * const loader = new FireCrawlLoader({
 *   url: "{url}",
 *   apiKey: "{apiKey}",
 *   mode: "crawl"
 * });
 * const docs = await loader.load();
 * ```
 */
export class FireCrawlLoader extends BaseDocumentLoader {
  private readonly apiKey: string;

  private readonly url: string;

  private readonly mode: "crawl" | "scrape";

  private readonly params?: Record<string, unknown>;

  /**
   * @param loaderParams Loader configuration. `apiKey` falls back to the
   * FIRECRAWL_API_KEY environment variable and `mode` falls back to "crawl".
   * @throws An error if no API key is passed and none is found in the environment.
   */
  constructor(loaderParams: FirecrawlLoaderParameters) {
    super();
    const {
      apiKey = getEnvironmentVariable("FIRECRAWL_API_KEY"),
      url,
      mode = "crawl",
      params,
    } = loaderParams;
    if (!apiKey) {
      throw new Error(
        "Firecrawl API key not set. You can set it as FIRECRAWL_API_KEY in your .env file, or pass it to Firecrawl."
      );
    }

    this.apiKey = apiKey;
    this.url = url;
    this.mode = mode;
    this.params = params;
  }

  /**
   * Loads the data from the Firecrawl.
   * @returns An array of Documents representing the retrieved data.
   * @throws An error if the request reports failure, if the response carries
   * no usable data, or if the configured mode is not recognized.
   */
  public async load(): Promise<DocumentInterface[]> {
    const app = new FirecrawlApp({ apiKey: this.apiKey });
    let firecrawlDocs: FirecrawlDocument[];

    if (this.mode === "scrape") {
      const response = await app.scrapeUrl(this.url, this.params);
      if (!response.success) {
        throw new Error(
          `Firecrawl: Failed to scrape URL. Error: ${response.error}`
        );
      }
      // Guard against a "successful" response without a payload; otherwise
      // the mapping below would fail with an opaque TypeError.
      if (!response.data) {
        throw new Error(
          "Firecrawl: Failed to scrape URL. Error: response contained no data"
        );
      }
      firecrawlDocs = [response.data as FirecrawlDocument];
    } else if (this.mode === "crawl") {
      // NOTE(review): the third argument is presumably the SDK's
      // "wait until done" flag, so the call resolves with the crawled
      // documents rather than a job handle - confirm against the
      // firecrawl-js documentation.
      const response = await app.crawlUrl(this.url, this.params, true);
      // Validate the shape before casting so an unexpected response surfaces
      // as a descriptive error instead of ".map is not a function".
      if (!Array.isArray(response)) {
        throw new Error(
          "Firecrawl: Failed to crawl URL. Error: response was not a list of documents"
        );
      }
      firecrawlDocs = response as FirecrawlDocument[];
    } else {
      // Unreachable for type-checked callers; kept for untyped (plain JS) use.
      throw new Error(
        `Unrecognized mode '${this.mode}'. Expected one of 'crawl', 'scrape'.`
      );
    }

    return firecrawlDocs.map(
      (doc) =>
        new Document({
          pageContent: doc.markdown ?? "",
          metadata: doc.metadata ?? {},
        })
    );
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters