diff --git a/apps/api/fly.toml b/apps/api/fly.toml
index ca619d164..0656a2cb3 100644
--- a/apps/api/fly.toml
+++ b/apps/api/fly.toml
@@ -27,6 +27,13 @@ kill_timeout = '5s'
     hard_limit = 200
     soft_limit = 100
 
+[[http_service.checks]]
+  grace_period = "10s"
+  interval = "30s"
+  method = "GET"
+  timeout = "5s"
+  path = "/"
+
 [[services]]
   protocol = 'tcp'
   internal_port = 8080
diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
index b0bfabb94..a335f8c5b 100644
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@@ -1,12 +1,12 @@
 import { parseApi } from "../../src/lib/parseApi";
-import { getRateLimiter, } from "../../src/services/rate-limiter";
+import { getRateLimiter, } from "../../src/services/rate-limiter";
 import { AuthResponse, RateLimiterMode } from "../../src/types";
 import { supabase_service } from "../../src/services/supabase";
 import { withAuth } from "../../src/lib/withAuth";
 import { RateLimiterRedis } from "rate-limiter-flexible";
 import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
 
-export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise {
+export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise {
   return withAuth(supaAuthenticateUser)(req, res, mode);
 }
 function setTrace(team_id: string, api_key: string) {
@@ -18,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
   } catch (error) {
     console.error('Error setting trace attributes:', error);
   }
-
+
 }
 export async function supaAuthenticateUser(
   req,
@@ -97,7 +97,7 @@ export async function supaAuthenticateUser(
     team_id: team_id,
     plan: plan
   }
-  switch (mode) {
+  switch (mode) {
     case RateLimiterMode.Crawl:
       rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
       break;
@@ -126,9 +126,11 @@ export async function supaAuthenticateUser(
     await rateLimiter.consume(iptoken);
   } catch (rateLimiterRes) {
     console.error(rateLimiterRes);
+    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
+    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
     return {
       success: false,
-      error: "Rate limit exceeded. Too many requests, try again in 1 minute.",
+      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Please retry after ${secs}s, resets at ${retryDate}`,
       status: 429,
     };
   }
@@ -155,9 +157,9 @@ export async function supaAuthenticateUser(
     normalizedApi = parseApi(token);
 
     const { data, error } = await supabase_service
-      .from("api_keys")
-      .select("*")
-      .eq("key", normalizedApi);
+      .from("api_keys")
+      .select("*")
+      .eq("key", normalizedApi);
 
     if (error || !data || data.length === 0) {
       return {
@@ -170,7 +172,7 @@ export async function supaAuthenticateUser(
     subscriptionData = data[0];
   }
 
-  return { success: true, team_id: subscriptionData.team_id };
+  return { success: true, team_id: subscriptionData.team_id };
 }
 
 function getPlanByPriceId(price_id: string) {
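Note on the auth.ts change above: the enriched 429 message surfaces `rate-limiter-flexible`'s rejection fields (`msBeforeNext`, `consumedPoints`, `remainingPoints`). A minimal sketch of how an API consumer might react to that status, written in Python for consistency with the SDK changes below; the flat 60-second wait and the decision not to parse the message are illustrative assumptions, not part of this patch:

```python
import time

import requests

def scrape_with_backoff(url: str, api_key: str, max_attempts: int = 3):
    """Hypothetical caller that backs off when the API returns 429."""
    for attempt in range(max_attempts):
        response = requests.post(
            'https://api.firecrawl.dev/v0/scrape',
            headers={'Authorization': f'Bearer {api_key}'},
            json={'url': url},
        )
        if response.status_code != 429:
            return response
        # The error string now reports "Please retry after <secs>s"; a
        # conservative client can simply wait a flat minute per attempt.
        time.sleep(60)
    raise RuntimeError('Still rate limited after retries')
```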
diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
index d98c08d5d..a73c625a8 100644
--- a/apps/api/src/controllers/search.ts
+++ b/apps/api/src/controllers/search.ts
@@ -28,11 +28,13 @@ export async function searchHelper(
   const tbs = searchOptions.tbs ?? null;
   const filter = searchOptions.filter ?? null;
 
+  const num_results = searchOptions.limit ?? 7;
+  const num_results_buffer = Math.floor(num_results * 1.5);
+
   let res = await search({
     query: query,
     advanced: advanced,
-    num_results: searchOptions.limit ?? 7,
+    num_results: num_results_buffer,
     tbs: tbs,
     filter: filter,
     lang: searchOptions.lang ?? "en",
@@ -47,6 +49,9 @@ export async function searchHelper(
   }
 
   res = res.filter((r) => !isUrlBlocked(r.url));
+  if (res.length > num_results) {
+    res = res.slice(0, num_results);
+  }
 
   if (res.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index c3d37c476..c6a1232eb 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -1,7 +1,6 @@
 const socialMediaBlocklist = [
   'facebook.com',
   'twitter.com',
-  'x.com',
   'instagram.com',
   'linkedin.com',
   'pinterest.com',
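The search.ts change above over-fetches by 1.5x so that dropping blocklisted domains (see blocklist.ts) still leaves enough results to satisfy the caller's limit, then trims back down. A self-contained sketch of the same over-fetch-and-trim idea, in Python purely for illustration; `fake_search` and the tiny blocklist are stand-ins, not the API's real functions:

```python
import math

BLOCKED_DOMAINS = ('facebook.com', 'twitter.com')  # stand-in blocklist

def fake_search(query: str, num_results: int):
    """Stand-in for the real search backend."""
    return [{'url': f'https://example-{i}.com/{query}'} for i in range(num_results)]

def is_url_blocked(url: str) -> bool:
    return any(domain in url for domain in BLOCKED_DOMAINS)

def search_with_buffer(query: str, limit: int = 7):
    buffer_size = math.floor(limit * 1.5)  # mirrors num_results_buffer
    results = fake_search(query, num_results=buffer_size)
    results = [r for r in results if not is_url_blocked(r['url'])]
    return results[:limit]  # trim back down to the requested count
```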
+ """ + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' } # Prepare the base scrape parameters with the URL scrape_params = {'url': url} - + # If there are additional params, process them if params: # Initialize extractorOptions if present @@ -32,7 +64,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction') # Update the scrape_params with the processed extractorOptions scrape_params['extractorOptions'] = extractor_options - + # Include any other params directly at the top level of scrape_params for key, value in params.items(): if key != 'extractorOptions': @@ -41,11 +73,11 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: response = requests.post( f'{self.api_url}/v0/scrape', headers=headers, - json=scrape_params + json=scrape_params, ) if response.status_code == 200: response = response.json() - if response['success']: + if response['success'] and 'data' in response: return response['data'] else: raise Exception(f'Failed to scrape URL. Error: {response["error"]}') @@ -54,8 +86,21 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Failed to scrape URL. Status code: {response.status_code}') - + def search(self, query, params=None): + """ + Perform a search using the Firecrawl API. + + Args: + query (str): The search query. + params (Optional[Dict[str, Any]]): Additional parameters for the search request. + + Returns: + Any: The search results if the request is successful. + + Raises: + Exception: If the search request fails. + """ headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}' @@ -70,19 +115,36 @@ def search(self, query, params=None): ) if response.status_code == 200: response = response.json() - if response['success'] == True: + + if response['success'] and 'data' in response: return response['data'] else: raise Exception(f'Failed to search. Error: {response["error"]}') - + elif response.status_code in [402, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}') else: raise Exception(f'Failed to search. Status code: {response.status_code}') - def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): - headers = self._prepare_headers() + def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None): + """ + Initiate a crawl job for the specified URL using the Firecrawl API. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl request. + wait_until_done (bool): Whether to wait until the crawl job is completed. + timeout (int): Timeout between status checks when waiting for job completion. + idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests. + + Returns: + Any: The crawl job ID or the crawl results if waiting until completion. + + Raises: + Exception: If the crawl job initiation or monitoring fails. 
+ """ + headers = self._prepare_headers(idempotency_key) json_data = {'url': url} if params: json_data.update(params) @@ -97,6 +159,18 @@ def crawl_url(self, url, params=None, wait_until_done=True, timeout=2): self._handle_error(response, 'start crawl job') def check_crawl_status(self, job_id): + """ + Check the status of a crawl job using the Firecrawl API. + + Args: + job_id (str): The ID of the crawl job. + + Returns: + Any: The status of the crawl job. + + Raises: + Exception: If the status check request fails. + """ headers = self._prepare_headers() response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if response.status_code == 200: @@ -104,13 +178,45 @@ def check_crawl_status(self, job_id): else: self._handle_error(response, 'check crawl status') - def _prepare_headers(self): + def _prepare_headers(self, idempotency_key=None): + """ + Prepare the headers for API requests. + + Args: + idempotency_key (Optional[str]): A unique key to ensure idempotency of requests. + + Returns: + Dict[str, str]: The headers including content type, authorization, and optionally idempotency key. + """ + if idempotency_key: + return { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}', + 'x-idempotency-key': idempotency_key + } + return { 'Content-Type': 'application/json', - 'Authorization': f'Bearer {self.api_key}' + 'Authorization': f'Bearer {self.api_key}', } def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): + """ + Make a POST request with retries. + + Args: + url (str): The URL to send the POST request to. + data (Dict[str, Any]): The JSON data to include in the POST request. + headers (Dict[str, str]): The headers to include in the POST request. + retries (int): Number of retries for the request. + backoff_factor (float): Backoff factor for retries. + + Returns: + requests.Response: The response from the POST request. + + Raises: + requests.RequestException: If the request fails after the specified retries. + """ for attempt in range(retries): response = requests.post(url, headers=headers, json=data) if response.status_code == 502: @@ -120,6 +226,21 @@ def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5): return response def _get_request(self, url, headers, retries=3, backoff_factor=0.5): + """ + Make a GET request with retries. + + Args: + url (str): The URL to send the GET request to. + headers (Dict[str, str]): The headers to include in the GET request. + retries (int): Number of retries for the request. + backoff_factor (float): Backoff factor for retries. + + Returns: + requests.Response: The response from the GET request. + + Raises: + requests.RequestException: If the request fails after the specified retries. + """ for attempt in range(retries): response = requests.get(url, headers=headers) if response.status_code == 502: @@ -129,7 +250,20 @@ def _get_request(self, url, headers, retries=3, backoff_factor=0.5): return response def _monitor_job_status(self, job_id, headers, timeout): - import time + """ + Monitor the status of a crawl job until completion. + + Args: + job_id (str): The ID of the crawl job. + headers (Dict[str, str]): The headers to include in the status check requests. + timeout (int): Timeout between status checks. + + Returns: + Any: The crawl results if the job is completed successfully. + + Raises: + Exception: If the job fails or an error occurs during status checks. 
+ """ while True: status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers) if status_response.status_code == 200: @@ -140,8 +274,7 @@ def _monitor_job_status(self, job_id, headers, timeout): else: raise Exception('Crawl job completed but no data was returned') elif status_data['status'] in ['active', 'paused', 'pending', 'queued']: - if timeout < 2: - timeout = 2 + timeout=max(timeout,2) time.sleep(timeout) # Wait for the specified timeout before checking again else: raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}') @@ -149,6 +282,16 @@ def _monitor_job_status(self, job_id, headers, timeout): self._handle_error(status_response, 'check crawl status') def _handle_error(self, response, action): + """ + Handle errors from API responses. + + Args: + response (requests.Response): The response object from the API request. + action (str): Description of the action that was being performed. + + Raises: + Exception: An exception with a message containing the status code and error details from the response. + """ if response.status_code in [402, 408, 409, 500]: error_message = response.json().get('error', 'Unknown error occurred') raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}') diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz new file mode 100644 index 000000000..3dd82a93b Binary files /dev/null and b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz differ diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz deleted file mode 100644 index 55e0eed7b..000000000 Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz and /dev/null differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl new file mode 100644 index 000000000..4d7e3aacd Binary files /dev/null and b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl differ diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl deleted file mode 100644 index 83cb5b774..000000000 Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl and /dev/null differ diff --git a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 605b3df71..000000000 Binary files a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc deleted file mode 100644 index 7c98fa33c..000000000 Binary files a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc and /dev/null differ diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO index c1ee531a0..7d4b83503 100644 --- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO +++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO @@ -1,7 +1,160 @@ Metadata-Version: 2.1 Name: firecrawl-py -Version: 0.0.9 +Version: 0.0.11 Summary: Python SDK for Firecrawl API Home-page: https://github.com/mendableai/firecrawl Author: Mendable.ai Author-email: nick@mendable.ai +License: GNU General Public License v3 (GPLv3) +Project-URL: Documentation, https://docs.firecrawl.dev +Project-URL: Source, 
diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz
new file mode 100644
index 000000000..3dd82a93b
Binary files /dev/null and b/apps/python-sdk/dist/firecrawl-py-0.0.11.tar.gz differ
diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
deleted file mode 100644
index 55e0eed7b..000000000
Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz and /dev/null differ
diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl
new file mode 100644
index 000000000..4d7e3aacd
Binary files /dev/null and b/apps/python-sdk/dist/firecrawl_py-0.0.11-py3-none-any.whl differ
diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
deleted file mode 100644
index 83cb5b774..000000000
Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl and /dev/null differ
diff --git a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc
deleted file mode 100644
index 605b3df71..000000000
Binary files a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc and /dev/null differ
diff --git a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc
deleted file mode 100644
index 7c98fa33c..000000000
Binary files a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc and /dev/null differ
diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
index c1ee531a0..7d4b83503 100644
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@@ -1,7 +1,160 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.9
+Version: 0.0.11
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai
 Author-email: nick@mendable.ai
+License: GNU General Public License v3 (GPLv3)
+Project-URL: Documentation, https://docs.firecrawl.dev
+Project-URL: Source, https://github.com/mendableai/firecrawl
+Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
+Keywords: SDK API firecrawl
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Web Environment
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Internet
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Indexing
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+
+# Firecrawl Python SDK
+
+The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
+
+## Installation
+
+To install the Firecrawl Python SDK, you can use pip:
+
+```bash
+pip install firecrawl-py
+```
+
+## Usage
+
+1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
+2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
+
+Here's an example of how to use the SDK:
+
+```python
+from firecrawl import FirecrawlApp
+
+# Initialize the FirecrawlApp with your API key
+app = FirecrawlApp(api_key='your_api_key')
+
+# Scrape a single URL
+url = 'https://mendable.ai'
+scraped_data = app.scrape_url(url)
+
+# Crawl a website
+crawl_url = 'https://mendable.ai'
+params = {
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params)
+```
+
+### Scraping a URL
+
+To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
+
+```python
+url = 'https://example.com'
+scraped_data = app.scrape_url(url)
+```
+
+### Extracting structured data from a URL
+
+With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make this easier for you. Here is how to use it:
+
+```python
+from typing import List
+
+from pydantic import BaseModel, Field
+
+class ArticleSchema(BaseModel):
+    title: str
+    points: int
+    by: str
+    commentsURL: str
+
+class TopArticlesSchema(BaseModel):
+    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
+
+data = app.scrape_url('https://news.ycombinator.com', {
+    'extractorOptions': {
+        'extractionSchema': TopArticlesSchema.model_json_schema(),
+        'mode': 'llm-extraction'
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+})
+print(data["llm_extraction"])
+```
+
+### Search for a query
+
+Used to search the web, get the most relevant results, scrape each page, and return the markdown.
+
+```python
+query = 'what is mendable?'
+search_result = app.search(query)
+```
+
+### Crawling a Website
+
+To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
+
+The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method polls the crawl job's status every `timeout` seconds until the job completes or fails. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
+
+```python
+crawl_url = 'https://example.com'
+params = {
+    'crawlerOptions': {
+        'excludes': ['blog/*'],
+        'includes': [], # leave empty for all pages
+        'limit': 1000,
+    },
+    'pageOptions': {
+        'onlyMainContent': True
+    }
+}
+crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
+```
+
+If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
+
+### Checking Crawl Status
+
+To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job. (Start the crawl with `wait_until_done=False` to get the job ID back.)
+
+```python
+job_id = app.crawl_url('https://example.com', wait_until_done=False)
+status = app.check_crawl_status(job_id)
+```
+
+## Error Handling
+
+The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
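+
+For example, a minimal sketch of catching these exceptions (the SDK raises plain `Exception` instances whose messages include the HTTP status code and the API's error detail):
+
+```python
+from firecrawl import FirecrawlApp
+
+app = FirecrawlApp(api_key='your_api_key')
+
+try:
+    data = app.scrape_url('https://example.com')
+except Exception as e:
+    # e.g. "Failed to scrape URL. Status code: 402. Error: ..."
+    print(f'Scrape failed: {e}')
+```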
+
+## Contributing
+
+Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
+
+## License
+
+The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py
index 726cafa34..beee059de 100644
--- a/apps/python-sdk/setup.py
+++ b/apps/python-sdk/setup.py
@@ -1,16 +1,52 @@
-from setuptools import setup, find_packages
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+this_directory = Path(__file__).parent
+long_description_content = (this_directory / "README.md").read_text()
 
 setup(
-    name='firecrawl-py',
-    version='0.0.9',
-    url='https://github.com/mendableai/firecrawl',
-    author='Mendable.ai',
-    author_email='nick@mendable.ai',
-    description='Python SDK for Firecrawl API',
+    name="firecrawl-py",
+    version="0.0.11",
+    url="https://github.com/mendableai/firecrawl",
+    author="Mendable.ai",
+    author_email="nick@mendable.ai",
+    description="Python SDK for Firecrawl API",
+    long_description=long_description_content,
+    long_description_content_type="text/markdown",
     packages=find_packages(),
     install_requires=[
         'requests',
         'pytest',
         'python-dotenv',
     ],
+    python_requires='>=3.8',
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Environment :: Web Environment",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Natural Language :: English",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Internet",
+        "Topic :: Internet :: WWW/HTTP",
+        "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
+        "Topic :: Software Development",
+        "Topic :: Software Development :: Libraries",
+        "Topic :: Software Development :: Libraries :: Python Modules",
+        "Topic :: Text Processing",
+        "Topic :: Text Processing :: Indexing",
+    ],
+    keywords="SDK API firecrawl",
+    project_urls={
+        "Documentation": "https://docs.firecrawl.dev",
+        "Source": "https://github.com/mendableai/firecrawl",
+        "Tracker": "https://github.com/mendableai/firecrawl/issues",
+    },
+    license="GNU General Public License v3 (GPLv3)",
 )
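Finally, a note on the refreshed `dist/` artifacts above: they are presumably regenerated from this `setup.py`. One standard way to rebuild them, assuming the PyPA `build` frontend (not prescribed by this repo):

```bash
# from apps/python-sdk/: produce a fresh sdist and wheel in dist/
pip install --upgrade build
python -m build
```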