Merge branch 'main' into main

Axmoney · May 27, 2024 · efb821d · efb821d
2 parents ed4226f + 1bbfb98
commit efb821d
Show file tree

Hide file tree

Showing 11 changed files with 182 additions and 32 deletions.
diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
@@ -1,12 +1,12 @@
 import { parseApi } from "../../src/lib/parseApi";
-import { getRateLimiter,  } from "../../src/services/rate-limiter";
+import { getRateLimiter, } from "../../src/services/rate-limiter";
 import { AuthResponse, RateLimiterMode } from "../../src/types";
 import { supabase_service } from "../../src/services/supabase";
 import { withAuth } from "../../src/lib/withAuth";
 import { RateLimiterRedis } from "rate-limiter-flexible";
 import { setTraceAttributes } from '@hyperdx/node-opentelemetry';
 
-export async function authenticateUser(req, res, mode?: RateLimiterMode) : Promise<AuthResponse> {
+export async function authenticateUser(req, res, mode?: RateLimiterMode): Promise<AuthResponse> {
   return withAuth(supaAuthenticateUser)(req, res, mode);
 }
 function setTrace(team_id: string, api_key: string) {
@@ -18,7 +18,7 @@ function setTrace(team_id: string, api_key: string) {
   } catch (error) {
     console.error('Error setting trace attributes:', error);
   }
-  
+
 }
 export async function supaAuthenticateUser(
   req,
@@ -97,7 +97,7 @@ export async function supaAuthenticateUser(
       team_id: team_id,
       plan: plan
     }
-    switch (mode) { 
+    switch (mode) {
       case RateLimiterMode.Crawl:
         rateLimiter = getRateLimiter(RateLimiterMode.Crawl, token, subscriptionData.plan);
         break;
@@ -126,9 +126,11 @@ export async function supaAuthenticateUser(
     await rateLimiter.consume(iptoken);
   } catch (rateLimiterRes) {
     console.error(rateLimiterRes);
+    const secs = Math.round(rateLimiterRes.msBeforeNext / 1000) || 1;
+    const retryDate = new Date(Date.now() + rateLimiterRes.msBeforeNext);
     return {
       success: false,
-      error: "Rate limit exceeded. Too many requests, try again in 1 minute.",
+      error: `Rate limit exceeded. Consumed points: ${rateLimiterRes.consumedPoints}, Remaining points: ${rateLimiterRes.remainingPoints}. Please retry after ${secs}s, resets at ${retryDate}`,
       status: 429,
     };
   }
@@ -155,9 +157,9 @@ export async function supaAuthenticateUser(
     normalizedApi = parseApi(token);
 
     const { data, error } = await supabase_service
-    .from("api_keys")
-    .select("*")
-    .eq("key", normalizedApi);
+      .from("api_keys")
+      .select("*")
+      .eq("key", normalizedApi);
 
     if (error || !data || data.length === 0) {
       return {
@@ -170,7 +172,7 @@ export async function supaAuthenticateUser(
     subscriptionData = data[0];
   }
 
-  return { success: true, team_id: subscriptionData.team_id };  
+  return { success: true, team_id: subscriptionData.team_id };
 }
 
 function getPlanByPriceId(price_id: string) {

diff --git a/apps/api/src/controllers/search.ts b/apps/api/src/controllers/search.ts
@@ -28,11 +28,13 @@ export async function searchHelper(
 
   const tbs = searchOptions.tbs ?? null;
   const filter = searchOptions.filter ?? null;
+  const num_results = searchOptions.limit ?? 7;
+  const num_results_buffer = Math.floor(num_results * 1.5);
 
   let res = await search({
     query: query,
     advanced: advanced,
-    num_results: searchOptions.limit ?? 7,
+    num_results: num_results_buffer,
     tbs: tbs,
     filter: filter,
     lang: searchOptions.lang ?? "en",
@@ -47,6 +49,9 @@ export async function searchHelper(
   }
 
   res = res.filter((r) => !isUrlBlocked(r.url));
+  if (res.length > num_results) {
+    res = res.slice(0, num_results);
+  }
 
   if (res.length === 0) {
     return { success: true, error: "No search results found", returnCode: 200 };

diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py
@@ -1,25 +1,57 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
+import time
 from typing import Any, Dict, Optional
+
 import requests
-import time
+
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Main class for interacting with the Firecrawl API.
+
+    Args:
+        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+        api_url (Optional[str]): Base URL for the Firecrawl API.
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         if self.api_key is None:
             raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-
-
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
         # Prepare the base scrape parameters with the URL
         scrape_params = {'url': url}
-        
+
         # If there are additional params, process them
         if params:
             # Initialize extractorOptions if present
@@ -32,7 +64,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
                 extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                 # Update the scrape_params with the processed extractorOptions
                 scrape_params['extractorOptions'] = extractor_options
-            
+
             # Include any other params directly at the top level of scrape_params
             for key, value in params.items():
                 if key != 'extractorOptions':
@@ -41,11 +73,11 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         response = requests.post(
             f'{self.api_url}/v0/scrape',
             headers=headers,
-            json=scrape_params
+            json=scrape_params,
         )
         if response.status_code == 200:
             response = response.json()
-            if response['success']:
+            if response['success'] and 'data' in response:
                 return response['data']
             else:
                 raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
@@ -54,8 +86,21 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-        
+
     def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
@@ -70,19 +115,36 @@ def search(self, query, params=None):
         )
         if response.status_code == 200:
             response = response.json()
-            if response['success'] == True:
+
+            if response['success'] and 'data' in response:
                 return response['data']
             else:
                 raise Exception(f'Failed to search. Error: {response["error"]}')
-            
+
         elif response.status_code in [402, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
         else:
             raise Exception(f'Failed to search. Status code: {response.status_code}')
 
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
-        headers = self._prepare_headers()
+    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+            idempotency_key (Optional[str]): A unique UUID sent as the `x-idempotency-key` header to ensure idempotency of requests.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
+        headers = self._prepare_headers(idempotency_key)
         json_data = {'url': url}
         if params:
             json_data.update(params)
@@ -97,20 +159,64 @@ def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
             self._handle_error(response, 'start crawl job')
 
     def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
         if response.status_code == 200:
             return response.json()
         else:
             self._handle_error(response, 'check crawl status')
 
-    def _prepare_headers(self):
+    def _prepare_headers(self, idempotency_key=None):
+        """
+        Prepare the headers for API requests.
+
+        Args:
+            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
+        """
+        if idempotency_key:
+            return {
+                'Content-Type': 'application/json',
+                'Authorization': f'Bearer {self.api_key}',
+                'x-idempotency-key': idempotency_key
+            }
+
         return {
             'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
+            'Authorization': f'Bearer {self.api_key}',
         }
 
     def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
         for attempt in range(retries):
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 502:
@@ -120,6 +226,21 @@ def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
         return response
 
     def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
         for attempt in range(retries):
             response = requests.get(url, headers=headers)
             if response.status_code == 502:
@@ -129,7 +250,20 @@ def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
         return response
 
     def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Seconds to sleep between status checks; values below 2 are raised to 2.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
         while True:
             status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
             if status_response.status_code == 200:
@@ -140,15 +274,24 @@ def _monitor_job_status(self, job_id, headers, timeout):
                     else:
                         raise Exception('Crawl job completed but no data was returned')
                 elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout=max(timeout,2)
                     time.sleep(timeout)  # Wait for the specified timeout before checking again
                 else:
                     raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
             else:
                 self._handle_error(status_response, 'check crawl status')
 
     def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
         if response.status_code in [402, 408, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')

diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.10.tar.gz
diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.9.tar.gz
diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.10-py3-none-any.whl
diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.9-py3-none-any.whl
diff --git a/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/__init__.cpython-311.pyc
diff --git a/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc b/apps/python-sdk/firecrawl/__pycache__/firecrawl.cpython-311.pyc
diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 0.0.9
+Version: 0.0.10
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/mendableai/firecrawl
 Author: Mendable.ai

diff --git a/apps/python-sdk/setup.py b/apps/python-sdk/setup.py
@@ -7,7 +7,7 @@
 
 setup(
     name="firecrawl-py",
-    version="0.0.9",
+    version="0.0.10",
     url="https://github.com/mendableai/firecrawl",
     author="Mendable.ai",
     author_email="[email protected]",