Commit

Merge pull request mendableai#178 from mattjoyce/main
Fix FIRECRAWL_API_URL bug, also various PyLint fixes
nickscamara authored May 24, 2024
2 parents e0d979e + 8c380d7 commit 605ba4c
Showing 2 changed files with 148 additions and 14 deletions.
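For context on the headline fix: the old constructor signature hard-coded api_url='https://api.firecrawl.dev', so the api_url or os.getenv('FIRECRAWL_API_URL') fallback could never reach the environment variable. The diff below moves the default into os.getenv() instead. A minimal sketch of the fixed behavior, assuming the firecrawl-py package is installed; the placeholder key and self-hosted URL are illustrative, not taken from the diff:

import os
from firecrawl import FirecrawlApp

# Point the SDK at a self-hosted Firecrawl instance via the environment.
os.environ['FIRECRAWL_API_KEY'] = 'fc-YOUR-API-KEY'        # placeholder key
os.environ['FIRECRAWL_API_URL'] = 'http://localhost:3002'  # illustrative URL

app = FirecrawlApp()
# Before this commit the signature default short-circuited the env lookup,
# so api_url was always 'https://api.firecrawl.dev'. After it, the env var wins.
print(app.api_url)  # http://localhost:3002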
2 changes: 2 additions & 0 deletions apps/python-sdk/.pylintrc
@@ -0,0 +1,2 @@
+[FORMAT]
+max-line-length = 120
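(The new .pylintrc raises pylint's default 100-character line limit to 120 for this package. Pylint picks the file up automatically when run from apps/python-sdk, or it can be pointed at explicitly, for example: pylint --rcfile=apps/python-sdk/.pylintrc apps/python-sdk/firecrawl/firecrawl.py. The exact invocation is illustrative, not part of the commit.)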
160 changes: 146 additions & 14 deletions apps/python-sdk/firecrawl/firecrawl.py
@@ -1,25 +1,57 @@
+"""
+FirecrawlApp Module
+
+This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
+It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
+and check the status of these jobs. The module uses requests for HTTP communication
+and handles retries for certain HTTP status codes.
+
+Classes:
+    - FirecrawlApp: Main class for interacting with the Firecrawl API.
+"""
+
 import os
+import time
 from typing import Any, Dict, Optional
+
 import requests
-import time
 
+
 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url='https://api.firecrawl.dev'):
+    """
+    Initialize the FirecrawlApp instance.
+
+    Args:
+        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
+        api_url (Optional[str]): Base URL for the Firecrawl API.
+    """
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
         self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
         if self.api_key is None:
             raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL')
-
-
-
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
     def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
+        """
+        Scrape the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to scrape.
+            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
+
+        Returns:
+            Any: The scraped data if the request is successful.
+
+        Raises:
+            Exception: If the scrape request fails.
+        """
+
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
         # Prepare the base scrape parameters with the URL
         scrape_params = {'url': url}
 
         # If there are additional params, process them
         if params:
             # Initialize extractorOptions if present
@@ -32,7 +64,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
             extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
             # Update the scrape_params with the processed extractorOptions
             scrape_params['extractorOptions'] = extractor_options
-
+
             # Include any other params directly at the top level of scrape_params
             for key, value in params.items():
                 if key != 'extractorOptions':
@@ -41,7 +73,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
         response = requests.post(
             f'{self.api_url}/v0/scrape',
             headers=headers,
-            json=scrape_params
+            json=scrape_params,
         )
         if response.status_code == 200:
             response = response.json()
@@ -54,8 +86,21 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
         else:
             raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
 
     def search(self, query, params=None):
+        """
+        Perform a search using the Firecrawl API.
+
+        Args:
+            query (str): The search query.
+            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+
+        Returns:
+            Any: The search results if the request is successful.
+
+        Raises:
+            Exception: If the search request fails.
+        """
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
@@ -70,18 +115,34 @@ def search(self, query, params=None):
         )
         if response.status_code == 200:
             response = response.json()
+
             if response['success'] and 'data' in response:
                 return response['data']
             else:
                 raise Exception(f'Failed to search. Error: {response["error"]}')
 
         elif response.status_code in [402, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
         else:
             raise Exception(f'Failed to search. Status code: {response.status_code}')
 
     def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
+        """
+        Initiate a crawl job for the specified URL using the Firecrawl API.
+
+        Args:
+            url (str): The URL to crawl.
+            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
+            wait_until_done (bool): Whether to wait until the crawl job is completed.
+            timeout (int): Timeout between status checks when waiting for job completion.
+
+        Returns:
+            Any: The crawl job ID or the crawl results if waiting until completion.
+
+        Raises:
+            Exception: If the crawl job initiation or monitoring fails.
+        """
         headers = self._prepare_headers()
         json_data = {'url': url}
         if params:
@@ -97,6 +158,18 @@ def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
             self._handle_error(response, 'start crawl job')
 
     def check_crawl_status(self, job_id):
+        """
+        Check the status of a crawl job using the Firecrawl API.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+
+        Returns:
+            Any: The status of the crawl job.
+
+        Raises:
+            Exception: If the status check request fails.
+        """
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
         if response.status_code == 200:
@@ -105,12 +178,34 @@ def check_crawl_status(self, job_id):
             self._handle_error(response, 'check crawl status')
 
     def _prepare_headers(self):
+        """
+        Prepare the headers for API requests.
+
+        Returns:
+            Dict[str, str]: The headers including content type and authorization.
+        """
         return {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
 
     def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a POST request with retries.
+
+        Args:
+            url (str): The URL to send the POST request to.
+            data (Dict[str, Any]): The JSON data to include in the POST request.
+            headers (Dict[str, str]): The headers to include in the POST request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the POST request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
         for attempt in range(retries):
             response = requests.post(url, headers=headers, json=data)
             if response.status_code == 502:
@@ -120,6 +215,21 @@ def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
             return response
 
     def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
+        """
+        Make a GET request with retries.
+
+        Args:
+            url (str): The URL to send the GET request to.
+            headers (Dict[str, str]): The headers to include in the GET request.
+            retries (int): Number of retries for the request.
+            backoff_factor (float): Backoff factor for retries.
+
+        Returns:
+            requests.Response: The response from the GET request.
+
+        Raises:
+            requests.RequestException: If the request fails after the specified retries.
+        """
        for attempt in range(retries):
             response = requests.get(url, headers=headers)
             if response.status_code == 502:
@@ -129,7 +239,20 @@ def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
             return response
 
     def _monitor_job_status(self, job_id, headers, timeout):
-        import time
+        """
+        Monitor the status of a crawl job until completion.
+
+        Args:
+            job_id (str): The ID of the crawl job.
+            headers (Dict[str, str]): The headers to include in the status check requests.
+            timeout (int): Timeout between status checks.
+
+        Returns:
+            Any: The crawl results if the job is completed successfully.
+
+        Raises:
+            Exception: If the job fails or an error occurs during status checks.
+        """
         while True:
             status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
             if status_response.status_code == 200:
@@ -140,15 +263,24 @@ def _monitor_job_status(self, job_id, headers, timeout):
                     else:
                         raise Exception('Crawl job completed but no data was returned')
                 elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
+                    timeout=max(timeout,2)
                     time.sleep(timeout) # Wait for the specified timeout before checking again
                 else:
                     raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
             else:
                 self._handle_error(status_response, 'check crawl status')
 
     def _handle_error(self, response, action):
+        """
+        Handle errors from API responses.
+
+        Args:
+            response (requests.Response): The response object from the API request.
+            action (str): Description of the action that was being performed.
+
+        Raises:
+            Exception: An exception with a message containing the status code and error details from the response.
+        """
         if response.status_code in [402, 408, 409, 500]:
             error_message = response.json().get('error', 'Unknown error occurred')
             raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')

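Taken together, a hedged usage sketch against the SDK as it stands after this commit; the target URL, prompt, and extractorOptions fields are illustrative rather than taken from the diff:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='fc-YOUR-API-KEY')  # or rely on FIRECRAWL_API_KEY

# scrape_url defaults extractorOptions['mode'] to 'llm-extraction' when
# extractor options are supplied.
data = app.scrape_url('https://example.com', params={
    'extractorOptions': {'extractionPrompt': 'Summarize this page in one sentence.'}
})

# With wait_until_done=True, crawl_url polls /v0/crawl/status/<job_id>.
# The new timeout=max(timeout,2) clamp means timeout=1 still polls every
# 2 seconds instead of hammering the API.
results = app.crawl_url('https://example.com', wait_until_done=True, timeout=1)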