feat: added timeouts to requests to prevent blocking requests #890

Open
wants to merge 1 commit into base: main
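
This PR adds an optional timeout parameter to the SDK's public methods and to the _post_request / _get_request / _delete_request helpers, forwarding it to the underlying requests calls so a slow or unresponsive server can no longer block indefinitely. A minimal usage sketch of the changed API, assuming the published firecrawl-py import path; the API key and URL are placeholders:

import requests
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

try:
    # timeout is in seconds and defaults to None (no limit), which keeps
    # the pre-PR behaviour; pass a value to bound the HTTP request.
    data = app.scrape_url("https://example.com", timeout=10.0)
except requests.exceptions.Timeout:
    # requests raises Timeout when the server does not answer in time;
    # with this PR the exception surfaces instead of the call hanging.
    print("scrape_url timed out after 10 seconds")
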
97 changes: 66 additions & 31 deletions apps/python-sdk/firecrawl/firecrawl.py
@@ -36,13 +36,15 @@ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None)
raise ValueError('No API key provided')
logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")

def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None,
timeout: Optional[float] = None) -> Any:
"""
Scrape the specified URL using the Firecrawl API.

Args:
url (str): The URL to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Any: The scraped data if the request is successful.
@@ -76,6 +78,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
f'{self.api_url}{endpoint}',
headers=headers,
json=scrape_params,
timeout=timeout
)
if response.status_code == 200:
response = response.json()
@@ -108,7 +111,9 @@ def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
idempotency_key: Optional[str] = None,
timeout: Optional[float] = None
) -> Any:
"""
Initiate a crawl job for the specified URL using the Firecrawl API.

@@ -117,6 +122,7 @@ def crawl_url(self, url: str,
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Dict[str, Any]: A dictionary containing the crawl results. The structure includes:
@@ -136,7 +142,7 @@ def crawl_url(self, url: str,
json_data = {'url': url}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
if response.status_code == 200:
id = response.json().get('id')
return self._monitor_job_status(id, headers, poll_interval)
@@ -145,14 +151,16 @@ def crawl_url(self, url: str,
self._handle_error(response, 'start crawl job')


def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None,
timeout: Optional[float] = None) -> Dict[str, Any]:
"""
Initiate a crawl job asynchronously.

Args:
url (str): The URL to crawl.
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Dict[str, Any]: A dictionary containing the crawl initiation response. The structure includes:
@@ -165,18 +173,19 @@ def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, ide
json_data = {'url': url}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'start crawl job')

def check_crawl_status(self, id: str) -> Any:
def check_crawl_status(self, id: str, timeout: Optional[float] = None) -> Any:
"""
Check the status of a crawl job using the Firecrawl API.

Args:
id (str): The ID of the crawl job.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Any: The status of the crawl job.
@@ -187,7 +196,7 @@ def check_crawl_status(self, id: str) -> Any:
endpoint = f'/v1/crawl/{id}'

headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
if response.status_code == 200:
data = response.json()
return {
@@ -204,48 +213,53 @@ def check_crawl_status(self, id: str) -> Any:
else:
self._handle_error(response, 'check crawl status')

def cancel_crawl(self, id: str) -> Dict[str, Any]:
def cancel_crawl(self, id: str, timeout: Optional[float] = None) -> Dict[str, Any]:
"""
Cancel an asynchronous crawl job using the Firecrawl API.

Args:
id (str): The ID of the crawl job to cancel.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Dict[str, Any]: The response from the cancel crawl request.
"""
headers = self._prepare_headers()
response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers, timeout=timeout)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, "cancel crawl job")

def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
def crawl_url_and_watch(self, url: str, params: Optional[Dict[str, Any]] = None,
idempotency_key: Optional[str] = None,
timeout: Optional[float] = None) -> 'CrawlWatcher':
"""
Initiate a crawl job and return a CrawlWatcher to monitor the job via WebSocket.

Args:
url (str): The URL to crawl.
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
CrawlWatcher: An instance of CrawlWatcher to monitor the crawl job.
"""
crawl_response = self.async_crawl_url(url, params, idempotency_key)
crawl_response = self.async_crawl_url(url, params, idempotency_key, timeout=timeout)
if crawl_response['success'] and 'id' in crawl_response:
return CrawlWatcher(crawl_response['id'], self)
else:
raise Exception("Crawl job failed to start")

def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
def map_url(self, url: str, params: Optional[Dict[str, Any]] = None, timeout: Optional[float] = None) -> Any:
"""
Perform a map search using the Firecrawl API.

Args:
url (str): The URL to perform the map search on.
params (Optional[Dict[str, Any]]): Additional parameters for the map search.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
List[str]: A list of URLs discovered during the map search.
@@ -263,6 +277,7 @@ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
f'{self.api_url}{endpoint}',
headers=headers,
json=json_data,
timeout=timeout
)
if response.status_code == 200:
response = response.json()
@@ -278,7 +293,9 @@ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
def batch_scrape_urls(self, urls: list[str],
params: Optional[Dict[str, Any]] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None) -> Any:
idempotency_key: Optional[str] = None,
timeout: Optional[float] = None
) -> Any:
"""
Initiate a batch scrape job for the specified URLs using the Firecrawl API.

@@ -287,6 +304,7 @@ def batch_scrape_urls(self, urls: list[str],
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
poll_interval (Optional[int]): Time in seconds between status checks when waiting for job completion. Defaults to 2 seconds.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Dict[str, Any]: A dictionary containing the scrape results. The structure includes:
@@ -306,7 +324,7 @@ def batch_scrape_urls(self, urls: list[str],
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
if response.status_code == 200:
id = response.json().get('id')
return self._monitor_job_status(id, headers, poll_interval)
@@ -315,14 +333,18 @@ def batch_scrape_urls(self, urls: list[str],
self._handle_error(response, 'start batch scrape job')


def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> Dict[str, Any]:
def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
idempotency_key: Optional[str] = None,
timeout: Optional[float] = None) -> Dict[str, Any]:
"""
Initiate a crawl job asynchronously.

Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.


Returns:
Dict[str, Any]: A dictionary containing the batch scrape initiation response. The structure includes:
@@ -335,36 +357,40 @@ def async_batch_scrape_urls(self, urls: list[str], params: Optional[Dict[str, An
json_data = {'urls': urls}
if params:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers, timeout=timeout)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'start batch scrape job')

def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> 'CrawlWatcher':
def batch_scrape_urls_and_watch(self, urls: list[str], params: Optional[Dict[str, Any]] = None,
idempotency_key: Optional[str] = None,
timeout: Optional[float] = None) -> 'CrawlWatcher':
"""
Initiate a batch scrape job and return a CrawlWatcher to monitor the job via WebSocket.

Args:
urls (list[str]): The URLs to scrape.
params (Optional[Dict[str, Any]]): Additional parameters for the scraper.
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
CrawlWatcher: An instance of CrawlWatcher to monitor the batch scrape job.
"""
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key)
crawl_response = self.async_batch_scrape_urls(urls, params, idempotency_key, timeout=timeout)
if crawl_response['success'] and 'id' in crawl_response:
return CrawlWatcher(crawl_response['id'], self)
else:
raise Exception("Batch scrape job failed to start")

def check_batch_scrape_status(self, id: str) -> Any:
def check_batch_scrape_status(self, id: str, timeout: Optional[float] = None) -> Any:
"""
Check the status of a batch scrape job using the Firecrawl API.

Args:
id (str): The ID of the batch scrape job.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
Any: The status of the batch scrape job.
@@ -375,7 +401,7 @@ def check_batch_scrape_status(self, id: str) -> Any:
endpoint = f'/v1/batch/scrape/{id}'

headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
response = self._get_request(f'{self.api_url}{endpoint}', headers, timeout=timeout)
if response.status_code == 200:
data = response.json()
return {
@@ -418,7 +444,9 @@ def _post_request(self, url: str,
data: Dict[str, Any],
headers: Dict[str, str],
retries: int = 3,
backoff_factor: float = 0.5) -> requests.Response:
backoff_factor: float = 0.5,
timeout: Optional[float] = None
) -> requests.Response:
"""
Make a POST request with retries.

@@ -428,6 +456,7 @@ def _post_request(self, url: str,
headers (Dict[str, str]): The headers to include in the POST request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
requests.Response: The response from the POST request.
@@ -436,7 +465,7 @@ def _post_request(self, url: str,
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries):
response = requests.post(url, headers=headers, json=data)
response = requests.post(url, headers=headers, json=data, timeout=timeout)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
@@ -446,7 +475,8 @@ def _get_request(self, url: str,
def _get_request(self, url: str,
headers: Dict[str, str],
retries: int = 3,
backoff_factor: float = 0.5) -> requests.Response:
backoff_factor: float = 0.5,
timeout: Optional[float] = None) -> requests.Response:
"""
Make a GET request with retries.

@@ -455,6 +485,7 @@ def _get_request(self, url: str,
headers (Dict[str, str]): The headers to include in the GET request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
requests.Response: The response from the GET request.
@@ -463,7 +494,7 @@ def _get_request(self, url: str,
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries):
response = requests.get(url, headers=headers)
response = requests.get(url, headers=headers, timeout=timeout)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
@@ -473,7 +504,8 @@ def _delete_request(self, url: str,
def _delete_request(self, url: str,
headers: Dict[str, str],
retries: int = 3,
backoff_factor: float = 0.5) -> requests.Response:
backoff_factor: float = 0.5,
timeout: Optional[float] = None) -> requests.Response:
"""
Make a DELETE request with retries.

@@ -482,6 +514,7 @@ def _delete_request(self, url: str,
headers (Dict[str, str]): The headers to include in the DELETE request.
retries (int): Number of retries for the request.
backoff_factor (float): Backoff factor for retries.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.

Returns:
requests.Response: The response from the DELETE request.
@@ -490,21 +523,24 @@ def _delete_request(self, url: str,
requests.RequestException: If the request fails after the specified retries.
"""
for attempt in range(retries):
response = requests.delete(url, headers=headers)
response = requests.delete(url, headers=headers, timeout=timeout)
if response.status_code == 502:
time.sleep(backoff_factor * (2 ** attempt))
else:
return response
return response

def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int) -> Any:
def _monitor_job_status(self, id: str, headers: Dict[str, str],
poll_interval: int,
timeout: Optional[float] = None) -> Any:
"""
Monitor the status of a crawl job until completion.

Args:
id (str): The ID of the crawl job.
headers (Dict[str, str]): The headers to include in the status check requests.
poll_interval (int): Secounds between status checks.
poll_interval (int): Seconds between status checks.
timeout (Optional[float]): Timeout in seconds for the request. Defaults to None.
Returns:
Any: The crawl results if the job is completed successfully.

@@ -513,15 +549,14 @@ def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: i
"""
while True:
api_url = f'{self.api_url}/v1/crawl/{id}'

status_response = self._get_request(api_url, headers)
status_response = self._get_request(api_url, headers, timeout=timeout)
if status_response.status_code == 200:
status_data = status_response.json()
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
status_response = self._get_request(status_data['next'], headers)
status_response = self._get_request(status_data['next'], headers, timeout=timeout)
status_data = status_response.json()
data.extend(status_data['data'])
status_data['data'] = data
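
Note on semantics: because _post_request, _get_request, and _delete_request retry 502 responses, the new timeout bounds each individual attempt rather than the whole call, so a request that keeps hitting 502s can still take several multiples of the timeout plus the backoff sleeps. A rough worst-case estimate under the defaults shown in the diff (retries=3, backoff_factor=0.5), as a sketch:

retries = 3
backoff_factor = 0.5
timeout = 10.0  # example per-attempt timeout in seconds

# One sleep of backoff_factor * 2**attempt follows every 502 response.
backoff_total = sum(backoff_factor * (2 ** attempt) for attempt in range(retries))
worst_case_seconds = retries * timeout + backoff_total  # 30.0 + 3.5 = 33.5
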
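
One more detail about the forwarded value: requests also accepts a (connect, read) tuple for timeout, and since these helpers pass the argument through unchanged, the tuple form works at runtime even though the new annotations say Optional[float]. A hypothetical call showing it, with placeholder key and URL:

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
app.crawl_url(
    "https://example.com",
    poll_interval=2,
    timeout=(3.05, 27),  # up to 3.05 s to connect, 27 s to read each response
)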