From 83a16c1b9b69ed3a19ef61bb92c02739ae1fe697 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Wed, 4 Oct 2023 12:54:14 +0200
Subject: [PATCH] Format Python Actor templates (#213)

I used the auto-formatters (`autopep8`, `isort`) that we use in our other
Python libraries (`SDK`, `Client`) to format the Python Actor templates.
I also used a linter (`flake8` with the plugins we use in our other libs)
to fix some violations, such as "trailing comma" and "bare except".
---
 templates/python-beautifulsoup/src/main.py       | 14 ++++++++------
 templates/python-empty/src/main.py               |  6 ++++--
 templates/python-playwright/src/main.py          | 13 +++++++------
 templates/python-scrapy/src/apify/main.py        |  2 +-
 templates/python-scrapy/src/apify/middlewares.py |  4 ++--
 templates/python-scrapy/src/apify/pipelines.py   |  1 -
 templates/python-scrapy/src/apify/scheduler.py   |  2 +-
 templates/python-selenium/src/main.py            | 13 +++++++------
 templates/python-start/src/main.py               |  8 +++++---
 9 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/templates/python-beautifulsoup/src/main.py b/templates/python-beautifulsoup/src/main.py
index 3f86137b..c1bfe139 100644
--- a/templates/python-beautifulsoup/src/main.py
+++ b/templates/python-beautifulsoup/src/main.py
@@ -1,14 +1,16 @@
 from urllib.parse import urljoin
 
 import requests
-from apify import Actor
 from bs4 import BeautifulSoup
 
+from apify import Actor
+
+
 async def main():
     async with Actor:
         # Read the Actor input
         actor_input = await Actor.get_input() or {}
-        start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         max_depth = actor_input.get('max_depth', 1)
 
         if not start_urls:
@@ -20,7 +22,7 @@ async def main():
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})
 
         # Process the requests in the queue one by one
         while request := await default_queue.fetch_next_request():
@@ -43,13 +45,13 @@ async def main():
                             Actor.log.info(f'Enqueuing {link_url} ...')
                             await default_queue.add_request({
                                 'url': link_url,
-                                'userData': {'depth': depth + 1 },
+                                'userData': {'depth': depth + 1},
                             })
 
                 # Push the title of the page into the default dataset
                 title = soup.title.string if soup.title else None
-                await Actor.push_data({ 'url': url, 'title': title })
-            except:
+                await Actor.push_data({'url': url, 'title': title})
+            except Exception:
                 Actor.log.exception(f'Cannot extract data from {url}.')
             finally:
                 # Mark the request as handled so it's not processed again
diff --git a/templates/python-empty/src/main.py b/templates/python-empty/src/main.py
index 0183abbc..a9ff21bd 100644
--- a/templates/python-empty/src/main.py
+++ b/templates/python-empty/src/main.py
@@ -1,8 +1,10 @@
-# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python)
-from apify import Actor
 # Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
 # from bs4 import BeautifulSoup
 
+# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python)
+from apify import Actor
+
+
 async def main():
     async with Actor:
         print('Hello from the Actor!')
diff --git a/templates/python-playwright/src/main.py b/templates/python-playwright/src/main.py
index d9b8b00b..e994698b 100644
--- a/templates/python-playwright/src/main.py
+++ b/templates/python-playwright/src/main.py
@@ -1,8 +1,9 @@
 from urllib.parse import urljoin
 
-from apify import Actor
 from playwright.async_api import async_playwright
 
+from apify import Actor
+
 # To run this Actor locally, you need to have the Playwright browsers installed.
 # Run `playwright install --with-deps` in the Actor's virtual environment to install them.
 # When running on the Apify platform, they are already included in the Actor's Docker image.
@@ -12,7 +13,7 @@ async def main():
     async with Actor:
         # Read the Actor input
         actor_input = await Actor.get_input() or {}
-        start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         max_depth = actor_input.get('max_depth', 1)
 
         if not start_urls:
@@ -24,7 +25,7 @@ async def main():
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})
 
         # Launch Playwright an open a new browser context
         Actor.log.info('Launching Playwright...')
@@ -53,13 +54,13 @@ async def main():
                             Actor.log.info(f'Enqueuing {link_url} ...')
                             await default_queue.add_request({
                                 'url': link_url,
-                                'userData': {'depth': depth + 1 },
+                                'userData': {'depth': depth + 1},
                             })
 
                 # Push the title of the page into the default dataset
                 title = await page.title()
-                await Actor.push_data({ 'url': url, 'title': title })
-            except:
+                await Actor.push_data({'url': url, 'title': title})
+            except Exception:
                 Actor.log.exception(f'Cannot extract data from {url}.')
             finally:
                 await page.close()
diff --git a/templates/python-scrapy/src/apify/main.py b/templates/python-scrapy/src/apify/main.py
index 7bcaf8b9..15fd046b 100644
--- a/templates/python-scrapy/src/apify/main.py
+++ b/templates/python-scrapy/src/apify/main.py
@@ -1,6 +1,6 @@
 from scrapy.crawler import CrawlerProcess
-from scrapy.utils.project import get_project_settings
 from scrapy.settings import Settings
+from scrapy.utils.project import get_project_settings
 
 from apify import Actor
 
diff --git a/templates/python-scrapy/src/apify/middlewares.py b/templates/python-scrapy/src/apify/middlewares.py
index ef4a3b30..fff5efdf 100644
--- a/templates/python-scrapy/src/apify/middlewares.py
+++ b/templates/python-scrapy/src/apify/middlewares.py
@@ -2,9 +2,9 @@
 
 from scrapy import Spider
 from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.exceptions import IgnoreRequest
 from scrapy.http import Request, Response
 from scrapy.utils.response import response_status_message
-from scrapy.exceptions import IgnoreRequest
 
 from apify.storages import RequestQueue
 
@@ -74,7 +74,7 @@ async def _handle_retry_logic(
         self,
         request: Request,
         response: Response,
-        spider: Spider
+        spider: Spider,
     ) -> Request | Response:
         apify_request = to_apify_request(request)
 
diff --git a/templates/python-scrapy/src/apify/pipelines.py b/templates/python-scrapy/src/apify/pipelines.py
index b874cedd..960a2b1d 100644
--- a/templates/python-scrapy/src/apify/pipelines.py
+++ b/templates/python-scrapy/src/apify/pipelines.py
@@ -1,5 +1,4 @@
 from itemadapter import ItemAdapter
-
 from scrapy import Item, Spider
 
 from apify import Actor
diff --git a/templates/python-scrapy/src/apify/scheduler.py b/templates/python-scrapy/src/apify/scheduler.py
index da78289f..b39a5ea7 100644
--- a/templates/python-scrapy/src/apify/scheduler.py
+++ b/templates/python-scrapy/src/apify/scheduler.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
             raise ValueError(
                 f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '
                 'Make sure you have it configured in the TWISTED_REACTOR setting. See the asyncio '
-                'documentation of Scrapy for more information.'
+                'documentation of Scrapy for more information.',
             )
         self._rq: RequestQueue | None = None
         self.spider: Spider | None = None
diff --git a/templates/python-selenium/src/main.py b/templates/python-selenium/src/main.py
index 3ca665a5..168e0507 100644
--- a/templates/python-selenium/src/main.py
+++ b/templates/python-selenium/src/main.py
@@ -1,10 +1,11 @@
 from urllib.parse import urljoin
 
-from apify import Actor
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options as ChromeOptions
 from selenium.webdriver.common.by import By
 
+from apify import Actor
+
 # To run this Actor locally, you need to have the Selenium Chromedriver installed.
 # https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
 # When running on the Apify platform, it is already included in the Actor's Docker image.
@@ -14,7 +15,7 @@ async def main():
     async with Actor:
         # Read the Actor input
         actor_input = await Actor.get_input() or {}
-        start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+        start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
         max_depth = actor_input.get('max_depth', 1)
 
         if not start_urls:
@@ -26,7 +27,7 @@ async def main():
         for start_url in start_urls:
             url = start_url.get('url')
             Actor.log.info(f'Enqueuing {url} ...')
-            await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+            await default_queue.add_request({'url': url, 'userData': {'depth': 0}})
 
         # Launch a new Selenium Chrome WebDriver
         Actor.log.info('Launching Chrome WebDriver...')
@@ -60,13 +61,13 @@ async def main():
                             Actor.log.info(f'Enqueuing {link_url} ...')
                             await default_queue.add_request({
                                 'url': link_url,
-                                'userData': {'depth': depth + 1 },
+                                'userData': {'depth': depth + 1},
                             })
 
                 # Push the title of the page into the default dataset
                 title = driver.title
-                await Actor.push_data({ 'url': url, 'title': title })
-            except:
+                await Actor.push_data({'url': url, 'title': title})
+            except Exception:
                 Actor.log.exception(f'Cannot extract data from {url}.')
             finally:
                 await default_queue.mark_request_as_handled(request)
diff --git a/templates/python-start/src/main.py b/templates/python-start/src/main.py
index be96cd2a..a36ae3cc 100644
--- a/templates/python-start/src/main.py
+++ b/templates/python-start/src/main.py
@@ -1,10 +1,12 @@
-# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
-from apify import Actor
 # Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
 import requests
 # Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
 from bs4 import BeautifulSoup
 
+# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
+from apify import Actor
+
+
 async def main():
     async with Actor:
         # Structure of input is defined in input_schema.json
@@ -20,7 +22,7 @@ async def main():
         # Extract all headings from the page (tag name and text).
         headings = []
         for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-            heading_object = { 'level': heading.name, 'text': heading.text }
+            heading_object = {'level': heading.name, 'text': heading.text}
             print('Extracted heading', heading_object)
             headings.append(heading_object)
 
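
As a quick reference (not part of the patch), the snippet below sketches the conventions these tools enforce in one place, mirroring the template code above. It assumes the templates' dependencies (`requests`, `beautifulsoup4`, `apify`) are installed; the trailing-comma rule comes from one of the flake8 plugins mentioned in the commit message.

```python
import asyncio  # standard library imports come first (isort)

import requests  # third-party imports form their own group
from bs4 import BeautifulSoup

from apify import Actor  # first-party imports go last, in a separate group


# Two blank lines before a top-level definition (pycodestyle E302, fixed by autopep8).
async def main():
    async with Actor:
        actor_input = await Actor.get_input() or {}
        # No padding spaces inside braces: {'url': ...}, not { 'url': ... }.
        url = actor_input.get('url', 'https://apify.com')
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.title.string if soup.title else None
            # Multi-line literals end with a trailing comma (flake8 plugin rule).
            await Actor.push_data({
                'url': url,
                'title': title,
            })
        except Exception:  # catch Exception explicitly, never a bare `except:` (E722)
            Actor.log.exception(f'Cannot extract data from {url}.')


if __name__ == '__main__':
    asyncio.run(main())
```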