Format Python Actor templates (#213)

I used the auto-formatters (`autopep8`, `isort`) that we use in our other
Python libraries (`SDK`, `Client`) to format the Python Actor templates.

I also used a linter (`flake8`, with the plugins we use in our other libraries) to
fix some violations, such as "trailing comma" and "bare except".
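
For context, here is a minimal, hypothetical sketch of the two kinds of flake8 violations mentioned above and how they were fixed; the function and values are made up for illustration and are not taken from the templates.

```python
# Hypothetical example only - not code from the templates.

def parse_number(value: str) -> int | None:
    try:
        return int(value)
    except Exception:  # was a bare `except:`, which would also swallow KeyboardInterrupt and SystemExit
        return None


results = [
    parse_number('42'),
    parse_number('not a number'),  # trailing comma after the last element of a multi-line literal
]
print(results)  # [42, None]
```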
vdusek authored Oct 4, 2023
1 parent 46aa262 commit 83a16c1
Showing 9 changed files with 35 additions and 28 deletions.
14 changes: 8 additions & 6 deletions templates/python-beautifulsoup/src/main.py
@@ -1,14 +1,16 @@
from urllib.parse import urljoin

import requests
-from apify import Actor
from bs4 import BeautifulSoup

+from apify import Actor


async def main():
async with Actor:
# Read the Actor input
actor_input = await Actor.get_input() or {}
-start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
max_depth = actor_input.get('max_depth', 1)

if not start_urls:
@@ -20,7 +22,7 @@ async def main():
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
-await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

# Process the requests in the queue one by one
while request := await default_queue.fetch_next_request():
@@ -43,13 +45,13 @@ async def main():
Actor.log.info(f'Enqueuing {link_url} ...')
await default_queue.add_request({
'url': link_url,
-'userData': {'depth': depth + 1 },
+'userData': {'depth': depth + 1},
})

# Push the title of the page into the default dataset
title = soup.title.string if soup.title else None
-await Actor.push_data({ 'url': url, 'title': title })
-except:
+await Actor.push_data({'url': url, 'title': title})
+except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')
finally:
# Mark the request as handled so it's not processed again
6 changes: 4 additions & 2 deletions templates/python-empty/src/main.py
@@ -1,8 +1,10 @@
-# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python)
-from apify import Actor
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
# from bs4 import BeautifulSoup

+# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python)
+from apify import Actor


async def main():
async with Actor:
print('Hello from the Actor!')
13 changes: 7 additions & 6 deletions templates/python-playwright/src/main.py
@@ -1,8 +1,9 @@
from urllib.parse import urljoin

-from apify import Actor
from playwright.async_api import async_playwright

+from apify import Actor

# To run this Actor locally, you need to have the Playwright browsers installed.
# Run `playwright install --with-deps` in the Actor's virtual environment to install them.
# When running on the Apify platform, they are already included in the Actor's Docker image.
@@ -12,7 +13,7 @@ async def main():
async with Actor:
# Read the Actor input
actor_input = await Actor.get_input() or {}
-start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
max_depth = actor_input.get('max_depth', 1)

if not start_urls:
@@ -24,7 +25,7 @@ async def main():
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
-await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

# Launch Playwright and open a new browser context
Actor.log.info('Launching Playwright...')
@@ -53,13 +54,13 @@ async def main():
Actor.log.info(f'Enqueuing {link_url} ...')
await default_queue.add_request({
'url': link_url,
-'userData': {'depth': depth + 1 },
+'userData': {'depth': depth + 1},
})

# Push the title of the page into the default dataset
title = await page.title()
-await Actor.push_data({ 'url': url, 'title': title })
-except:
+await Actor.push_data({'url': url, 'title': title})
+except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')
finally:
await page.close()
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/apify/main.py
@@ -1,6 +1,6 @@
from scrapy.crawler import CrawlerProcess
-from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
+from scrapy.utils.project import get_project_settings

from apify import Actor

4 changes: 2 additions & 2 deletions templates/python-scrapy/src/apify/middlewares.py
@@ -2,9 +2,9 @@

from scrapy import Spider
from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response
from scrapy.utils.response import response_status_message
-from scrapy.exceptions import IgnoreRequest

from apify.storages import RequestQueue

@@ -74,7 +74,7 @@ async def _handle_retry_logic(
self,
request: Request,
response: Response,
-spider: Spider
+spider: Spider,
) -> Request | Response:
apify_request = to_apify_request(request)

1 change: 0 additions & 1 deletion templates/python-scrapy/src/apify/pipelines.py
@@ -1,5 +1,4 @@
from itemadapter import ItemAdapter

from scrapy import Item, Spider

from apify import Actor
2 changes: 1 addition & 1 deletion templates/python-scrapy/src/apify/scheduler.py
@@ -22,7 +22,7 @@ def __init__(self) -> None:
raise ValueError(
f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '
'Make sure you have it configured in the TWISTED_REACTOR setting. See the asyncio '
-'documentation of Scrapy for more information.'
+'documentation of Scrapy for more information.',
)
self._rq: RequestQueue | None = None
self.spider: Spider | None = None
13 changes: 7 additions & 6 deletions templates/python-selenium/src/main.py
@@ -1,10 +1,11 @@
from urllib.parse import urljoin

-from apify import Actor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By

+from apify import Actor

# To run this Actor locally, you need to have the Selenium Chromedriver installed.
# https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/
# When running on the Apify platform, it is already included in the Actor's Docker image.
@@ -14,7 +15,7 @@ async def main():
async with Actor:
# Read the Actor input
actor_input = await Actor.get_input() or {}
-start_urls = actor_input.get('start_urls', [{ 'url': 'https://apify.com' }])
+start_urls = actor_input.get('start_urls', [{'url': 'https://apify.com'}])
max_depth = actor_input.get('max_depth', 1)

if not start_urls:
@@ -26,7 +27,7 @@ async def main():
for start_url in start_urls:
url = start_url.get('url')
Actor.log.info(f'Enqueuing {url} ...')
-await default_queue.add_request({ 'url': url, 'userData': { 'depth': 0 }})
+await default_queue.add_request({'url': url, 'userData': {'depth': 0}})

# Launch a new Selenium Chrome WebDriver
Actor.log.info('Launching Chrome WebDriver...')
@@ -60,13 +61,13 @@ async def main():
Actor.log.info(f'Enqueuing {link_url} ...')
await default_queue.add_request({
'url': link_url,
-'userData': {'depth': depth + 1 },
+'userData': {'depth': depth + 1},
})

# Push the title of the page into the default dataset
title = driver.title
-await Actor.push_data({ 'url': url, 'title': title })
-except:
+await Actor.push_data({'url': url, 'title': title})
+except Exception:
Actor.log.exception(f'Cannot extract data from {url}.')
finally:
await default_queue.mark_request_as_handled(request)
8 changes: 5 additions & 3 deletions templates/python-start/src/main.py
@@ -1,10 +1,12 @@
-# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
-from apify import Actor
# Requests - library for making HTTP requests in Python (Read more at https://requests.readthedocs.io)
import requests
# Beautiful Soup - library for pulling data out of HTML and XML files (Read more at https://www.crummy.com/software/BeautifulSoup/bs4/doc)
from bs4 import BeautifulSoup

+# Apify SDK - toolkit for building Apify Actors (Read more at https://docs.apify.com/sdk/python).
+from apify import Actor


async def main():
async with Actor:
# Structure of input is defined in input_schema.json
@@ -20,7 +22,7 @@ async def main():
# Extract all headings from the page (tag name and text).
headings = []
for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
-heading_object = { 'level': heading.name, 'text': heading.text }
+heading_object = {'level': heading.name, 'text': heading.text}
print('Extracted heading', heading_object)
headings.append(heading_object)

