Skip to content

Commit

Permalink
Merge branch 'main' into feat/idempotency-key
Browse files Browse the repository at this point in the history
  • Loading branch information
rafaelsideguide committed May 24, 2024
2 parents c201ea1 + 605ba4c commit d39860c
Show file tree
Hide file tree
Showing 8 changed files with 273 additions and 24 deletions.
35 changes: 35 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
---
name: Bug report
about: Create a report to help us improve
title: "[BUG]"
labels: bug
assignees: ''

---

**Describe the Bug**
Provide a clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the issue:
1. Configure the environment or settings with '...'
2. Run the command '...'
3. Observe the error or unexpected output at '...'
4. Log output/error message

**Expected Behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots or copies of the command line output to help explain the issue.

**Environment (please complete the following information):**
- OS: [e.g. macOS, Linux, Windows]
- Firecrawl Version: [e.g. 1.2.3]
- Node.js Version: [e.g. 14.x]

**Logs**
If applicable, include detailed logs to help understand the problem.

**Additional Context**
Add any other context about the problem here, such as configuration specifics, network conditions, data volumes, etc.
26 changes: 26 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
name: Feature request
about: Suggest an idea for this project
title: "[Feat]"
labels: ''
assignees: ''

---

**Problem Description**
Describe the issue you're experiencing that has prompted this feature request. For example, "I find it difficult when..."

**Proposed Feature**
Provide a clear and concise description of the feature you would like implemented.

**Alternatives Considered**
Discuss any alternative solutions or features you've considered. Why were these alternatives not suitable?

**Implementation Suggestions**
If you have ideas on how the feature could be implemented, share them here. This could include technical details, API changes, or interaction mechanisms.

**Use Case**
Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience.

**Additional Context**
Add any other context such as comparisons with similar features in other products, or links to prototypes or mockups.
9 changes: 8 additions & 1 deletion apps/api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,11 @@ STRIPE_PRICE_ID_SCALE=
HYPERDX_API_KEY=
HDX_NODE_BETA_MODE=1

FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta
FIRE_ENGINE_BETA_URL= # set if you'd like to use the fire engine closed beta

# Proxy settings for Playwright (alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request)
PROXY_SERVER=
PROXY_USERNAME=
PROXY_PASSWORD=
# set if you'd like to block media requests to save proxy bandwidth
BLOCK_MEDIA=
20 changes: 17 additions & 3 deletions apps/api/src/scraper/WebScraper/utils/blocklist.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const socialMediaBlocklist = [
'facebook.com',
'twitter.com',
'x.com',
'instagram.com',
'linkedin.com',
'pinterest.com',
Expand All @@ -14,12 +15,25 @@ const socialMediaBlocklist = [
'telegram.org',
];

const allowedUrls = [
'linkedin.com/pulse'
// Keywords that exempt a URL from blocking: isUrlBlocked returns false as soon
// as the URL contains any one of these strings.
// NOTE(review): the check is a plain substring match against the whole URL, so
// a keyword appearing anywhere in it (host, path or query) counts as a match —
// confirm this is as broad as intended.
const allowedKeywords = [
'pulse',
'privacy',
'terms',
'policy',
'user-agreement',
'legal',
'help',
'support',
'contact',
'about',
'careers',
'blog',
'press',
'conditions',
];

export function isUrlBlocked(url: string): boolean {
if (allowedUrls.some(allowedUrl => url.includes(allowedUrl))) {
if (allowedKeywords.some(keyword => url.includes(keyword))) {
return false;
}

Expand Down
32 changes: 28 additions & 4 deletions apps/playwright-service/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@
from playwright.async_api import async_playwright, Browser
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from os import environ

# Optional proxy configuration read from the environment; each value is None
# when its variable is unset.
PROXY_SERVER = environ.get('PROXY_SERVER')
PROXY_USERNAME = environ.get('PROXY_USERNAME')
PROXY_PASSWORD = environ.get('PROXY_PASSWORD')
# Media blocking is opt-in: only the value "true" (in any letter case) enables
# it; an unset variable or any other value leaves it disabled.
BLOCK_MEDIA = environ.get('BLOCK_MEDIA', 'False').lower() == 'true'

app = FastAPI()


class UrlModel(BaseModel):
url: str
wait: int = None
Expand All @@ -27,11 +34,28 @@ async def shutdown_event():

@app.post("/html")
async def root(body: UrlModel):
context = await browser.new_context()
context = None
if PROXY_SERVER and PROXY_USERNAME and PROXY_PASSWORD:
context = await browser.new_context(proxy={"server": PROXY_SERVER,
"username": PROXY_USERNAME,
"password": PROXY_PASSWORD})
else:
context = await browser.new_context()

if BLOCK_MEDIA:
await context.route("**/*.{png,jpg,jpeg,gif,svg,mp3,mp4,avi,flac,ogg,wav,webm}",
handler=lambda route, request: route.abort())

page = await context.new_page()
await page.goto(body.url, timeout=15000) # Set max timeout to 15s
if body.wait: # Check if wait parameter is provided in the request body
await page.wait_for_timeout(body.wait) # Convert seconds to milliseconds for playwright
await page.goto(
body.url,
wait_until="load",
timeout=body.timeout if body.timeout else 15000,
)
# Wait != timeout. Wait is the time to wait after the page is loaded - useful in some cases where "load" / "networkidle" is not enough
if body.wait:
await page.wait_for_timeout(body.wait)

page_content = await page.content()
await context.close()
json_compatible_item_data = {"content": page_content}
Expand Down
2 changes: 2 additions & 0 deletions apps/python-sdk/.pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[FORMAT]
max-line-length = 120
Loading

0 comments on commit d39860c

Please sign in to comment.