Skip to content

Commit

Permalink
feat(scrapers): add Bitcoin Core PR Review Club
Browse files Browse the repository at this point in the history
Implement Bitcoin Core PR Review Club scraper to extract meeting notes
from bitcoincore.reviews. Handles both Bitcoin Core PR review meetings
and other special topics like rc-testing and bitcoin-inquisition.
  • Loading branch information
kouloumos committed Nov 27, 2024
1 parent cd17bf1 commit 0e299e5
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 13 deletions.
10 changes: 10 additions & 0 deletions scraper/models/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ class BitcoinTranscriptDocument(ScrapedDocument):
transcript_source: str = Field(description="Source of the transcript")


class PRReviewClubDocument(ScrapedDocument):
    """Document representing a Bitcoin Core PR Review Club meeting scraped from bitcoincore.reviews."""

    issue: Optional[int] = Field(
        # BUG FIX: the original used `default_factory=None`; `default_factory`
        # expects a callable, so a plain `None` default belongs in `default=`
        # (matching the `host` field below).
        default=None,
        description="Bitcoin Core issue number associated with the meeting",
    )
    host: Optional[str] = Field(
        default=None, description="The person hosting the meeting"
    )


class MetadataDocument(BaseModel):
"""
Represents metadata about a scraping operation for a specific domain.
Expand Down
4 changes: 4 additions & 0 deletions scraper/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from .base import BaseScraper
from .bips import BIPsScraper
from .blips import BLIPsScraper
from .bitcoinops import BitcoinOpsScraper
from .bitcointranscripts import BitcoinTranscriptsScraper
from .pr_review_club import PRReviewClubScraper
from .github import GithubScraper
from .scrapy.scrapy_base import ScrapyScraper
from .scrapy.spider_base import BaseSpider
Expand All @@ -12,8 +14,10 @@
# github
"GithubScraper",
"BIPsScraper",
"BLIPsScraper",
"BitcoinOpsScraper",
"BitcoinTranscriptsScraper",
"PRReviewClubScraper",
# scrapy
"ScrapyScraper",
"BaseSpider",
Expand Down
56 changes: 43 additions & 13 deletions scraper/scrapers/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import re
from git import Repo
from loguru import logger
from typing import List, Dict, Any, Set, Type
from typing import List, Dict, Any, Optional, Set, Type

import yaml

from scraper.models import ScrapedDocument
from scraper.config import settings
from scraper.scrapers.utils import parse_standard_date_formats
from scraper.utils import slugify, strip_emails
from scraper.registry import scraper_registry
from .base import BaseScraper
Expand Down Expand Up @@ -242,20 +243,49 @@ def get_title(self, metadata: Dict[str, Any], body: str) -> str:
# If no title is found, return a default string
return "Untitled"

def get_created_at(self, metadata: Dict[str, Any]) -> Optional[str]:
    """
    Extract and normalize the creation date from document metadata.

    Attempts to parse a date from several common metadata fields, checking
    both the lower-case and title-case variant of each name (e.g. "date"
    and "Date", "created" and "Created").

    Args:
        metadata: Document metadata dictionary.

    Returns:
        Optional[str]: ISO formatted date string (YYYY-MM-DD), or None if
        no valid date was found.
    """
    # Common metadata field names that may carry the document date.
    date_fields = [
        "date",
        "created",
        "created_at",
        "published",
        "published_at",
        "timestamp",
    ]

    for field in date_fields:
        # Also check the title-cased variant (e.g. "Created").
        value = metadata.get(field) or metadata.get(field.title())
        if not value:
            continue

        # datetime is a subclass of date, so this covers both object types.
        if isinstance(value, (datetime, date)):
            return value.strftime("%Y-%m-%d")

        # String dates: delegate parsing to the shared utility.
        if isinstance(value, str):
            parsed_date = parse_standard_date_formats(value)
            if parsed_date:
                # Trim a full ISO timestamp down to the date-only portion.
                return parsed_date.split("T")[0] if "T" in parsed_date else parsed_date

    # No valid date found in any recognized field.
    return None

def get_language(self, metadata: Dict[str, Any]) -> str:
Expand Down
79 changes: 79 additions & 0 deletions scraper/scrapers/pr_review_club.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from typing import Dict, Any, Set, Type
from urllib.parse import urljoin

from scraper.models.documents import PRReviewClubDocument, ScrapedDocument
from scraper.scrapers.github import GithubScraper
from scraper.registry import scraper_registry
from scraper.utils import slugify


@scraper_registry.register("PR-Review-Club")
class PRReviewClubScraper(GithubScraper):
    """
    Scraper for Bitcoin Core PR Review Club meeting notes (bitcoincore.reviews).

    Handles both Bitcoin Core PR review meetings (identified by a `pr` number
    in the post's front matter) and special topics such as rc-testing and
    bitcoin-inquisition (identified by matching the Jekyll filename against
    KNOWN_TOPICS).
    """

    # Predefined topics for meetings that are not Bitcoin Core PR reviews;
    # used to classify posts whose front matter carries no PR number.
    KNOWN_TOPICS: Set[str] = {
        "rc-testing",
        "bitcoin-inquisition",
        "libsecp256k1",
        "minisketch",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Emit PRReviewClubDocument instead of the generic ScrapedDocument.
        self.document_class: Type[ScrapedDocument] = PRReviewClubDocument

    def _extract_title_from_jekyll_filename(self, file_path: str) -> str:
        """
        Extract the slugified title portion from a Jekyll filename
        (YYYY-MM-DD-title.md). Helper for both URL and ID generation.
        """
        file_name = os.path.basename(file_path)
        name_without_extension = os.path.splitext(file_name)[0]
        # Split on the first three hyphens to separate the date components
        # (YYYY, MM, DD) from the title.
        parts = name_without_extension.split("-", 3)
        return slugify(parts[3])

    def get_url(self, file_path: str, metadata: Dict[str, Any]) -> str:
        """Build the meeting's public URL, preferring an explicit permalink."""
        if "permalink" in metadata:
            url_path = metadata["permalink"]
        else:
            url_path = self._extract_title_from_jekyll_filename(file_path)
        return urljoin(str(self.config.domain), url_path)

    def generate_id(self, file_path: str) -> str:
        """Generate a stable document ID of the form `<source-name>-<title>`."""
        title = self._extract_title_from_jekyll_filename(file_path)
        return f"{self.config.name.lower()}-{title}"

    def customize_document(
        self, document_data: Dict[str, Any], file_path: str, metadata: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Customize document data based on metadata and file information.

        For Bitcoin Core PRs, uses the PR number and metadata.
        For other content, identifies topics from the filename.

        Raises:
            ValueError: if the post has no PR number and its filename matches
                no known topic.
        """
        document_data["issue"] = metadata.get("pr")
        # BUG FIX: `host` is declared Optional[str] on PRReviewClubDocument;
        # the original defaulted to [] which would place a list into a string
        # field. Default to None instead.
        document_data["host"] = metadata.get("host")
        document_data["tags"] = metadata.get("components") or []

        # If no PR number exists, this is not a Bitcoin Core PR review
        if not document_data["issue"]:
            # Extract title from filename
            title = self._extract_title_from_jekyll_filename(file_path)

            # Find matching topics
            matching_topics = [topic for topic in self.KNOWN_TOPICS if topic in title]

            if not matching_topics:
                # BUG FIX: the original message concatenated
                # "...Bitcoin Core PR" + "doesn't..." with no separator,
                # yielding "PRdoesn't". Reworded with proper spacing.
                raise ValueError(
                    f"File '{file_path}' is not related to a Bitcoin Core PR "
                    f"and doesn't match any known topics"
                )

            # Add matched topics to tags
            if isinstance(document_data["tags"], list):
                document_data["tags"].extend(matching_topics)
            else:
                document_data["tags"] = list(matching_topics)

        return document_data
5 changes: 5 additions & 0 deletions scraper/sources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ github:
- summarization
- topic_extractor
- vector_embeddings
- name: PR-Review-Club
domain: https://bitcoincore.reviews/
url: https://github.com/bitcoin-core-review-club/website.git
directories:
_posts: post
web:
- name: BitcoinTalk
domain: https://bitcointalk.org
Expand Down

0 comments on commit 0e299e5

Please sign in to comment.