Skip to content

Commit

Permalink
feat(scrapers): add Bitcoin Core PR Review Club
Browse files Browse the repository at this point in the history
Implement Bitcoin Core PR Review Club scraper to extract meeting notes
from bitcoincore.reviews. Handles both Bitcoin Core PR review meetings
and other special topics like rc-testing and bitcoin-inquisition.
  • Loading branch information
kouloumos committed Nov 27, 2024
1 parent cd17bf1 commit 0e299e5
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 13 deletions.
10 changes: 10 additions & 0 deletions scraper/models/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ class BitcoinTranscriptDocument(ScrapedDocument):
transcript_source: str = Field(description="Source of the transcript")


class PRReviewClubDocument(ScrapedDocument):
    """Document representing a Bitcoin Core PR Review Club meeting scraped from bitcoincore.reviews."""

    issue: Optional[int] = Field(
        # BUG FIX: the original used `default_factory=None`; `default_factory`
        # expects a callable, so a plain `None` default belongs in `default=`
        # (matching the `host` field below).
        default=None,
        description="Bitcoin Core issue number associated with the meeting",
    )
    host: Optional[str] = Field(
        default=None, description="The person hosting the meeting"
    )


class MetadataDocument(BaseModel):
"""
Represents metadata about a scraping operation for a specific domain.
Expand Down
4 changes: 4 additions & 0 deletions scraper/scrapers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from .base import BaseScraper
from .bips import BIPsScraper
from .blips import BLIPsScraper
from .bitcoinops import BitcoinOpsScraper
from .bitcointranscripts import BitcoinTranscriptsScraper
from .pr_review_club import PRReviewClubScraper
from .github import GithubScraper
from .scrapy.scrapy_base import ScrapyScraper
from .scrapy.spider_base import BaseSpider
Expand All @@ -12,8 +14,10 @@
# github
"GithubScraper",
"BIPsScraper",
"BLIPsScraper",
"BitcoinOpsScraper",
"BitcoinTranscriptsScraper",
"PRReviewClubScraper",
# scrapy
"ScrapyScraper",
"BaseSpider",
Expand Down
56 changes: 43 additions & 13 deletions scraper/scrapers/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
import re
from git import Repo
from loguru import logger
from typing import List, Dict, Any, Set, Type
from typing import List, Dict, Any, Optional, Set, Type

import yaml

from scraper.models import ScrapedDocument
from scraper.config import settings
from scraper.scrapers.utils import parse_standard_date_formats
from scraper.utils import slugify, strip_emails
from scraper.registry import scraper_registry
from .base import BaseScraper
Expand Down Expand Up @@ -242,20 +243,49 @@ def get_title(self, metadata: Dict[str, Any], body: str) -> str:
# If no title is found, return a default string
return "Untitled"

def get_created_at(self, metadata: Dict[str, Any]) -> Optional[str]:
    """
    Extract and normalize the creation date from document metadata.

    Attempts to parse a date from several common metadata fields, checking
    both the lower-case and title-case variant of each name (e.g. "date"
    and "Date", "created" and "Created").

    Args:
        metadata: Document metadata dictionary.

    Returns:
        Optional[str]: ISO formatted date string (YYYY-MM-DD), or None if
        no valid date was found.
    """
    # Common metadata field names that may carry the document date.
    date_fields = [
        "date",
        "created",
        "created_at",
        "published",
        "published_at",
        "timestamp",
    ]

    for field in date_fields:
        # Also check the title-cased variant (e.g. "Created").
        value = metadata.get(field) or metadata.get(field.title())
        if not value:
            continue

        # datetime is a subclass of date, so this covers both object types.
        if isinstance(value, (datetime, date)):
            return value.strftime("%Y-%m-%d")

        # String dates: delegate parsing to the shared utility.
        if isinstance(value, str):
            parsed_date = parse_standard_date_formats(value)
            if parsed_date:
                # Trim a full ISO timestamp down to the date-only portion.
                return parsed_date.split("T")[0] if "T" in parsed_date else parsed_date

    # No valid date found in any recognized field.
    return None

def get_language(self, metadata: Dict[str, Any]) -> str:
Expand Down
79 changes: 79 additions & 0 deletions scraper/scrapers/pr_review_club.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
from typing import Dict, Any, Set, Type
from urllib.parse import urljoin

from scraper.models.documents import PRReviewClubDocument, ScrapedDocument
from scraper.scrapers.github import GithubScraper
from scraper.registry import scraper_registry
from scraper.utils import slugify


@scraper_registry.register("PR-Review-Club")
class PRReviewClubScraper(GithubScraper):
    """
    Scraper for Bitcoin Core PR Review Club meeting notes (bitcoincore.reviews).

    Handles both Bitcoin Core PR review meetings (identified by a `pr` number
    in the post's front matter) and special topics such as rc-testing and
    bitcoin-inquisition (identified by matching the Jekyll filename against
    KNOWN_TOPICS).
    """

    # Predefined topics for meetings that are not Bitcoin Core PR reviews;
    # used to classify posts whose front matter carries no PR number.
    KNOWN_TOPICS: Set[str] = {
        "rc-testing",
        "bitcoin-inquisition",
        "libsecp256k1",
        "minisketch",
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Emit PRReviewClubDocument instead of the generic ScrapedDocument.
        self.document_class: Type[ScrapedDocument] = PRReviewClubDocument

    def _extract_title_from_jekyll_filename(self, file_path: str) -> str:
        """
        Extract the slugified title portion from a Jekyll filename
        (YYYY-MM-DD-title.md). Helper for both URL and ID generation.
        """
        file_name = os.path.basename(file_path)
        name_without_extension = os.path.splitext(file_name)[0]
        # Split on the first three hyphens to separate the date components
        # (YYYY, MM, DD) from the title.
        parts = name_without_extension.split("-", 3)
        return slugify(parts[3])

    def get_url(self, file_path: str, metadata: Dict[str, Any]) -> str:
        """Build the meeting's public URL, preferring an explicit permalink."""
        if "permalink" in metadata:
            url_path = metadata["permalink"]
        else:
            url_path = self._extract_title_from_jekyll_filename(file_path)
        return urljoin(str(self.config.domain), url_path)

    def generate_id(self, file_path: str) -> str:
        """Generate a stable document ID of the form `<source-name>-<title>`."""
        title = self._extract_title_from_jekyll_filename(file_path)
        return f"{self.config.name.lower()}-{title}"

    def customize_document(
        self, document_data: Dict[str, Any], file_path: str, metadata: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Customize document data based on metadata and file information.

        For Bitcoin Core PRs, uses the PR number and metadata.
        For other content, identifies topics from the filename.

        Raises:
            ValueError: if the post has no PR number and its filename matches
                no known topic.
        """
        document_data["issue"] = metadata.get("pr")
        # BUG FIX: `host` is declared Optional[str] on PRReviewClubDocument;
        # the original defaulted to [] which would place a list into a string
        # field. Default to None instead.
        document_data["host"] = metadata.get("host")
        document_data["tags"] = metadata.get("components") or []

        # If no PR number exists, this is not a Bitcoin Core PR review
        if not document_data["issue"]:
            # Extract title from filename
            title = self._extract_title_from_jekyll_filename(file_path)

            # Find matching topics
            matching_topics = [topic for topic in self.KNOWN_TOPICS if topic in title]

            if not matching_topics:
                # BUG FIX: the original message concatenated
                # "...Bitcoin Core PR" + "doesn't..." with no separator,
                # yielding "PRdoesn't". Reworded with proper spacing.
                raise ValueError(
                    f"File '{file_path}' is not related to a Bitcoin Core PR "
                    f"and doesn't match any known topics"
                )

            # Add matched topics to tags
            if isinstance(document_data["tags"], list):
                document_data["tags"].extend(matching_topics)
            else:
                document_data["tags"] = list(matching_topics)

        return document_data
5 changes: 5 additions & 0 deletions scraper/sources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ github:
- summarization
- topic_extractor
- vector_embeddings
- name: PR-Review-Club
domain: https://bitcoincore.reviews/
url: https://github.com/bitcoin-core-review-club/website.git
directories:
_posts: post
web:
- name: BitcoinTalk
domain: https://bitcointalk.org
Expand Down

0 comments on commit 0e299e5

Please sign in to comment.