Any website to RSS (for later processing) #28

Open
fabriziosalmi opened this issue Jun 25, 2024 · 2 comments

Comments

@fabriziosalmi
Owner

Is your feature request related to a problem? Please describe.

Convert any website to RSS (for later processing)

Step 1: Create the Project Structure

  1. Create a new GitHub repository.
  2. Inside the repository, create the following directory structure:
.
├── .github
│   └── workflows
│       └── scrape_and_convert.yml
├── tools
│   ├── scrape_to_rdf.py
│   └── rdf_to_rss.py
└── requirements.txt

Step 2: Define Python Scripts

tools/scrape_to_rdf.py:

import sys
import requests
from bs4 import BeautifulSoup
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, DC

def scrape_to_rdf(url, rdf_output_path):
    # Fetch the page and fail loudly on HTTP errors instead of parsing an error page.
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Selector assumes the target site wraps each post in <div class="article">.
    articles = soup.find_all('div', class_='article')
    data = []

    for article in articles:
        title_tag = article.find('h2')
        link_tag = article.find('a')
        if not title_tag or not link_tag:
            continue  # skip entries without a title or link
        description_tag = article.find('p')
        data.append({
            'title': title_tag.text.strip(),
            'link': link_tag['href'].strip(),
            'description': description_tag.text.strip() if description_tag else ''
        })

    # One foaf:Document per scraped article.
    g = Graph()
    for item in data:
        article_uri = URIRef(item['link'])
        g.add((article_uri, RDF.type, FOAF.Document))
        g.add((article_uri, DC.title, Literal(item['title'])))
        g.add((article_uri, FOAF.homepage, URIRef(item['link'])))
        g.add((article_uri, DC.description, Literal(item['description'])))

    g.serialize(destination=rdf_output_path, format='turtle')

if __name__ == "__main__":
    scrape_to_rdf(sys.argv[1], sys.argv[2])

tools/rdf_to_rss.py:

import sys
from rdflib import Graph
from rdflib.namespace import FOAF, DC
from lxml import etree

def rdf_to_rss(rdf_input_path, rss_output_path):
    g = Graph()
    g.parse(rdf_input_path, format="turtle")

    # One RSS item per dc:title triple; link and description are looked up per subject.
    data = []
    for s, p, o in g:
        if p == DC.title:
            links = [o2 for _, _, o2 in g.triples((s, FOAF.homepage, None))]
            link = links[0] if links else s
            descriptions = [o3 for _, _, o3 in g.triples((s, DC.description, None))]
            description = descriptions[0] if descriptions else ''
            data.append({'title': o, 'link': link, 'description': description})

    # Build the RSS 2.0 document.
    rss = etree.Element("rss", version="2.0")
    channel = etree.SubElement(rss, "channel")

    etree.SubElement(channel, "title").text = "Example RSS Feed"
    etree.SubElement(channel, "link").text = "https://example.com"
    etree.SubElement(channel, "description").text = "This is an example RSS feed generated from RDF data."

    for item in data:
        item_element = etree.SubElement(channel, "item")
        etree.SubElement(item_element, "title").text = str(item['title'])
        etree.SubElement(item_element, "link").text = str(item['link'])
        etree.SubElement(item_element, "description").text = str(item['description'])

    rss_xml = etree.tostring(rss, pretty_print=True, xml_declaration=True, encoding="UTF-8")

    with open(rss_output_path, "wb") as f:
        f.write(rss_xml)

if __name__ == "__main__":
    rdf_to_rss(sys.argv[1], sys.argv[2])
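
As an optional check (not part of the sketch above), the generated file can be opened with feedparser, which is not in requirements.txt and would need to be installed separately:

import feedparser  # optional extra dependency: pip install feedparser

# Path matches the workflow's default rss_output_path.
feed = feedparser.parse("output/rss_feed.xml")
print(feed.feed.title)                 # "Example RSS Feed"
for entry in feed.entries:
    print(entry.title, entry.link)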

Step 3: Define Dependencies

requirements.txt:

rdflib
lxml
requests
beautifulsoup4

Step 4: Create GitHub Action Workflow

.github/workflows/scrape_and_convert.yml:

name: Scrape and Convert

on:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      website_url:
        description: 'The URL of the website to scrape'
        required: true
        default: 'https://example.com'
      rdf_output_path:
        description: 'Path to save the RDF output'
        required: true
        default: 'output/output.rdf'
      rss_output_path:
        description: 'Path to save the RSS output'
        required: true
        default: 'output/rss_feed.xml'

jobs:
  build:
    runs-on: ubuntu-latest

    # Fall back to the defaults when the workflow runs on push, where no inputs are set.
    env:
      WEBSITE_URL: ${{ github.event.inputs.website_url || 'https://example.com' }}
      RDF_OUTPUT_PATH: ${{ github.event.inputs.rdf_output_path || 'output/output.rdf' }}
      RSS_OUTPUT_PATH: ${{ github.event.inputs.rss_output_path || 'output/rss_feed.xml' }}

    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: '3.x'

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt

    - name: Scrape website and generate RDF
      run: |
        mkdir -p "$(dirname "$RDF_OUTPUT_PATH")"
        python tools/scrape_to_rdf.py "$WEBSITE_URL" "$RDF_OUTPUT_PATH"

    - name: Convert RDF to RSS
      run: |
        mkdir -p "$(dirname "$RSS_OUTPUT_PATH")"
        python tools/rdf_to_rss.py "$RDF_OUTPUT_PATH" "$RSS_OUTPUT_PATH"

    - name: Upload RSS feed
      uses: actions/upload-artifact@v4
      with:
        name: rss-feed
        path: ${{ env.RSS_OUTPUT_PATH }}

Summary

  1. Structure: Place your Python scripts in the tools directory.
  2. Scripts: scrape_to_rdf.py scrapes the website and generates RDF data. rdf_to_rss.py converts the RDF data to RSS XML.
  3. Workflow: The GitHub Action workflow is defined in .github/workflows/scrape_and_convert.yml, which triggers on push to the main branch or manual dispatch.

This setup allows you to input the website URL and the paths for RDF and RSS output files when triggering the GitHub Action, which will then perform the scraping, RDF generation, and RSS conversion tasks automatically.
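
For a quick local test without the Action, the two steps can be chained directly. A minimal sketch, assuming it is run from the repository root and that the target page actually uses the <div class="article"> markup the scraper expects:

# Local end-to-end run; the URL and paths mirror the workflow defaults.
import os
from tools.scrape_to_rdf import scrape_to_rdf
from tools.rdf_to_rss import rdf_to_rss

os.makedirs("output", exist_ok=True)   # neither script creates missing directories
scrape_to_rdf("https://example.com", "output/output.rdf")
rdf_to_rss("output/output.rdf", "output/rss_feed.xml")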

@fabriziosalmi
Owner Author

A revised tools/scrape_to_rdf.py with request error handling, logging, and an argparse-based CLI:
import sys
import requests
from bs4 import BeautifulSoup
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF, DC, Namespace
import logging
from argparse import ArgumentParser

def get_web_page(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses
        return response.content
    except requests.RequestException as e:
        logging.error(f"Request failed: {e}")
        sys.exit(1)

def parse_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    articles = soup.find_all('div', class_='article')
    if not articles:
        logging.warning("No articles found on the page.")
        return []

    return [
        {
            'title': article.find('h2').text.strip(),
            'link': article.find('a')['href'].strip(),
            'description': article.find('p').text.strip() if article.find('p') else ''
        } for article in articles if article.find('h2') and article.find('a')
    ]

def build_rdf(data, namespace_uri):
    # Bind common prefixes directly on the graph so the Turtle output uses
    # readable foaf:/dc:/ex: prefixes.
    g = Graph()
    g.bind('foaf', FOAF)
    g.bind('dc', DC)
    g.bind('ex', Namespace(namespace_uri), override=False)

    for item in data:
        article_uri = URIRef(item['link'])
        g.add((article_uri, RDF.type, FOAF.Document))
        g.add((article_uri, DC.title, Literal(item['title'])))
        g.add((article_uri, FOAF.homepage, URIRef(item['link'])))
        g.add((article_uri, DC.description, Literal(item['description'])))
    return g

def scrape_to_rdf(url, rdf_output_path, namespace_uri):
    html_content = get_web_page(url)
    if html_content:
        data = parse_html(html_content)
        if data:
            rdf_graph = build_rdf(data, namespace_uri)
            rdf_graph.serialize(destination=rdf_output_path, format='turtle')
            logging.info(f"RDF data serialized successfully to {rdf_output_path}")
        else:
            logging.info("No data to serialize.")
    else:
        logging.error("Failed to retrieve or parse web page.")

def setup_argparser():
    parser = ArgumentParser(description="Scrape a webpage and save its content as RDF.")
    parser.add_argument("url", help="URL of the webpage to scrape.")
    parser.add_argument("rdf_output_path", help="File path where RDF output will be saved.")
    parser.add_argument("--namespace", default="http://example.org/", help="Namespace URI for RDF data.")
    return parser

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = setup_argparser().parse_args()
    scrape_to_rdf(args.url, args.rdf_output_path, args.namespace)
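
For reference, this is the kind of markup parse_html expects; the <div class="article"> / <h2> / <a> / <p> layout is an assumption about the target site, not something every page provides. With the functions above in scope:

# Illustrative only: shows the markup the selectors in parse_html assume.
sample_html = """
<div class="article">
  <h2>Hello world</h2>
  <a href="https://example.com/hello">Read more</a>
  <p>First post.</p>
</div>
"""

print(parse_html(sample_html))
# [{'title': 'Hello world', 'link': 'https://example.com/hello', 'description': 'First post.'}]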

@fabriziosalmi
Owner Author

A rough first attempt; it will be extended to handle any kind of source with the help of third-party tools.
