Skip to content

Commit

Permalink
Merge pull request #2371 from sgoley/docs/duckdb-localhost
Browse files Browse the repository at this point in the history
duckdb docs (v1.1) - scrape v1
  • Loading branch information
simon04 authored Nov 23, 2024
2 parents c07ccf5 + deedda3 commit d0582c6
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 0 deletions.
4 changes: 4 additions & 0 deletions assets/javascripts/news.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
[
[
"2024-11-23",
"New documentation: <a href=\"/duckdb/\">DuckDB</a>"
],
[
"2024-08-20",
"New documentation: <a href=\"/man/\">Linux man pages</a>"
Expand Down
12 changes: 12 additions & 0 deletions lib/docs/filters/duckdb/attribution.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

module Docs
  class Duckdb
    # Attribution filter for the DuckDB docs. The scraper's base_url points at
    # a local HTTP server, so the link shown to readers is rewritten to the
    # public duckdb.org site.
    class AttributionFilter < Docs::AttributionFilter
      # Origin the pages are scraped from.
      LOCAL_ORIGIN = 'http://localhost:8000'
      # Origin the attribution link should point readers to.
      PUBLIC_ORIGIN = 'https://duckdb.org'

      # Builds the HTML anchor for the page currently being processed.
      #
      # @return [String] an <a> element whose href and text are the public URL
      def attribution_link
        # Use the non-destructive #sub: #sub! returns nil when nothing is
        # replaced, which would render an empty href and empty link text for
        # any URL that does not contain the local origin.
        url = current_url.to_s.sub(LOCAL_ORIGIN, PUBLIC_ORIGIN)
        %(<a href="#{url}" class="_attribution-link">#{url}</a>)
      end
    end
  end
end
40 changes: 40 additions & 0 deletions lib/docs/filters/duckdb/clean_html.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
module Docs
  class Duckdb
    # Reduces a scraped DuckDB documentation page to its main content:
    # promotes the page title to an <h1>, drops navigation chrome, turns
    # highlighted code wrappers into <pre> blocks and strips presentational
    # attributes, empty containers and scripts.
    class CleanHtmlFilter < Filter
      # @return [Nokogiri::XML::Node] the cleaned document, or the untouched
      #   document when no main content wrapper is present
      def call
        # Look the wrapper up BEFORE assigning @doc: overwriting @doc with a
        # nil lookup result would leave the filter without a document to
        # return for pages that lack the wrapper.
        content = at_css('#main_content_wrap', 'main')
        return doc if content.nil?

        @doc = content

        # Move the page title to the top of the content and make it the <h1>.
        # Pages without a .title element are left as they are.
        title = at_css('.title')
        if title
          title = doc.prepend_child(title.remove)
          title.name = 'h1'
        end

        # Remove navigation and header elements
        css('.headerline', '.headlinebar', '.landingmenu', '.search_icon', '#sidebar', '.pagemeta', '.toc_menu', '.section-nav').remove

        # Clean up code blocks: record the language found in the class
        # attribute, flatten the highlighted markup to plain text and turn
        # the wrapper into a <pre>.
        css('div.highlighter-rouge').each do |node|
          node['data-language'] = node['class'][/language-(\w+)/, 1] if node['class']
          node.content = node.content.strip
          node.name = 'pre'
        end

        # Remove unnecessary attributes
        css('div, span, p').each do |node|
          node.remove_attribute('style')
          node.remove_attribute('class')
        end

        # Remove empty elements (those with no text content)
        css('div, span').each do |node|
          node.remove if node.content.strip.empty?
        end

        # Remove script tags
        css('script').remove

        doc
      end
    end
  end
end
45 changes: 45 additions & 0 deletions lib/docs/filters/duckdb/entries.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
module Docs
  class Duckdb
    # Produces the index entries for a DuckDB documentation page: the page's
    # own name and type, plus one entry per h2/h3 heading that has an id.
    class EntriesFilter < Docs::EntriesFilter
      # Entry type keyed by the first directory segment of the page's subpath.
      SECTION_TYPES = {
        'sql' => 'SQL Reference',
        'api' => 'Client APIs',
        'guides' => 'How-to Guides',
        'data' => 'Data Import',
        'operations_manual' => 'Operations Manual',
        'dev' => 'Development',
        'internals' => 'Internals',
        'extensions' => 'Extensions',
        'archive' => 'Archive'
      }.freeze

      # Type used for pages outside every known section.
      FALLBACK_TYPE = 'Documentation'

      # @return [String] text of the first h1 (or .title) element on the page
      def get_name
        heading = at_css('h1', '.title')
        heading.content
      end

      # @return [String] the type for the section the page lives in, or the
      #   fallback type when the subpath does not start with a known section
      def get_type
        section = subpath.to_s[%r{\A([^/]+)/}, 1]
        SECTION_TYPES.fetch(section, FALLBACK_TYPE)
      end

      # @return [Array<Array>] one [name, id, type] triple per h2/h3 heading
      #   carrying an id attribute
      def additional_entries
        css('h2[id]', 'h3[id]').map do |heading|
          # Collapse line breaks and tabs inside the heading into single spaces
          label = heading.content.strip.gsub(/[\r\n\t]/, ' ').squeeze(' ')
          [label, heading['id'], get_type]
        end
      end
    end
  end
end
42 changes: 42 additions & 0 deletions lib/docs/scrapers/duckdb.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
module Docs
  # Scraper definition for the DuckDB documentation. Pages are read from a
  # locally served offline copy of the docs (see the commands below).
  class Duckdb < UrlScraper
    self.name = 'DuckDB'
    self.type = 'duckdb'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://duckdb.org/',
      code: 'https://github.com/duckdb/duckdb'
    }

    # https://duckdb.org/docs/guides/offline-copy.html
    # curl -O https://duckdb.org/duckdb-docs.zip; bsdtar xf duckdb-docs.zip; cd duckdb-docs; python -m http.server
    self.release = '1.1.3'
    self.base_url = 'http://localhost:8000/docs/'

    html_filters.push 'duckdb/entries', 'duckdb/clean_html'
    # The stock attribution filter is swapped for one that links to duckdb.org
    # instead of the local server.
    text_filters.replace 'attribution', 'duckdb/attribution'

    options[:container] = '.documentation'

    options[:skip_patterns] = [
      /installation/,
      /archive/,
      /reference/,
    ]

    # NOTE(review): these entries are prefixed with "docs/" although base_url
    # already ends in "/docs/" -- confirm they match the paths they are meant
    # to skip.
    options[:skip] = %w(
      docs/archive/
      docs/installation/
      docs/api/
    )

    options[:attribution] = <<-HTML
      &copy; Copyright 2018&ndash;2024 Stichting DuckDB Foundation<br>
      Licensed under the MIT License.
    HTML

    # Looks up the newest tag of the duckdb/duckdb GitHub repository.
    #
    # @param opts options forwarded unchanged to get_github_tags
    # @return [String] the first tag's name without a leading "v"
    def get_latest_version(opts)
      tag_name = get_github_tags('duckdb', 'duckdb', opts)[0]['name']
      # Presumably the tags look like "v1.1.3" while self.release is "1.1.3";
      # drop the prefix so both use the same format. Names without the prefix
      # pass through unchanged.
      tag_name.sub(/\Av/, '')
    end
  end
end
Binary file added public/icons/docs/duckdb/16.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added public/icons/docs/duckdb/16@2x.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions public/icons/docs/duckdb/SOURCE
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
https://github.com/duckdb/duckdb/tree/main/logo

0 comments on commit d0582c6

Please sign in to comment.