Skip to content

Commit

Permalink
Make settings.complete_page_selector work correctly again
Browse files Browse the repository at this point in the history
For now, at cost of parsing HTML twice...
  • Loading branch information
dmbaturin committed Apr 17, 2024
1 parent a07eadc commit 3ca0981
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 4.10.0 (TBD)

## Bug fixes

* Complete HTML pages work correctly in generator mode again (report by Auguste Baum)

# 4.9.0 (2024-03-19)

## New features and improvements
Expand Down
35 changes: 32 additions & 3 deletions src/soupault.ml
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,40 @@ let load_html state hooks page_file =
| None -> Ok page_source

let parse_html ?(fragment=true) settings page_source =
(* As of lambdasoup 0.7.2, Soup.parse never fails, only returns empty element trees,
(* As of lambdasoup 1.0.0, [Soup.parse] never fails, only returns empty element trees,
so there's no need to handle errors here.
First we use the default HTML parsing function (equivalent to [Soup.parse]) to get the element tree
without any top-level structure corrections
and see if it's intended to be a complete page rather than a fragment
that the user may want to insert in a template.
The problem with using that function for real parsing is that, as of 1.0.0,
[Soup.parse] resolves certain ambiguities in a way that interferes with our templating:
for example, it may insert a [<body>] tag if it sees tags normally found in [<head>], like [<style>],
even though they are not prohibited in [<body>].
Passing [~context:(`Fragment "body")] to [Markup.parse_html] solves that problem
but creates a different problem instead: that strips top-level [<html>] tags
from the document if they are present.
So, for now at least, we parse HTML twice: first to determine if it's complete or partial,
then to produce an actual element tree
*)
let context = if fragment then (`Fragment "body") else `Document in
Ok (Html_utils.parse_html ~context:context ~encoding:settings.page_character_encoding page_source)
let _interim_html = Html_utils.parse_html_default ~encoding:settings.page_character_encoding page_source in
let html_elem = Soup.select_one "html" _interim_html in
match html_elem with
| Some _ ->
(* If there's the ([<html>]) in the page, it's a complete page.
We must not attempt to parse it as a fragment.
*)
Ok (Html_utils.parse_html ~context:`Document ~encoding:settings.page_character_encoding page_source)
| None ->
(* If there's no [<html>] element, it's a partial page.
We can parse it as a <body> fragment or a complete page, depending on the settings.
*)
let context = if fragment then (`Fragment "body") else `Document in
Ok (Html_utils.parse_html ~context:context ~encoding:settings.page_character_encoding page_source)

(* The built-in HTML rendering function that is used when the "render" hook is not configured. *)
let render_html_builtin settings soup =
Expand Down

0 comments on commit 3ca0981

Please sign in to comment.