Make settings.complete_page_selector work correctly again

For now, at cost of parsing HTML twice...
PataphysicalSociety · Apr 17, 2024 · 8ec6dd1 · 8ec6dd1
1 parent a07eadc
commit 8ec6dd1
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 4.10.0 (TBD)
+
+## Bug fixes
+
+* Complete HTML pages work correctly in generator mode again (report by Auguste Baum)
+
 # 4.9.0 (2024-03-19)
 
 ## New features and improvements

diff --git a/src/helpers/html_utils.ml b/src/helpers/html_utils.ml
@@ -12,6 +12,10 @@ include Soupault_common
 let parse_html ?context ?(encoding=Markup.Encoding.utf_8) str =
   Markup.string str |> Markup.parse_html ?context:context ~encoding:encoding |> Markup.signals |> Soup.from_signals
 
+(* An equivalent of [Soup.parse], but encoding-aware. *)
+let parse_html_default ?(encoding=Markup.Encoding.utf_8) str =
+  Markup.string str |> Markup.parse_html ~encoding:encoding |> Markup.signals |> Soup.from_signals
+
 (* Result-aware element selection functions *)
 let wrap_select f selector soup =
   try Ok (f selector soup)

diff --git a/src/soupault.ml b/src/soupault.ml
@@ -202,11 +202,40 @@ let load_html state hooks page_file =
   | None -> Ok page_source
 
 let parse_html ?(fragment=true) settings page_source =
-  (* As of lambdasoup 0.7.2, Soup.parse never fails, only returns empty element trees,
+  (* As of lambdasoup 1.0.0, [Soup.parse] never fails, only returns empty element trees,
      so there's no need to handle errors here.
+
+     First we use the default HTML parsing function (equivalent to [Soup.parse]) to get the element tree
+     without any top-level structure corrections
+     and see if it's intended to be a complete page rather than a fragment
+     that the user may want to insert in a template.
+
+     The problem with using that function for real parsing is that, as of 1.0.0,
+     [Soup.parse] resolves certain ambiguities in a way that interferes with our templating:
+     for example, it may insert a [<body>] tag if it sees tags normally found in [<head>], like [<style>],
+     even though they are not prohibited in [<body>].
+
+     Passing [~context:(`Fragment "body")] to [Markup.parse_html] solves that problem
+     but creates a different problem instead: that strips top-level [<html>] tags
+     from the document if they are present.
+
+     So, for now at least, we parse HTML twice: first to determine if it's complete or partial,
+     then to produce an actual element tree 
    *)
-  let context = if fragment then (`Fragment "body") else `Document in
-  Ok (Html_utils.parse_html ~context:context ~encoding:settings.page_character_encoding  page_source)
+  let _interim_html = Html_utils.parse_html_default ~encoding:settings.page_character_encoding page_source in
+  let html_elem = Soup.select_one "html" _interim_html in
+  match html_elem with
+  | Some _ ->
+    (* If there's the ([<html>]) in the page, it's a complete page.
+       We must not attempt to parse it as a fragment.
+     *)
+    Ok (Html_utils.parse_html ~context:`Document ~encoding:settings.page_character_encoding page_source)
+  | None ->
+    (* If there's no [<html>] element, it's a partial page.
+       We can parse it as a <body> fragment or a complete page, depending on the settings.
+     *)
+    let context = if fragment then (`Fragment "body") else `Document in
+    Ok (Html_utils.parse_html ~context:context ~encoding:settings.page_character_encoding page_source)
 
 (* The built-in HTML rendering function that is used when the "render" hook is not configured. *)
 let render_html_builtin settings soup =