From 8ec6dd16a61aa448c0a69f598c2c7416259eebfc Mon Sep 17 00:00:00 2001 From: Daniil Baturin Date: Wed, 17 Apr 2024 22:48:33 +0100 Subject: [PATCH] Make settings.complete_page_selector work correctly again For now, at cost of parsing HTML twice... --- CHANGELOG.md | 6 ++++++ src/helpers/html_utils.ml | 4 ++++ src/soupault.ml | 35 ++++++++++++++++++++++++++++++++--- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd312c3..9137c80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# 4.10.0 (TBD) + +## Bug fixes + +* Complete HTML pages work correctly in generator mode again (report by Auguste Baum) + # 4.9.0 (2024-03-19) ## New features and improvements diff --git a/src/helpers/html_utils.ml b/src/helpers/html_utils.ml index 1c9a14a..adb65af 100644 --- a/src/helpers/html_utils.ml +++ b/src/helpers/html_utils.ml @@ -12,6 +12,10 @@ include Soupault_common let parse_html ?context ?(encoding=Markup.Encoding.utf_8) str = Markup.string str |> Markup.parse_html ?context:context ~encoding:encoding |> Markup.signals |> Soup.from_signals +(* An equivalent of [Soup.parse], but encoding-aware. *) +let parse_html_default ?(encoding=Markup.Encoding.utf_8) str = + Markup.string str |> Markup.parse_html ~encoding:encoding |> Markup.signals |> Soup.from_signals + (* Result-aware element selection functions *) let wrap_select f selector soup = try Ok (f selector soup) diff --git a/src/soupault.ml b/src/soupault.ml index db909a7..3e8ff46 100644 --- a/src/soupault.ml +++ b/src/soupault.ml @@ -202,11 +202,40 @@ let load_html state hooks page_file = | None -> Ok page_source let parse_html ?(fragment=true) settings page_source = - (* As of lambdasoup 0.7.2, Soup.parse never fails, only returns empty element trees, + (* As of lambdasoup 1.0.0, [Soup.parse] never fails, only returns empty element trees, so there's no need to handle errors here. + + First we use the default HTML parsing function (equivalent to [Soup.parse]) to get the element tree + without any top-level structure corrections + and see if it's intended to be a complete page rather than a fragment + that the user may want to insert in a template. + + The problem with using that function for real parsing is that, as of 1.0.0, + [Soup.parse] resolves certain ambiguities in a way that interferes with our templating: + for example, it may insert a [] tag if it sees tags normally found in [], like [