From 5ab047824d2129b3e2335fded46178b973611125 Mon Sep 17 00:00:00 2001 From: Daniil Baturin Date: Tue, 28 Nov 2023 19:45:11 +0000 Subject: [PATCH] Give the post-index hook ability to tell soupault to completely ignore a page by setting a special variable `ignore_page`. Since it needs data to decide, also give that hook access to the complete index entry rather than just modifiable fields. However, the hook still needs to modify the `index_fields` variable: for compatibility reasons and also because it's a bad idea to let the hook modify the full index entry, since it also contains data used by soupault internally, such as the page file path and URL. --- src/hooks.ml | 17 +++++++++++++---- src/soupault.ml | 46 ++++++++++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/src/hooks.ml b/src/hooks.ml index af6c568..a1e3d94 100644 --- a/src/hooks.ml +++ b/src/hooks.ml @@ -175,7 +175,7 @@ let run_pre_process_hook soupault_state hook_config file_name lua_code page_file It has access to the page element tree and also to extracted index fields and can modify both. 
*) -let run_post_index_hook soupault_state hook_config file_name lua_code env soup fields = +let run_post_index_hook soupault_state hook_config file_name lua_code env soup entry = let assoc_of_json j = (* This function handles values projected from Lua, and Lua doesn't have a distinction between arrays/lists and tables: @@ -198,13 +198,15 @@ let run_post_index_hook soupault_state hook_config file_name lua_code env soup f let lua_str = I.Value.string in let lua_state = I.mk () in let settings = soupault_state.soupault_settings in - let () = + let index_entry_json = Utils.json_of_index_entry entry in + let () = (* Set up the post-index hook environment *) I.register_globals [ "page", Plugin_api.lua_of_soup (Plugin_api.Html.SoupNode soup); "page_url", lua_str.embed env.page_url; "page_file", lua_str.embed env.page_file; - "index_fields", Plugin_api.lua_of_json (`O fields); + "index_entry", Plugin_api.lua_of_json index_entry_json; + "index_fields", Plugin_api.lua_of_json (`O entry.fields); "config", lua_of_toml hook_config; "hook_config", lua_of_toml hook_config; "soupault_config", lua_of_toml soupault_state.soupault_config; @@ -213,18 +215,25 @@ let run_post_index_hook soupault_state hook_config file_name lua_code env soup f "site_dir", lua_str.embed settings.site_dir; "soupault_pass", I.Value.int.embed soupault_state.soupault_pass; "global_data", lua_of_json !(soupault_state.global_data); + "ignore_page", I.Value.bool.embed false; ] lua_state in let (let*) = Result.bind in let () = Logs.info @@ fun m -> m "Running the post-index hook on page %s" env.page_file in let* () = Plugin_api.run_lua lua_state file_name lua_code in let () = soupault_state.global_data := (Plugin_api.extract_global_data lua_state) in + (* XXX: The assumption is that there's no way to completely unset a global + in the Lua interpreter we are using, + so if we added [ignore_page] to globals, retrieving it will never cause errors, + and that projection to a bool will never fail either. 
+ *) + let ignore_page = I.getglobal lua_state (I.Value.string.embed "ignore_page") |> I.Value.bool.project in let index_fields = I.getglobal lua_state (I.Value.string.embed "index_fields") in if not (I.Value.table.is index_fields) then Error "post-index hook has not assigned a table to the index_fields variable" else let* fields = Plugin_api.json_of_lua index_fields in - Ok (assoc_of_json fields) + Ok (ignore_page, (assoc_of_json fields)) (* render hook replaces the normal page rendering process. diff --git a/src/soupault.ml b/src/soupault.ml index 0f285b3..72b10f5 100644 --- a/src/soupault.ml +++ b/src/soupault.ml @@ -434,18 +434,21 @@ let make_page_url settings nav_path orig_path target_dir page_file = let extract_metadata state hooks env html = (* Metadata is only extracted from non-index pages *) let settings = state.soupault_settings in - if not (Autoindex.index_extraction_should_run settings env.page_file) then (Ok None) else + if not (Autoindex.index_extraction_should_run settings env.page_file) then (Ok (false, None)) else let entry = Autoindex.get_entry settings env html in let post_index_hook = Hashtbl.find_opt hooks "post-index" in match post_index_hook with | Some (file_name, source_code, hook_config) -> - if not (Hooks.hook_should_run settings hook_config "post-index" env.page_file) then (Ok (Some entry)) else - (* Let the post-index hook update the fields *) - let* index_fields = - Hooks.run_post_index_hook state hook_config file_name source_code env html entry.fields + if not (Hooks.hook_should_run settings hook_config "post-index" env.page_file) then (Ok (false, (Some entry))) else + (* Let the post-index hook update the fields. + It can also set a special [ignore_page] variable to tell soupault to exclude the page + from indexing and any further processing. 
+ *) + let* (ignore_page, index_fields) = + Hooks.run_post_index_hook state hook_config file_name source_code env html entry in - Ok (Some {entry with fields=index_fields}) - | None -> Ok (Some entry) + Ok (ignore_page, (Some {entry with fields=index_fields})) + | None -> Ok (false, (Some entry)) let run_pre_process_hook state hooks page_file target_dir target_file content = let settings = state.soupault_settings in @@ -531,15 +534,26 @@ let process_page state page_data index index_hash widgets hooks = let before_index, after_index, widget_hash = widgets in let* () = process_widgets state env before_index widget_hash html in (* Index extraction *) - let* index_entry = extract_metadata state hooks env html in - if settings.index_only then Ok (index_entry, new_pages) else - let* () = process_widgets state env after_index widget_hash html in - let* () = mkdir target_dir in - let* html_str = render_html state hooks env html in - let* () = save_html state hooks env html_str in - (* Finally, run the post-save hook. *) - let* () = run_post_save_hook state hooks env in - Ok (index_entry, new_pages) + let* (ignore_page, index_entry) = extract_metadata state hooks env html in + (* If the post-index hook told us to ignore the page, pretend it did not exist: + return None for the index entry and do not save to disk. + *) + if ignore_page then + begin + let () = Logs.info @@ fun m -> m "Ignoring page %s according to post-index hook instructions" page_file in + Ok (None, []) + end + else + begin + if settings.index_only then Ok (index_entry, new_pages) else + let* () = mkdir target_dir in + let* () = process_widgets state env after_index widget_hash html in + let* html_str = render_html state hooks env html in + let* () = save_html state hooks env html_str in + (* Finally, run the post-save hook. 
*) + let* () = run_post_save_hook state hooks env in + Ok (index_entry, new_pages) + end (* Monadic wrapper for process_page that can either return or ignore errors *) let process_page state index index_hash widgets hooks page_data =