Skip to content

Commit

Permalink
Add settings.page_character_encoding option
Browse files Browse the repository at this point in the history
  • Loading branch information
dmbaturin committed Sep 8, 2023
1 parent 6ba7739 commit 85d5032
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 16 deletions.
7 changes: 7 additions & 0 deletions src/config.ml
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,7 @@ let valid_settings = [
"complete_page_selector"; "generator_mode"; "process_pages_first";
"plugin_dirs"; "plugin_discovery";
"force"; "caching"; "cache_dir";
"page_character_encoding";
"pretty_print_html";
"soupault_version";
]
Expand Down Expand Up @@ -463,6 +464,12 @@ let _update_settings settings config =
cache_dir = find_string_or ~default:settings.cache_dir st ["cache_dir"];

force = find_bool_or ~default:settings.force st ["force"];

(* The option is read as a string (defaulting to "utf-8"), converted with
   Utils.encoding_of_string, and the conversion result is unwrapped by of_result
   with the message prefix below.
   NOTE(review): unlike the neighbouring fields, the default here is a literal
   rather than the value already in [settings] — confirm that is intended. *)
page_character_encoding =
  find_string_or ~default:"utf-8" st ["page_character_encoding"] |>
  Utils.encoding_of_string |>
  of_result "Incorrect value for page_character_encoding:";

pretty_print_html = find_bool_or ~default:settings.pretty_print_html st ["pretty_print_html"];

soupault_version = OH.find_string_opt st ["soupault_version"];
Expand Down
5 changes: 5 additions & 0 deletions src/defaults.ml
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ type settings = {

force : bool;

page_character_encoding : Markup.Encoding.t;

pretty_print_html : bool;

soupault_version : string option;
Expand Down Expand Up @@ -333,6 +335,9 @@ let default_settings = {
cache_dir = ".soupault-cache";

force = false;

page_character_encoding = Markup.Encoding.utf_8;

pretty_print_html = true;

soupault_version = None;
Expand Down
5 changes: 2 additions & 3 deletions src/helpers/html_utils.ml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ include Soupault_common
lambdasoup doesn't provide a context-aware parsing function, so we turn to lower level
Markup.ml for that.
*)
(* Parses an HTML string as a fragment using Markup.ml and converts the signal
   stream to a lambdasoup tree. The fragment context is "body" when [body] is
   true (the default) and "head" otherwise. *)
let parse_html ?(body=true) str =
  let fragment_parent = if body then "body" else "head" in
  let signals =
    Markup.string str
    |> Markup.parse_html ~context:(`Fragment fragment_parent)
    |> Markup.signals
  in
  Soup.from_signals signals
(* Parses an HTML string using Markup.ml and converts the signal stream
   to a lambdasoup tree.
   [context] is forwarded to Markup.parse_html unchanged when supplied;
   [encoding] is forwarded as well and defaults to Markup.Encoding.utf_8. *)
let parse_html ?context ?(encoding=Markup.Encoding.utf_8) str =
  let signals =
    Markup.string str
    |> Markup.parse_html ?context ~encoding
    |> Markup.signals
  in
  Soup.from_signals signals

(* Result-aware element selection functions *)
let wrap_select f selector soup =
Expand Down
21 changes: 20 additions & 1 deletion src/helpers/utils.ml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ let format_date fmt date =
| None -> soupault_error (Printf.sprintf {|Date format "%s" is invalid|} fmt)
| Some printer -> ODate.Unix.To.string printer date


(* TOML/JSON convertors *)

let string_of_float f =
Expand Down Expand Up @@ -236,3 +235,23 @@ let deprecation_warning f opt msg config =
match value with
| None -> ()
| Some _ -> Logs.warn @@ fun m -> m "Deprecated option %s: %s" opt msg

(* Looks up Markup's encoding value for a textual encoding name.
   The name is lowercased before the lookup, so matching is case-insensitive
   for ASCII letters. Returns [Ok encoding] for a known name and
   [Error message] otherwise; the message quotes the lowercased name. *)
let encoding_of_string name =
  let module E = Markup.Encoding in
  let normalized = String.lowercase_ascii name in
  (* Markup has UTF-32 support as well, and I'm happy to add it,
     if anyone ever asks for it.
   *)
  let known_encodings = [
    "ascii", E.us_ascii;
    "iso-8859-1", E.iso_8859_1;
    "windows-1251", E.windows_1251;
    "windows-1252", E.windows_1252;
    "utf-8", E.utf_8;
    "utf-16", E.utf_16;
    "utf-16le", E.utf_16le;
    "utf-16be", E.utf_16be;
  ]
  in
  match List.assoc_opt normalized known_encodings with
  | Some encoding -> Ok encoding
  | None -> Error (Printf.sprintf "unsupported character encoding %s" normalized)

6 changes: 3 additions & 3 deletions src/soupault.ml
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,11 @@ let load_html state hooks page_file =
else Ok page_source
| None -> Ok page_source

let parse_html page_source =
let parse_html settings page_source =
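(* As of lambdasoup 0.7.2, Soup.parse never fails, only returns empty element trees,
so there's no need to handle errors here.
NOTE(review): after this change the page is parsed by Html_utils.parse_html rather than
Soup.parse directly; confirm the no-failure assumption still holds for that path.
*)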
Ok (Soup.parse page_source)
Ok (Html_utils.parse_html ~encoding:settings.page_character_encoding page_source)

(* The built-in HTML rendering function that is used when the "render" hook is not configured. *)
let render_html_builtin settings soup =
Expand Down Expand Up @@ -502,7 +502,7 @@ let process_page state page_data index index_hash widgets hooks =
let () = Cache.refresh_page_cache settings page_file content in
Ok content
in
let* content = parse_html page_source in
let* content = parse_html settings page_source in
let page_name = FP.basename page_file |> FP.chop_extension in
let orig_path = nav_path in
let nav_path = fix_nav_path settings nav_path page_name in
Expand Down
21 changes: 12 additions & 9 deletions src/widgets/inclusion_widgets.ml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@ open Widget_utils

let (let*) = Stdlib.Result.bind

let html_of_string ?(parse=true) ?(body_context=true) html_str =
if parse then Html_utils.parse_html ~body:body_context html_str |> Soup.coerce
(* Builds a node from a string.
   With [parse] set to false the string is passed to Soup.create_text as is.
   Otherwise it is parsed by Html_utils.parse_html as a fragment, in "body" context
   when [body_context] is true (the default) or "head" context when it is false,
   using the encoding from [settings.page_character_encoding]. *)
let html_of_string ?(parse=true) ?(body_context=true) settings html_str =
  if not parse then Soup.create_text html_str
  else
    let fragment_parent = if body_context then "body" else "head" in
    Html_utils.parse_html
      ~context:(`Fragment fragment_parent)
      ~encoding:settings.page_character_encoding
      html_str
    |> Soup.coerce

(** Widgets that include external resources into the page *)

(** Inserts an HTML snippet from the [html] config option
into the first element that matches the [selector] *)
let insert_html _ _ config soup =
let insert_html state _ config soup =
let settings = state.soupault_settings in
let valid_options = List.append Config.common_widget_options ["selector"; "html"; "parse"; "action"; "html_context_body"] in
let () = Config.check_options valid_options config {|widget "insert_html"|} in
let selector = get_selectors config in
Expand All @@ -28,13 +30,14 @@ let insert_html _ _ config soup =
let () = no_container_action selector in Ok ()
| Some container ->
let* html_str = Config.find_string_result config ["html"] in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context html_str in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context settings html_str in
Ok (Html_utils.insert_element action container content)
end

(* Reads a file specified in the [file] config option and inserts its content into the first element
that matches the [selector] *)
let include_file _ _ config soup =
let include_file state _ config soup =
let settings = state.soupault_settings in
let valid_options = List.append Config.common_widget_options ["selector"; "file"; "parse"; "action"; "html_context_body"] in
let () = Config.check_options valid_options config {|widget "include"|} in
let selector = get_selectors config in
Expand All @@ -52,7 +55,7 @@ let include_file _ _ config soup =
| Some container ->
let* file = Config.find_string_result config ["file"] in
let* content = Utils.read_file file in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context content in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context settings content in
Ok (Html_utils.insert_element action container content)
end

Expand Down Expand Up @@ -86,7 +89,7 @@ let include_program_output state env config soup =
let env_array = make_program_env env in
let* cmd = Config.find_string_result config ["command"] in
let* content = Process_utils.get_program_output ~env:env_array ~debug:settings.debug cmd in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context content in
let content = html_of_string ~parse:parse_content ~body_context:html_body_context settings content in
Ok (Html_utils.insert_element action container content)
end

Expand All @@ -108,7 +111,7 @@ let preprocess_element state env config soup =
match cached_result with
| Some output ->
let () = Logs.info @@ fun m -> m {|The result of executing command "%s" was found in cache|} command in
let content = html_of_string ~parse:parse ~body_context:body_context output in
let content = html_of_string ~parse:parse ~body_context:body_context settings output in
let () = Html_utils.insert_element action node content in
Ok ()
| None ->
Expand All @@ -121,7 +124,7 @@ let preprocess_element state env config soup =
match result with
| Ok output ->
let () = Cache.cache_object settings env.page_file command input output in
let content = html_of_string ~parse:parse ~body_context:body_context output in
let content = html_of_string ~parse:parse ~body_context:body_context settings output in
let () = Html_utils.insert_element action node content in
Ok ()
| (Error _) as e -> e
Expand Down

0 comments on commit 85d5032

Please sign in to comment.