Skip to content

Commit

Permalink
Add support for overriding the clean_conditionally decision
Browse files Browse the repository at this point in the history
Allow the user to intervene when Readability would include content that
should be excluded, or exclude content that should be included.
  • Loading branch information
tuzz committed Jul 25, 2024
1 parent d95a60c commit 5c8d9df
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
23 changes: 18 additions & 5 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def initialize(input, options = {})
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
@clean_conditionally = !!@options[:clean_conditionally]
@best_candidate_has_image = true
make_html
handle_exclusions!(@options[:whitelist], @options[:blacklist])
Expand Down Expand Up @@ -475,14 +475,17 @@ def sanitize(node, candidates, options = {})

def clean_conditionally(node, candidates, selector)
return unless @clean_conditionally

node.css(selector).each do |el|
weight = class_weight(el)
content_score = candidates[el] ? candidates[el][:content_score] : 0
name = el.name.downcase
remove = false
message = nil

if weight + content_score < 0
el.remove
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
remove = true
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
elsif el.text.count(",") < 10
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
counts["li"] -= 100
Expand All @@ -495,10 +498,20 @@ def clean_conditionally(node, candidates, selector)

reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
if reason
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
el.remove
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
remove = true
end
end

if options[:clean_conditionally].respond_to?(:call)
context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
end

if remove
debug(message || "Conditionally cleaned by user-specified function.")
el.remove
end
end
end

Expand Down
14 changes: 13 additions & 1 deletion spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -748,14 +748,26 @@
end

describe "clean_conditionally_reason?" do
let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }

it "does not raise error" do
@doc = Readability::Document.new(list_fixture)
expect { @doc.content }.to_not raise_error
end
end

# Exercises the :clean_conditionally option when it is given a callable
# instead of a plain boolean.
describe "clean_conditionally" do
# Minimal page with two sibling <div>s: an unclassed one holding
# "Some content" and one with class 'sidebar' holding "sidebar".
let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }

it "can set a clean_conditionally function to allow overriding the default decision" do
# The callable is handed a context hash and returns the final remove
# decision; this one simply negates the library's own :remove verdict.
clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
# NOTE(review): min_text_length: 0 and retry_length: 1 are presumably there
# so the tiny fixture is not rejected for being too short -- confirm
# against the option handling in Readability::Document.
content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content

# With every decision inverted, the sidebar text is expected to survive
# and the main paragraph is expected to be dropped.
expect(content).to include("sidebar")
expect(content).not_to include('Some content')
end
end

describe "debug" do
it "can set a debug function, e.g. to send output to Rails logger" do
output = []
Expand Down

0 comments on commit 5c8d9df

Please sign in to comment.