Skip to content

Commit

Permalink
Merge pull request #98 from beyondwords-io/improve-related-siblings
Browse files Browse the repository at this point in the history
Ignore redundant nesting when checking for related siblings
  • Loading branch information
cantino authored Aug 29, 2024
2 parents d3f562c + 71e1a0d commit 5652f91
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 3 deletions.
35 changes: 32 additions & 3 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ class Document
:blacklist => nil,
:whitelist => nil,
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
:likely_siblings => ["p"],
:ignore_redundant_nesting => false
}.freeze

REGEXES = {
Expand Down Expand Up @@ -264,9 +265,20 @@ def get_article(candidates, best_candidate)
sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
output = Nokogiri::XML::Node.new('div', @html)
best_candidate[:elem].parent.children.each do |sibling|

# If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
# find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
# related content detection, but could lead to false positives. Not supported in arc90's readability.
node =
if options[:ignore_redundant_nesting]
closest_node_with_siblings(best_candidate[:elem])
else
best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
end

node.parent.children.each do |sibling|
append = false
append = true if sibling == best_candidate[:elem]
append = true if sibling == node
append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

if downcased_likely_siblings.include?(sibling.name.downcase)
Expand All @@ -291,6 +303,23 @@ def get_article(candidates, best_candidate)
output
end

def closest_node_with_siblings(element)
node = element

until node.node_name == 'body'
siblings = node.parent.children
non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }

if non_empty.size > 1
return node
else
node = node.parent
end
end

node
end

def select_best_candidate(candidates)
sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

Expand Down
29 changes: 29 additions & 0 deletions spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,35 @@
expect(@doc.content).to include("should be included")
expect(@doc.content).not_to include("too short when stripped")
end

it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
@doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
<html>
<head>
<title>title!</title>
</head>
<body>
<div> <!-- This is the closest node of the best candidate that has siblings. -->
<div>
<section>
<p>Paragraph 1</p>
#{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
</section>
</div>
</div>
<section>
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
HTML

expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("should be included")
end
end

describe "the cant_read.html fixture" do
Expand Down

0 comments on commit 5652f91

Please sign in to comment.