Skip to content

Commit

Permalink
Merge pull request #97 from tuzz/small-improvements
Browse files Browse the repository at this point in the history
Small improvements to improve debugging and flexibility
  • Loading branch information
cantino authored Jul 28, 2024
2 parents d89d3c8 + 5c8d9df commit d3f562c
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
* `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
removes `<p>` tags that contain only images;
* `:attributes`: whitelist of allowed attributes;
* `:debug`: provide debugging output, defaults false;
* `:debug`: provide debugging output, defaults false; supports setting a Proc;
* `:encoding`: if the page is of a known encoding, you can specify it; if left
unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
to disable guessing, supply `:do_not_guess_encoding => true`;
Expand Down
52 changes: 35 additions & 17 deletions lib/readability.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Document
:elements_to_score => ["p", "td", "pre"],
:likely_siblings => ["p"]
}.freeze

REGEXES = {
:unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
:okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
Expand All @@ -35,7 +35,7 @@ class Document
:killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
:videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}

attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image

def initialize(input, options = {})
Expand All @@ -50,7 +50,7 @@ def initialize(input, options = {})
@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
@clean_conditionally = !!@options[:clean_conditionally]
@best_candidate_has_image = true
make_html
handle_exclusions!(@options[:whitelist], @options[:blacklist])
Expand Down Expand Up @@ -145,11 +145,11 @@ def images(content=nil, reload=false)

(list_images.empty? and content != @html) ? images(@html, true) : list_images
end

def images_with_fqdn_uris!(source_uri)
images_with_fqdn_uris(@html, source_uri)
end

def images_with_fqdn_uris(document = @html.dup, source_uri)
uri = URI.parse(source_uri)
host = uri.host
Expand All @@ -161,7 +161,7 @@ def images_with_fqdn_uris(document = @html.dup, source_uri)
images = []
document.css("img").each do |elem|
begin
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
images << elem['src'].to_s
rescue URI::InvalidURIError => exc
elem.remove
Expand Down Expand Up @@ -271,7 +271,7 @@ def get_article(candidates, best_candidate)

if downcased_likely_siblings.include?(sibling.name.downcase)
link_density = get_link_density(sibling)
node_content = sibling.text
node_content = sibling.text.strip
node_length = node_content.length

append = if node_length > 80 && link_density < 0.25
Expand Down Expand Up @@ -372,7 +372,11 @@ def score_node(elem)
end

def debug(str)
puts str if options[:debug]
if options[:debug].respond_to?(:call)
options[:debug].call(str)
elsif options[:debug]
puts str
end
end

def remove_unlikely_candidates!
Expand Down Expand Up @@ -426,7 +430,8 @@ def sanitize(node, candidates, options = {})

# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
all_whitelisted = base_whitelist.include?("*")
all_tags_whitelisted = base_whitelist.include?("*")
all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")

# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
Expand All @@ -440,8 +445,8 @@ def sanitize(node, candidates, options = {})

([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
if all_whitelisted || whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
if all_tags_whitelisted || whitelist[el.node_name]
el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted

# Otherwise, replace the element with its contents
else
Expand Down Expand Up @@ -470,30 +475,43 @@ def sanitize(node, candidates, options = {})

def clean_conditionally(node, candidates, selector)
return unless @clean_conditionally

node.css(selector).each do |el|
weight = class_weight(el)
content_score = candidates[el] ? candidates[el][:content_score] : 0
name = el.name.downcase

remove = false
message = nil

if weight + content_score < 0
el.remove
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
remove = true
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
elsif el.text.count(",") < 10
counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
counts["li"] -= 100

# For every img under a noscript tag discount one from the count to avoid double counting
counts["img"] -= el.css("noscript").css("img").length

content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
link_density = get_link_density(el)

reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
if reason
debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
el.remove
message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
remove = true
end
end

if options[:clean_conditionally].respond_to?(:call)
context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
end

if remove
debug(message || "Conditionally cleaned by user-specified function.")
el.remove
end
end
end

Expand Down
33 changes: 32 additions & 1 deletion spec/readability_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
end

it "should be able to whitelist all attributes" do
@doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
end

it "should not try to download local images" do
@doc = Readability::Document.new(<<-HTML)
<html>
Expand Down Expand Up @@ -498,6 +503,9 @@
<p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
<p>The likely_siblings now include the section tag so it should be included in the output.</p>
</section>
<section>
<p>too short when stripped </p>
</section>
#{'<a href="/">This link lowers the body score.</a>' * 5}
</body>
</html>
Expand All @@ -506,6 +514,7 @@
expect(@doc.content).to include("Paragraph 1")
expect(@doc.content).to include("Paragraph 2")
expect(@doc.content).to include("should be included")
expect(@doc.content).not_to include("too short when stripped")
end
end

Expand Down Expand Up @@ -739,11 +748,33 @@
end

describe "clean_conditionally_reason?" do
let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }

it "does not raise error" do
@doc = Readability::Document.new(list_fixture)
expect { @doc.content }.to_not raise_error
end
end

describe "clean_conditionally" do
let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }

it "can set a clean_conditionally function to allow overriding the default decision" do
clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content

expect(content).to include("sidebar")
expect(content).not_to include('Some content')
end
end

describe "debug" do
it "can set a debug function, e.g. to send output to Rails logger" do
output = []
debug_fn = lambda { |str| output << str }

Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
expect(output).not_to be_empty
end
end
end

0 comments on commit d3f562c

Please sign in to comment.