Merge pull request #97 from tuzz/small-improvements

Small improvements to improve debugging and flexibility
cantino · Jul 28, 2024 · d3f562c · d3f562c
2 parents d89d3c8 + 5c8d9df
commit d3f562c
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
 * `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
   removes `<p>` tags that contain only images;
 * `:attributes`: whitelist of allowed attributes;
-* `:debug`: provide debugging output, defaults false;
+* `:debug`: provide debugging output, defaults false; supports setting a Proc;
 * `:encoding`: if the page is of a known encoding, you can specify it; if left
    unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
    to disable guessing, supply `:do_not_guess_encoding => true`;

diff --git a/lib/readability.rb b/lib/readability.rb
@@ -21,7 +21,7 @@ class Document
       :elements_to_score          => ["p", "td", "pre"],
       :likely_siblings            => ["p"]
     }.freeze
-    
+
     REGEXES = {
         :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
         :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -35,7 +35,7 @@ class Document
         :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
         :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
     }
-    
+
     attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
 
     def initialize(input, options = {})
@@ -50,7 +50,7 @@ def initialize(input, options = {})
       @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
       @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
       @weight_classes = @options[:weight_classes]
-      @clean_conditionally = @options[:clean_conditionally]
+      @clean_conditionally = !!@options[:clean_conditionally]
       @best_candidate_has_image = true
       make_html
       handle_exclusions!(@options[:whitelist], @options[:blacklist])
@@ -145,11 +145,11 @@ def images(content=nil, reload=false)
 
       (list_images.empty? and content != @html) ? images(@html, true) : list_images
     end
-    
+
     def images_with_fqdn_uris!(source_uri)
       images_with_fqdn_uris(@html, source_uri)
     end
-    
+
     def images_with_fqdn_uris(document = @html.dup, source_uri)
       uri = URI.parse(source_uri)
       host = uri.host
@@ -161,7 +161,7 @@ def images_with_fqdn_uris(document = @html.dup, source_uri)
       images = []
       document.css("img").each do |elem|
         begin
-          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil 
+          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
           images << elem['src'].to_s
         rescue URI::InvalidURIError => exc
           elem.remove
@@ -271,7 +271,7 @@ def get_article(candidates, best_candidate)
 
         if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
-          node_content = sibling.text
+          node_content = sibling.text.strip
           node_length = node_content.length
 
           append = if node_length > 80 && link_density < 0.25
@@ -372,7 +372,11 @@ def score_node(elem)
     end
 
     def debug(str)
-      puts str if options[:debug]
+      if options[:debug].respond_to?(:call)
+        options[:debug].call(str)
+      elsif options[:debug]
+        puts str
+      end
     end
 
     def remove_unlikely_candidates!
@@ -426,7 +430,8 @@ def sanitize(node, candidates, options = {})
 
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
-      all_whitelisted = base_whitelist.include?("*")
+      all_tags_whitelisted = base_whitelist.include?("*")
+      all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
 
       # We'll add whitespace instead of block elements,
       # so a<br>b will have a nice space between them
@@ -440,8 +445,8 @@ def sanitize(node, candidates, options = {})
 
       ([node] + node.css("*")).each do |el|
         # If element is in whitelist, delete all its attributes
-        if all_whitelisted || whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+        if all_tags_whitelisted || whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
 
           # Otherwise, replace the element with its contents
         else
@@ -470,30 +475,43 @@ def sanitize(node, candidates, options = {})
 
     def clean_conditionally(node, candidates, selector)
       return unless @clean_conditionally
+
       node.css(selector).each do |el|
         weight = class_weight(el)
         content_score = candidates[el] ? candidates[el][:content_score] : 0
         name = el.name.downcase
-
+        remove = false
+        message = nil
+
         if weight + content_score < 0
-          el.remove
-          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+          remove = true
+          message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
         elsif el.text.count(",") < 10
           counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
           counts["li"] -= 100
 
           # For every img under a noscript tag discount one from the count to avoid double counting
           counts["img"] -= el.css("noscript").css("img").length
-                
+
           content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
           link_density = get_link_density(el)
 
           reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
           if reason
-            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
-            el.remove
+            message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
+            remove = true
           end
         end
+
+        if options[:clean_conditionally].respond_to?(:call)
+          context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
+          remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
+        end
+
+        if remove
+          debug(message || "Conditionally cleaned by user-specified function.")
+          el.remove
+        end
       end
     end
 

diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
@@ -115,6 +115,11 @@
       expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
     end
 
+    it "should be able to whitelist all attributes" do
+      @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
+      expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
+    end
+
     it "should not try to download local images" do
       @doc = Readability::Document.new(<<-HTML)
         <html>
@@ -498,6 +503,9 @@
               <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
               <p>The likely_siblings now include the section tag so it should be included in the output.</p>
             </section>
+            <section>
+              <p>too short when stripped                                                                                  </p>
+            </section>
             #{'<a href="/">This link lowers the body score.</a>' * 5}
           </body>
         </html>
@@ -506,6 +514,7 @@
       expect(@doc.content).to include("Paragraph 1")
       expect(@doc.content).to include("Paragraph 2")
       expect(@doc.content).to include("should be included")
+      expect(@doc.content).not_to include("too short when stripped")
     end
   end
 
@@ -739,11 +748,33 @@
   end
 
   describe "clean_conditionally_reason?" do
-    let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
+    let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
 
     it "does not raise error" do
       @doc = Readability::Document.new(list_fixture)
       expect { @doc.content }.to_not raise_error
     end
   end
+
+  describe "clean_conditionally" do
+    let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
+
+    it "can set a clean_conditionally function to allow overriding the default decision" do
+      clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
+      content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
+
+      expect(content).to include("sidebar")
+      expect(content).not_to include('Some content')
+    end
+  end
+
+  describe "debug" do
+    it "can set a debug function, e.g. to send output to Rails logger" do
+      output = []
+      debug_fn = lambda { |str| output << str }
+
+      Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
+      expect(output).not_to be_empty
+    end
+  end
 end