From 6a9e309d536899512091659e32e456950a0e3661 Mon Sep 17 00:00:00 2001
From: Joaquin
Date: Fri, 11 Jan 2019 17:58:29 -0500
Subject: [PATCH] Add HTML5Parser option and tests
---
parsel/selector.py | 15 ++++++++++++---
setup.py | 3 ++-
tests/test_selector.py | 37 +++++++++++++++++++++++++++++++++++--
3 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/parsel/selector.py b/parsel/selector.py
index f9292a4f..c708ef97 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -6,6 +6,7 @@
import six
from lxml import etree, html
+from lxml.html import html5parser
from .utils import flatten, iflatten, extract_regex
from .csstranslator import HTMLTranslator, GenericTranslator
@@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs):
'xml': {'_parser': SafeXMLParser,
'_csstranslator': GenericTranslator(),
'_tostring_method': 'xml'},
+ 'html5': {'_parser': html5parser.HTMLParser,
+ '_csstranslator': HTMLTranslator(),
+ '_tostring_method': 'html',
+ },
}
@@ -39,8 +44,12 @@ def create_root_node(text, parser_cls, base_url=None):
"""Create root node for text using given parser class.
"""
body = text.strip().replace('\x00', '').encode('utf8') or b''
- parser = parser_cls(recover=True, encoding='utf8')
- root = etree.fromstring(body, parser=parser, base_url=base_url)
+ if parser_cls != html5parser.HTMLParser:
+ parser = parser_cls(recover=True, encoding='utf8')
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
+ else:
+ parser = parser_cls(namespaceHTMLElements=False)
+ root = html5parser.fromstring(body, parser=parser)
if root is None:
root = etree.fromstring(b'
', parser=parser, base_url=base_url)
return root
@@ -158,7 +167,7 @@ class Selector(object):
``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3
- ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
+ ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default).
If ``type`` is ``None``, the selector defaults to ``"html"``.
"""
diff --git a/setup.py b/setup.py
index 53f6a1c4..b616e03d 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support():
'w3lib>=1.19.0',
'lxml>=2.3',
'six>=1.5.2',
- 'cssselect>=0.9'
+ 'cssselect>=0.9',
+ 'html5lib',
]
extras_require = {}
diff --git a/tests/test_selector.py b/tests/test_selector.py
index e504166a..4c381470 100644
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
@@ -498,7 +498,7 @@ def test_re(self):
["John", "Paul"])
self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"),
["10", "20"])
-
+
# Test named group, hit and miss
x = self.sscls(text=u'foobar')
self.assertEqual(x.re('(?Pfoo)'), ['foo'])
@@ -511,7 +511,7 @@ def test_re(self):
def test_re_replace_entities(self):
body = u""""""
x = self.sscls(text=body)
-
+
name_re = re.compile('{"foo":(.*)}')
# by default, only & and < are preserved ;
@@ -712,6 +712,39 @@ def test_replacement_null_char_from_body(self):
self.assertEqual(u'Grainy
',
self.sscls(text).extract())
+ def test_characters_gt_and_lt(self):
+ """HTML5 parser: greater-than and less-than symbols are parsed as expected."""
+ lt_elem = '20 < 100'
+ gt_elem = '120 > 100'
+ body = u'''
+
+
+ {0}
+
+ '''
+
+ sel = self.sscls(text=body.format(lt_elem), type='html5')
+ lt_res = sel.xpath('//div[@id="distance"]/text()').get()
+ self.assertEqual(lt_res, lt_elem, msg='less than(<) parsing does not work as expected')
+
+ sel = self.sscls(text=body.format(gt_elem), type='html5')
+ gt_res = sel.xpath('//div[@id="distance"]/text()').get()
+ self.assertEqual(gt_res, gt_elem, msg='greater than(>) parsing does not work as expected')
+
+ def test_complete_tags(self):
+ """HTML5 parser completes/fills tags as expected."""
+ body = u'''
+
+
+ one
+
two
+
+ '''
+ sel = self.sscls(text=body, type='html5')
+ res = sel.xpath('//div/text()').get()
+ self.assertEqual(res, None)
+
+
class ExsltTestCase(unittest.TestCase):
sscls = Selector