From f3484d5fc02d06bc93a069f10cfe4b41b20edacf Mon Sep 17 00:00:00 2001 From: Joaquin Date: Fri, 11 Jan 2019 17:56:17 -0500 Subject: [PATCH] Add HTML5Parser option and tests --- parsel/selector.py | 15 ++++++++++++--- setup.py | 3 ++- tests/test_selector.py | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index f9292a4f..c708ef97 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -6,6 +6,7 @@ import six from lxml import etree, html +from lxml.html import html5parser from .utils import flatten, iflatten, extract_regex from .csstranslator import HTMLTranslator, GenericTranslator @@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs): 'xml': {'_parser': SafeXMLParser, '_csstranslator': GenericTranslator(), '_tostring_method': 'xml'}, + 'html5': {'_parser': html5parser.HTMLParser, + '_csstranslator': HTMLTranslator(), + '_tostring_method': 'html', + }, } @@ -39,8 +44,12 @@ def create_root_node(text, parser_cls, base_url=None): """Create root node for text using given parser class. """ body = text.strip().replace('\x00', '').encode('utf8') or b'' - parser = parser_cls(recover=True, encoding='utf8') - root = etree.fromstring(body, parser=parser, base_url=base_url) + if parser_cls != html5parser.HTMLParser: + parser = parser_cls(recover=True, encoding='utf8') + root = etree.fromstring(body, parser=parser, base_url=base_url) + else: + parser = parser_cls(namespaceHTMLElements=False) + root = html5parser.fromstring(body, parser=parser) if root is None: root = etree.fromstring(b'', parser=parser, base_url=base_url) return root @@ -158,7 +167,7 @@ class Selector(object): ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3 - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). + ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default). 
If ``type`` is ``None``, the selector defaults to ``"html"``. """ diff --git a/setup.py b/setup.py index 53f6a1c4..b616e03d 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support(): 'w3lib>=1.19.0', 'lxml>=2.3', 'six>=1.5.2', - 'cssselect>=0.9' + 'cssselect>=0.9', + 'html5lib', ] extras_require = {} diff --git a/tests/test_selector.py b/tests/test_selector.py index e504166a..4c381470 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -498,7 +498,7 @@ def test_re(self): ["John", "Paul"]) self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) - + # Test named group, hit and miss x = self.sscls(text=u'foobar') self.assertEqual(x.re('(?Pfoo)'), ['foo']) @@ -511,7 +511,7 @@ def test_re(self): def test_re_replace_entities(self): body = u"""""" x = self.sscls(text=body) - + name_re = re.compile('{"foo":(.*)}') # by default, only & and < are preserved ; @@ -712,6 +712,39 @@ def test_replacement_null_char_from_body(self): self.assertEqual(u'

<html><body><p>Grainy</p></body></html>

', self.sscls(text).extract()) + def test_characters_gt_and_lt(self): + """HTML5 parser tests: greater and less than symbols work as expected.""" + lt_elem = '20 < 100' + gt_elem = '120 > 100' + body = u''' + + +
<div id="distance">{0}</div>
+ + ''' + + sel = self.sscls(text=body.format(lt_elem), type='html5') + lt_res = sel.xpath('//div[@id="distance"]/text()').get() + self.assertEqual(lt_res, lt_elem, msg='less than(<) parsing does not work as expected') + + sel = self.sscls(text=body.format(gt_elem), type='html5') + gt_res = sel.xpath('//div[@id="distance"]/text()').get() + self.assertEqual(gt_res, gt_elem, msg='greater than(>) parsing does not work as expected') + + def test_complete_tags(self): + """HTML5 parser complete/fill tags as expected.""" + body = u''' + + +
  • one
  • +
  • two
  • + + ''' + sel = self.sscls(text=body, type='html5') + res = sel.xpath('//div/text()').get() + self.assertEqual(res, None) + + class ExsltTestCase(unittest.TestCase): sscls = Selector