Skip to content

Commit

Permalink
Add HTML5Parser option and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
joaquingx committed Jan 11, 2019
1 parent 8fc608e commit b6030f6
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 6 deletions.
15 changes: 12 additions & 3 deletions parsel/selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import six
from lxml import etree, html
from lxml.html import html5parser

from .utils import flatten, iflatten, extract_regex
from .csstranslator import HTMLTranslator, GenericTranslator
Expand All @@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs):
'xml': {'_parser': SafeXMLParser,
'_csstranslator': GenericTranslator(),
'_tostring_method': 'xml'},
'html5': {'_parser': html5parser.HTMLParser,
'_csstranslator': HTMLTranslator(),
'_tostring_method': 'html',
},
}


Expand All @@ -39,8 +44,12 @@ def create_root_node(text, parser_cls, base_url=None):
"""Create root node for text using given parser class.
"""
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
parser = parser_cls(recover=True, encoding='utf8')
root = etree.fromstring(body, parser=parser, base_url=base_url)
if parser_cls != html5parser.HTMLParser:
parser = parser_cls(recover=True, encoding='utf8')
root = etree.fromstring(body, parser=parser, base_url=base_url)
else:
parser = parser_cls(namespaceHTMLElements=False)
root = html5parser.fromstring(body, parser=parser)
if root is None:
root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
return root
Expand Down Expand Up @@ -158,7 +167,7 @@ class Selector(object):
``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3
``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default).
If ``type`` is ``None``, the selector defaults to ``"html"``.
"""

Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support():
'w3lib>=1.19.0',
'lxml>=2.3',
'six>=1.5.2',
'cssselect>=0.9'
'cssselect>=0.9',
'html5lib',
]
extras_require = {}

Expand Down
37 changes: 35 additions & 2 deletions tests/test_selector.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ def test_re(self):
["John", "Paul"])
self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"),
["10", "20"])

# Test named group, hit and miss
x = self.sscls(text=u'foobar')
self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
Expand All @@ -511,7 +511,7 @@ def test_re(self):
def test_re_replace_entities(self):
body = u"""<script>{"foo":"bar &amp; &quot;baz&quot;"}</script>"""
x = self.sscls(text=body)

name_re = re.compile('{"foo":(.*)}')

# by default, only &amp; and &lt; are preserved ;
Expand Down Expand Up @@ -712,6 +712,39 @@ def test_replacement_null_char_from_body(self):
self.assertEqual(u'<html><body><p>Grainy</p></body></html>',
self.sscls(text).extract())

def test_characters_gt_and_lt(self):
    """Check that the ``html5`` selector type keeps bare ``<`` and ``>`` in text.

    The same page template is rendered twice, once with a literal
    less-than sign and once with a literal greater-than sign inside the
    ``<div id="distance">`` element.  Each rendering is parsed with
    ``type='html5'`` and the text node of the div must come back
    unchanged.
    """
    # Unicode literals, matching the unicode template they are formatted into.
    lt_elem = u'20 < 100'
    gt_elem = u'120 > 100'
    # ``{0}`` is the slot filled by ``body.format(...)`` below.  The body
    # element is closed properly so the only unusual markup in the fixture
    # is the unescaped comparison character under test.
    body = u'''<html>
<head></head>
<body>
<div id="distance">{0}</div>
</body>
</html>'''

    sel = self.sscls(text=body.format(lt_elem), type='html5')
    lt_res = sel.xpath('//div[@id="distance"]/text()').get()
    self.assertEqual(lt_res, lt_elem, msg='less than(<) parsing does not work as expected')

    sel = self.sscls(text=body.format(gt_elem), type='html5')
    gt_res = sel.xpath('//div[@id="distance"]/text()').get()
    self.assertEqual(gt_res, gt_elem, msg='greater than(>) parsing does not work as expected')

def test_complete_tags(self):
    """Check that an unclosed ``<div>`` parsed as ``html5`` has no text child.

    The fixture opens a ``<div>`` inside the first ``<li>`` and never
    closes it.  After parsing with ``type='html5'``, the query
    ``//div/text()`` must match nothing, i.e. ``get()`` returns ``None``.
    """
    markup = u'''<html>
<head></head>
<body>
<li>one<div></li>
<li>two</li>
</body>
</html>'''
    selector = self.sscls(text=markup, type='html5')
    # ``get()`` yields the first match, or None when the query is empty.
    div_text = selector.xpath('//div/text()').get()
    self.assertEqual(div_text, None)


class ExsltTestCase(unittest.TestCase):

sscls = Selector
Expand Down

0 comments on commit b6030f6

Please sign in to comment.