-
Notifications
You must be signed in to change notification settings - Fork 148
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add HTML5Parser option #133
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
import six | ||
from lxml import etree, html | ||
from lxml.html import html5parser | ||
|
||
from .utils import flatten, iflatten, extract_regex | ||
from .csstranslator import HTMLTranslator, GenericTranslator | ||
|
@@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs): | |
'xml': {'_parser': SafeXMLParser, | ||
'_csstranslator': GenericTranslator(), | ||
'_tostring_method': 'xml'}, | ||
'html5': {'_parser': html5parser.HTMLParser, | ||
'_csstranslator': HTMLTranslator(), | ||
'_tostring_method': 'html', | ||
}, | ||
} | ||
|
||
|
||
|
@@ -39,8 +44,12 @@ def create_root_node(text, parser_cls, base_url=None): | |
"""Create root node for text using given parser class. | ||
""" | ||
body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>' | ||
parser = parser_cls(recover=True, encoding='utf8') | ||
root = etree.fromstring(body, parser=parser, base_url=base_url) | ||
if parser_cls != html5parser.HTMLParser: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should be legible if we define the option as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hey!, thanks for review!, I also think that, i'll change it. changed |
||
parser = parser_cls(recover=True, encoding='utf8') | ||
root = etree.fromstring(body, parser=parser, base_url=base_url) | ||
else: | ||
parser = parser_cls(namespaceHTMLElements=False) | ||
root = html5parser.fromstring(body, parser=parser) | ||
if root is None: | ||
root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url) | ||
return root | ||
|
@@ -158,7 +167,7 @@ class Selector(object): | |
``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3 | ||
``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). | ||
``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default). | ||
If ``type`` is ``None``, the selector defaults to ``"html"``. | ||
""" | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -498,7 +498,7 @@ def test_re(self): | |
["John", "Paul"]) | ||
self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), | ||
["10", "20"]) | ||
|
||
# Test named group, hit and miss | ||
x = self.sscls(text=u'foobar') | ||
self.assertEqual(x.re('(?P<extract>foo)'), ['foo']) | ||
|
@@ -511,7 +511,7 @@ def test_re(self): | |
def test_re_replace_entities(self): | ||
body = u"""<script>{"foo":"bar & "baz""}</script>""" | ||
x = self.sscls(text=body) | ||
|
||
name_re = re.compile('{"foo":(.*)}') | ||
|
||
# by default, only & and < are preserved ; | ||
|
@@ -712,6 +712,39 @@ def test_replacement_null_char_from_body(self): | |
self.assertEqual(u'<html><body><p>Grainy</p></body></html>', | ||
self.sscls(text).extract()) | ||
|
||
def test_characters_gt_and_lt(self): | ||
"""HTML5 parser tests: greater and less than symbols work as expected.""" | ||
lt_elem = '20 < 100' | ||
gt_elem = '120 > 100' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if we are just defining these two cases, I would prefer if they could be one on its respective test. On the other hand I think we should be adding more cases, and defining them as a list which will also improve the tests below to not repeat code, but iterate that list checking all those cases There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, think that I improved that 👍 |
||
body = u'''<html> | ||
<head></head> | ||
<body> | ||
<div id="distance">{0}</div> | ||
<body> | ||
</html>''' | ||
|
||
sel = self.sscls(text=body.format(lt_elem), type='html5') | ||
lt_res = sel.xpath('//div[@id="distance"]/text()').get() | ||
self.assertEqual(lt_res, lt_elem, msg='less than(<) parsing does not work as expected') | ||
|
||
sel = self.sscls(text=body.format(gt_elem), type='html5') | ||
gt_res = sel.xpath('//div[@id="distance"]/text()').get() | ||
self.assertEqual(gt_res, gt_elem, msg='greater than(>) parsing does not work as expected') | ||
|
||
def test_complete_tags(self): | ||
"""HTML5 parser complete/fill tags as expected.""" | ||
body = u'''<html> | ||
<head></head> | ||
<body> | ||
<li>one<div></li> | ||
<li>two</li> | ||
</body> | ||
</html>''' | ||
sel = self.sscls(text=body, type='html5') | ||
res = sel.xpath('//div/text()').get() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shouldn't the test confirm that it gets the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, you're right the testcase isn't clear. The difference that this test pretend to show is the difference in parsers view. In this particular case:
As you can see,
EDIT: Already changed, thanks for the suggestion, test is much clear now. |
||
self.assertEqual(res, None) | ||
|
||
|
||
class ExsltTestCase(unittest.TestCase): | ||
|
||
sscls = Selector | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think
parser_cls
made sense here when both classes had the same API. Now that we are introducing a new class that does not have the same signature, maybe we should switch to a different approach.For example, instead of a class, we could use a function that returns a valid
root
, and modify_ctgroup
so that_parser
values are such functions.That way, the code here remains independent of the
_parser
value.What do you think?