diff --git a/parsel/selector.py b/parsel/selector.py
index f9292a4f..125faa5a 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -6,6 +6,7 @@
import six
from lxml import etree, html
+from lxml.html import html5parser
from .utils import flatten, iflatten, extract_regex
from .csstranslator import HTMLTranslator, GenericTranslator
@@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs):
'xml': {'_parser': SafeXMLParser,
'_csstranslator': GenericTranslator(),
'_tostring_method': 'xml'},
+ 'html5': {'_parser': html5parser.HTMLParser,
+ '_csstranslator': HTMLTranslator(),
+ '_tostring_method': 'html',
+ },
}
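
Note: with the new 'html5' entry in the lookup table above, picking the parser is just a matter of passing type='html5' to the Selector constructor. A minimal usage sketch (my own example, assuming this patch is applied and the html5lib package that lxml.html.html5parser depends on is installed):

    from parsel import Selector

    # Sloppy markup: no closing tags, no <tbody>.
    text = u'<table><tr><td>1<td>2'

    # The html5 tree builder fills in the implied structure, so the
    # <tbody> level exists, unlike with the default 'html' type.
    sel = Selector(text=text, type='html5')
    print(sel.css('table tbody tr td::text').getall())   # ['1', '2']
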
@@ -39,8 +44,15 @@ def create_root_node(text, parser_cls, base_url=None):
"""Create root node for text using given parser class.
"""
    body = text.strip().replace('\x00', '').encode('utf8') or b'<html/>'
- parser = parser_cls(recover=True, encoding='utf8')
- root = etree.fromstring(body, parser=parser, base_url=base_url)
+ if parser_cls == html5parser.HTMLParser:
+ try:
+ parser = parser_cls(namespaceHTMLElements=False)
+ root = parser.parse(body, useChardet=False, override_encoding='utf8').getroot()
+ except ValueError:
+ raise TypeError('HTML5parser does not support control characters')
+ else:
+ parser = parser_cls(recover=True, encoding='utf8')
+ root = etree.fromstring(body, parser=parser, base_url=base_url)
if root is None:
        root = etree.fromstring(b'<html/>', parser=parser, base_url=base_url)
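
For reference, this is what the new branch above does when isolated from parsel, using only lxml.html.html5parser (the same calls the patch makes; html5lib must be installed). namespaceHTMLElements=False keeps element names un-namespaced, so existing plain XPath expressions such as //p keep matching. The remaining hunks below update the selector test suite (presumably tests/test_selector.py) so that most tests run against both parser types via ddt.

    from lxml.html import html5parser

    body = u'<p>one<p>two'.encode('utf8')   # unclosed tags on purpose
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    root = parser.parse(body, useChardet=False, override_encoding='utf8').getroot()
    print([p.text for p in root.findall('.//p')])   # ['one', 'two']
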
+ @file_data('html_parser.json')
+ def test_pickle_selector(self, parser):
+        sel = self.sscls(text=u'some text', type=parser)
self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel)
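
The @ddt / @data / @file_data decorators used throughout these tests come from the ddt package (its import hunk is not shown in this excerpt). @file_data('html_parser.json') generates one test per entry of that JSON file; the file itself is not part of this diff, but it presumably holds the two parser names, e.g. ["html", "html5"]. A small self-contained sketch of the mechanism:

    import unittest
    from ddt import ddt, data

    @ddt
    class ParserParamExample(unittest.TestCase):
        # @file_data('html_parser.json') works the same way, except the
        # values come from a JSON file instead of being listed inline.
        @data('html', 'html5')
        def test_runs_once_per_parser(self, parser):
            self.assertIn(parser, ('html', 'html5'))
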
- def test_pickle_selector_list(self):
- sel = self.sscls(text=u'')
+ @file_data('html_parser.json')
+ def test_pickle_selector_list(self, parser):
+ sel = self.sscls(text=u'', type=parser)
sel_list = sel.css('li')
empty_sel_list = sel.css('p')
self.assertIsInstance(sel_list, self.sscls.selectorlist_cls)
@@ -25,10 +28,11 @@ def test_pickle_selector_list(self):
self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list)
self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list)
- def test_simple_selection(self):
+ @file_data('html_parser.json')
+ def test_simple_selection(self, parser):
"""Simple selector tests"""
body = u""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
xl = sel.xpath('//input')
self.assertEqual(2, len(xl))
@@ -40,18 +44,21 @@ def test_simple_selection(self):
self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")],
[u'a'])
- self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
- [u'12.0'])
+ self.assertEqual(
+ [x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")],
+ [u'12.0'])
self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(),
[u'xpathrules'])
- self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
- [u'12'])
+ self.assertEqual(
+ [x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")],
+ [u'12'])
- def test_simple_selection_with_variables(self):
+ @file_data('html_parser.json')
+ def test_simple_selection_with_variables(self, parser):
"""Using XPath variables"""
body = u""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)],
[u'a'])
@@ -78,11 +85,12 @@ def test_simple_selection_with_variables(self):
tag="input", cnt=2, test=True).extract(),
[u'1'])
- def test_simple_selection_with_variables_escape_friendly(self):
+ @file_data('html_parser.json')
+ def test_simple_selection_with_variables_escape_friendly(self, parser):
"""Using XPath variables with quotes that would need escaping with string formatting"""
body = u"""I'm mixing single and
"double quotes" and I don't care :)
"""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
t = 'I say "Yeah!"'
# naive string formatting with give something like:
@@ -98,10 +106,11 @@ def test_simple_selection_with_variables_escape_friendly(self):
self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt))
self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name",
- lng=lt)],
+ lng=lt)],
[u'a'])
- def test_accessing_attributes(self):
+ @file_data('html_parser.json')
+ def test_accessing_attributes(self, parser):
body = u"""
@@ -113,7 +122,7 @@ def test_accessing_attributes(self):
"""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual({'lang': 'en', 'version': '1.0'}, sel.attrib)
self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul')[0].attrib)
@@ -129,9 +138,10 @@ def test_accessing_attributes(self):
{'class': 'item-cls', 'id': 'list-item-3'}],
[e.attrib for e in sel.css('li')])
- def test_representation_slice(self):
+ @file_data('html_parser.json')
+ def test_representation_slice(self, parser):
body = u"".format(50 * 'b')
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
representation = "".format(40 * 'b')
if six.PY2:
@@ -142,14 +152,15 @@ def test_representation_slice(self):
[representation]
)
- def test_representation_unicode_query(self):
+ @file_data('html_parser.json')
+ def test_representation_unicode_query(self, parser):
body = u"".format(50 * 'b')
representation = ''
if six.PY2:
representation = ""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(
[repr(it) for it in sel.xpath(u'//input[@value="\xa9"]/@value')],
[representation]
@@ -159,10 +170,11 @@ def test_check_text_argument_type(self):
self.assertRaisesRegexp(TypeError, 'text argument should be of type',
self.sscls, b'')
- def test_extract_first(self):
+ @file_data('html_parser.json')
+ def test_extract_first(self, parser):
"""Test if extract_first() returns first element"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//ul/li/text()').extract_first(),
sel.xpath('//ul/li/text()').extract()[0])
@@ -175,41 +187,46 @@ def test_extract_first(self):
self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None)
- def test_extract_first_default(self):
+ @file_data('html_parser.json')
+ def test_extract_first_default(self, parser):
"""Test if extract_first() returns default value when no results found"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing')
- def test_selector_get_alias(self):
+ @file_data('html_parser.json')
+ def test_selector_get_alias(self, parser):
"""Test if get() returns extracted value on a Selector"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'2')
self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2')
- def test_selector_getall_alias(self):
+ @file_data('html_parser.json')
+ def test_selector_getall_alias(self, parser):
"""Test if get() returns extracted value on a Selector"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'2'])
self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2'])
- def test_selectorlist_get_alias(self):
+ @file_data('html_parser.json')
+ def test_selectorlist_get_alias(self, parser):
"""Test if get() returns first element for a selection call"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//ul/li').get(), u'1')
self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1')
- def test_re_first(self):
+ @file_data('html_parser.json')
+ def test_re_first(self, parser):
"""Test if re_first() returns first matched element"""
body = u''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//ul/li/text()').re_first(r'\d'),
sel.xpath('//ul/li/text()').re(r'\d')[0])
@@ -235,20 +252,23 @@ def test_extract_first_default(self):
self.assertEqual(sel.xpath('//div/text()').re_first(r'\w+', default='missing'), 'missing')
self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+', default='missing'), 'missing')
- def test_select_unicode_query(self):
+ @file_data('html_parser.json')
+ def test_select_unicode_query(self, parser):
body = u""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1'])
- def test_list_elements_type(self):
+ @file_data('html_parser.json')
+ def test_list_elements_type(self, parser):
"""Test Selector returning the same type in selection methods"""
        text = u'test'
- assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls)
- assert isinstance(self.sscls(text=text).css("p")[0], self.sscls)
+ assert isinstance(self.sscls(text=text, type=parser).xpath("//p")[0], self.sscls)
+ assert isinstance(self.sscls(text=text, type=parser).css("p")[0], self.sscls)
- def test_boolean_result(self):
+ @file_data('html_parser.json')
+ def test_boolean_result(self, parser):
body = u"
"
- xs = self.sscls(text=body)
+ xs = self.sscls(text=body, type=parser)
self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1'])
self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0'])
@@ -272,9 +292,10 @@ def test_text_or_root_is_required(self):
'Selector needs either text or root argument',
self.sscls)
- def test_bool(self):
+ @file_data('html_parser.json')
+ def test_bool(self, parser):
text = u'falsetrue'
- hs = self.sscls(text=text, type='html')
+ hs = self.sscls(text=text, type=parser)
falsish = hs.xpath('//a/@href')[0]
self.assertEqual(falsish.extract(), u'')
self.assertFalse(falsish)
@@ -282,16 +303,18 @@ def test_bool(self):
self.assertEqual(trueish.extract(), u'nonempty')
self.assertTrue(trueish)
- def test_slicing(self):
+ @file_data('html_parser.json')
+ def test_slicing(self, parser):
text = u''
- hs = self.sscls(text=text, type='html')
+ hs = self.sscls(text=text, type=parser)
self.assertIsInstance(hs.css('p')[2], self.sscls)
self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls)
self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls)
        self.assertEqual(hs.css('p')[2:3].extract(), [u'3'])
        self.assertEqual(hs.css('p')[1:3].extract(), [u'2', u'3'])
- def test_nested_selectors(self):
+ @file_data('html_parser.json')
+ def test_nested_selectors(self, parser):
"""Nested selector tests"""
body = u"""
@@ -306,7 +329,7 @@ def test_nested_selectors(self):
"""
- x = self.sscls(text=body)
+ x = self.sscls(text=body, type=parser)
divtwo = x.xpath('//div[@class="two"]')
self.assertEqual(divtwo.xpath("//li").extract(),
["one", "two", "four", "five", "six"])
@@ -316,7 +339,8 @@ def test_nested_selectors(self):
["four", "five", "six"])
self.assertEqual(divtwo.xpath("./li").extract(), [])
- def test_selectorlist_getall_alias(self):
+ @file_data('html_parser.json')
+ def test_selectorlist_getall_alias(self, parser):
"""Nested selector tests using getall()"""
body = u"""
@@ -331,7 +355,7 @@ def test_selectorlist_getall_alias(self):
"""
- x = self.sscls(text=body)
+ x = self.sscls(text=body, type=parser)
divtwo = x.xpath('//div[@class="two"]')
self.assertEqual(divtwo.xpath("//li").getall(),
["one", "two", "four", "five", "six"])
@@ -341,17 +365,19 @@ def test_selectorlist_getall_alias(self):
["four", "five", "six"])
self.assertEqual(divtwo.xpath("./li").getall(), [])
- def test_mixed_nested_selectors(self):
+ @file_data('html_parser.json')
+ def test_mixed_nested_selectors(self, parser):
body = u'''
notme
'''
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me'])
self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me'])
- def test_dont_strip(self):
- sel = self.sscls(text=u'')
+ @file_data('html_parser.json')
+ def test_dont_strip(self, parser):
+ sel = self.sscls(text=u'', type=parser)
self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz'])
def test_namespaces_simple(self):
@@ -437,37 +463,38 @@ def test_namespaces_multiple_adhoc(self):
# "b" namespace being passed ad-hoc
self.assertEqual(x.xpath("//b:Operation/text()",
- namespaces={"b": "http://somens.com"}).extract()[0], 'hello')
+ namespaces={"b": "http://somens.com"}).extract()[0], 'hello')
# "b" namespace declaration is not cached
self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att")
# "xmlns" is still defined
self.assertEqual(x.xpath("//xmlns:TestTag/@b:att",
- namespaces={"b": "http://somens.com"}).extract()[0], 'value')
+ namespaces={"b": "http://somens.com"}).extract()[0], 'value')
# chained selectors still have knowledge of register_namespace() operations
self.assertEqual(x.xpath("//p:SecondTestTag",
- namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[0].extract(), '90')
+ namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[
+ 0].extract(), '90')
# but chained selector don't know about parent ad-hoc declarations
- self.assertRaises(ValueError,x.xpath("//p:SecondTestTag",
- namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()")
+ self.assertRaises(ValueError, x.xpath("//p:SecondTestTag",
+ namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()")
# ad-hoc declarations need repeats when chaining
self.assertEqual(x.xpath("//p:SecondTestTag",
- namespaces={"p": "http://www.scrapy.org/product"}
- ).xpath("p:name/text()",
- namespaces={"p": "http://www.scrapy.org/product"}
- ).extract_first(), 'Dried Rose')
+ namespaces={"p": "http://www.scrapy.org/product"}
+ ).xpath("p:name/text()",
+ namespaces={"p": "http://www.scrapy.org/product"}
+ ).extract_first(), 'Dried Rose')
# declaring several ad-hoc namespaces
self.assertEqual(x.xpath("""string(
//b:Operation
/following-sibling::xmlns:TestTag
/following-sibling::*//p:name)""",
- namespaces={"b": "http://somens.com",
- "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose')
+ namespaces={"b": "http://somens.com",
+ "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose')
# "p" prefix is not cached from previous calls
self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()")
@@ -481,7 +508,8 @@ def test_make_links_absolute(self):
sel.root.make_links_absolute()
self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first())
- def test_re(self):
+ @file_data('html_parser.json')
+ def test_re(self, parser):
body = u"""Name: Mary
- Name: John
@@ -491,14 +519,14 @@ def test_re(self):
Age: 20
"""
- x = self.sscls(text=body)
+ x = self.sscls(text=body, type=parser)
name_re = re.compile(r"Name: (\w+)")
self.assertEqual(x.xpath("//ul/li").re(name_re),
["John", "Paul"])
self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"),
["10", "20"])
-
+
# Test named group, hit and miss
x = self.sscls(text=u'foobar')
        self.assertEqual(x.re('(?P<extract>foo)'), ['foo'])
@@ -508,10 +536,11 @@ def test_re(self):
x = self.sscls(text=u'baz')
        self.assertEqual(x.re('(?P<extract>foo)|(?P<bar>baz)'), [])
- def test_re_replace_entities(self):
+ @file_data('html_parser.json')
+ def test_re_replace_entities(self, parser):
body = u""""""
- x = self.sscls(text=body)
-
+ x = self.sscls(text=body, type=parser)
+
name_re = re.compile('{"foo":(.*)}')
# by default, only & and < are preserved ;
@@ -534,9 +563,10 @@ def test_re_replace_entities(self):
self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected)
self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected)
- def test_re_intl(self):
+ @file_data('html_parser.json')
+ def test_re_intl(self, parser):
        body = u'<div>Evento: cumplea\xf1os</div>'
- x = self.sscls(text=body)
+ x = self.sscls(text=body, type=parser)
self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os'])
def test_selector_over_text(self):
@@ -546,20 +576,23 @@ def test_selector_over_text(self):
self.assertEqual(xs.extract(), u'lala')
self.assertEqual(xs.xpath('.').extract(), [u'lala'])
- def test_invalid_xpath(self):
+ @file_data('html_parser.json')
+ def test_invalid_xpath(self, parser):
"Test invalid xpath raises ValueError with the invalid xpath"
- x = self.sscls(text=u"")
+ x = self.sscls(text=u"", type=parser)
xpath = "//test[@foo='bar]"
self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, xpath)
- def test_invalid_xpath_unicode(self):
+ @file_data('html_parser.json')
+ def test_invalid_xpath_unicode(self, parser):
"Test *Unicode* invalid xpath raises ValueError with the invalid xpath"
- x = self.sscls(text=u"")
+ x = self.sscls(text=u"", type=parser)
xpath = u"//test[@foo='\u0431ar]"
encoded = xpath if six.PY3 else xpath.encode('unicode_escape')
self.assertRaisesRegexp(ValueError, re.escape(encoded), x.xpath, xpath)
- def test_http_header_encoding_precedence(self):
+ @file_data('html_parser.json')
+ def test_http_header_encoding_precedence(self, parser):
# u'\xa3' = pound symbol in unicode
# u'\xc2\xa3' = pound symbol in utf-8
# u'\xa3' = pound symbol in latin-1 (iso-8859-1)
@@ -567,20 +600,22 @@ def test_http_header_encoding_precedence(self):
text = u'''
\xa3'''
- x = self.sscls(text=text)
+ x = self.sscls(text=text, type=parser)
self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(),
- [u'\xa3'])
+ [u'\xa3'])
- def test_empty_bodies_shouldnt_raise_errors(self):
- self.sscls(text=u'').xpath('//text()').extract()
+ @file_data('html_parser.json')
+ def test_empty_bodies_shouldnt_raise_errors(self, parser):
+ self.sscls(text=u'', type=parser).xpath('//text()').extract()
def test_bodies_with_comments_only(self):
sel = self.sscls(text=u'', base_url='http://example.com')
self.assertEqual(u'http://example.com', sel.root.base)
- def test_null_bytes_shouldnt_raise_errors(self):
+ @file_data('html_parser.json')
+ def test_null_bytes_shouldnt_raise_errors(self, parser):
text = u'pre\x00post'
- self.sscls(text).xpath('//text()').extract()
+ self.sscls(text, type=parser).xpath('//text()').extract()
def test_replacement_char_from_badly_encoded_body(self):
# \xe9 alone isn't valid utf8 sequence
@@ -588,8 +623,9 @@ def test_replacement_char_from_badly_encoded_body(self):
self.assertEqual([u'an Jos\ufffd de'],
self.sscls(text).xpath('//text()').extract())
- def test_select_on_unevaluable_nodes(self):
- r = self.sscls(text=u'some text')
+ @file_data('html_parser.json')
+ def test_select_on_unevaluable_nodes(self, parser):
+ r = self.sscls(text=u'some text', type=parser)
# Text node
x1 = r.xpath('//text()')
self.assertEqual(x1.extract(), [u'some text'])
@@ -599,8 +635,9 @@ def test_select_on_unevaluable_nodes(self):
self.assertEqual(x1.extract(), [u'big'])
self.assertEqual(x1.xpath('.//text()').extract(), [])
- def test_select_on_text_nodes(self):
-        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>')
+ @file_data('html_parser.json')
+ def test_select_on_text_nodes(self, parser):
+        r = self.sscls(text=u'<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>', type=parser)
x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]")
self.assertEqual(x1.extract(), [u'opt1'])
@@ -615,12 +652,13 @@ def test_nested_select_on_text_nodes(self):
x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]")
self.assertEqual(x2.extract(), [u'Options:'])
- def test_weakref_slots(self):
+ @file_data('html_parser.json')
+ def test_weakref_slots(self, parser):
"""Check that classes are using slots and are weak-referenceable"""
- x = self.sscls(text=u'')
+ x = self.sscls(text=u'', type=parser)
weakref.ref(x)
assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \
- x.__class__.__name__
+ x.__class__.__name__
def test_remove_namespaces(self):
xml = u"""
@@ -648,7 +686,8 @@ def test_remove_attributes_namespaces(self):
sel.remove_namespaces()
self.assertEqual(len(sel.xpath("//link/@type")), 2)
- def test_smart_strings(self):
+ @file_data('html_parser.json')
+ def test_smart_strings(self, parser):
"""Lxml smart strings return values"""
class SmartStringsSelector(Selector):
@@ -669,22 +708,22 @@ class SmartStringsSelector(Selector):
# .getparent() is available for text nodes and attributes
# only when smart_strings are on
- x = self.sscls(text=body)
+ x = self.sscls(text=body, type=parser)
li_text = x.xpath('//li/text()')
self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text)))
div_class = x.xpath('//div/@class')
self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class)))
- x = SmartStringsSelector(text=body)
+ x = SmartStringsSelector(text=body, type=parser)
li_text = x.xpath('//li/text()')
self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text)))
div_class = x.xpath('//div/@class')
self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class)))
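
The behaviour asserted above comes straight from lxml: XPath string results are "smart strings" that carry .getparent() unless the evaluator is built with smart_strings=False, which is what the SmartStringsSelector subclass toggles (its body is elided in this hunk). A quick standalone illustration (my example, not part of the patch):

    from lxml import etree

    root = etree.fromstring('<div class="one"><li>x</li></div>')
    smart = etree.XPath('//li/text()')(root)[0]                       # _ElementUnicodeResult
    plain = etree.XPath('//li/text()', smart_strings=False)(root)[0]  # plain string
    print(hasattr(smart, 'getparent'), hasattr(plain, 'getparent'))   # True False
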
def test_xml_entity_expansion(self):
- malicious_xml = u''\
- ' ]>&xxe;'
+ malicious_xml = u'' \
+ ' ]>&xxe;'
sel = self.sscls(text=malicious_xml, type='xml')
@@ -694,29 +733,72 @@ def test_configure_base_url(self):
sel = self.sscls(text=u'nothing', base_url='http://example.com')
self.assertEqual(u'http://example.com', sel.root.base)
- def test_extending_selector(self):
+ @file_data('html_parser.json')
+ def test_extending_selector(self, parser):
class MySelectorList(Selector.selectorlist_cls):
pass
class MySelector(Selector):
selectorlist_cls = MySelectorList
-        sel = MySelector(text=u'<div>foo</div>')
+        sel = MySelector(text=u'<div>foo</div>', type=parser)
self.assertIsInstance(sel.xpath('//div'), MySelectorList)
self.assertIsInstance(sel.xpath('//div')[0], MySelector)
self.assertIsInstance(sel.css('div'), MySelectorList)
self.assertIsInstance(sel.css('div')[0], MySelector)
- def test_replacement_null_char_from_body(self):
+ @data(
+        ['html', u'Grainy'],
+        ['html5', u'Grainy'])
+ def test_replacement_null_char_from_body(self, parser_and_expected):
+ parser, expected = parser_and_expected
        text = u'\x00Grainy'
-        self.assertEqual(u'Grainy',
- self.sscls(text).extract())
-
+ self.assertEqual(expected,
+ self.sscls(text, parser).extract())
+
+ @data('20 < 100', '120 > 100')
+ def test_characters_gt_and_lt(self, elem):
+ """HTML5 parser tests: greater and less than symbols work as expected."""
+        body = u'''
+        <html>
+        <body>
+        <div id="distance">{0}</div>
+        </body>
+        </html>
+        '''
+
+        sel = self.sscls(text=body.format(elem), type='html5')
+        lt_res = sel.xpath('//div[@id="distance"]/text()').get()
+        self.assertEqual(lt_res, elem)
+
+    @data(['html', '\n two\n '],
+          ['html5', ''])
+    def test_complete_tags(self, parser_and_expected):
+        """HTML5 parser completes/fills missing tags as expected."""
+        body = u'''
+
+
+        one
+        two
+
+        '''
+ parser, expected = parser_and_expected
+ sel = self.sscls(text=body, type=parser)
+ res = sel.xpath('//div').get()
+ self.assertEqual(res, expected)
+
+ def test_control_characters(self):
+ """HTML5parser can't parse sequence characters."""
+ body = u'
'
+ self.assertRaisesRegexp(TypeError, 'HTML5parser does not support control characters',
+ self.sscls, body, 'html5')
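
The last three tests document html5-only behaviour: literal < and > in text survive, the tree builder completes missing tags, and control characters are rejected. The rejection path is the ValueError-to-TypeError conversion added in create_root_node above; a sketch of how it presumably surfaces to users (the \x1b character here is my own example, not the one from the elided test body):

    from parsel import Selector

    Selector(text=u'<p>fine</p>', type='html5')          # parses normally
    try:
        Selector(text=u'<p>bad \x1b</p>', type='html5')  # \x1b is not XML-compatible
    except TypeError as exc:
        print(exc)   # HTML5parser does not support control characters
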
+
+
+@ddt
class ExsltTestCase(unittest.TestCase):
-
sscls = Selector
- def test_regexp(self):
+ @file_data('html_parser.json')
+ def test_regexp(self, parser):
"""EXSLT regular expression tests"""
body = u"""
@@ -726,7 +808,7 @@ def test_regexp(self):
EXSLT match example
"""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
# re:test()
self.assertEqual(
@@ -736,26 +818,25 @@ def test_regexp(self):
self.assertEqual(
[x.extract()
for x in sel.xpath(
- r'//a[re:test(@href, "\.html$")]/text()')],
+ r'//a[re:test(@href, "\.html$")]/text()')],
[u'first link', u'second link'])
self.assertEqual(
[x.extract()
for x in sel.xpath(
- '//a[re:test(@href, "first")]/text()')],
+ '//a[re:test(@href, "first")]/text()')],
[u'first link'])
self.assertEqual(
[x.extract()
for x in sel.xpath(
- '//a[re:test(@href, "second")]/text()')],
+ '//a[re:test(@href, "second")]/text()')],
[u'second link'])
-
# re:match() is rather special: it returns a node-set of nodes
- #[u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
- #u'http',
- #u'www.bayes.co.uk',
- #u'',
- #u'/xml/index.xml?/xml/utils/rechecker.xml']
+ # [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
+ # u'http',
+ # u'www.bayes.co.uk',
+ # u'',
+ # u'/xml/index.xml?/xml/utils/rechecker.xml']
self.assertEqual(
sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,'
r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(),
@@ -771,7 +852,8 @@ def test_regexp(self):
r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(),
[u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html'])
- def test_set(self):
+ @file_data('html_parser.json')
+ def test_set(self, parser):
"""EXSLT set manipulation tests"""
# microdata example from http://schema.org/Event
body = u"""
@@ -801,7 +883,7 @@ def test_set(self):
"""
- sel = self.sscls(text=body)
+ sel = self.sscls(text=body, type=parser)
self.assertEqual(
sel.xpath('''//div[@itemtype="http://schema.org/Event"]