diff --git a/parsel/selector.py b/parsel/selector.py index f9292a4f..125faa5a 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -6,6 +6,7 @@ import six from lxml import etree, html +from lxml.html import html5parser from .utils import flatten, iflatten, extract_regex from .csstranslator import HTMLTranslator, GenericTranslator @@ -23,6 +24,10 @@ def __init__(self, *args, **kwargs): 'xml': {'_parser': SafeXMLParser, '_csstranslator': GenericTranslator(), '_tostring_method': 'xml'}, + 'html5': {'_parser': html5parser.HTMLParser, + '_csstranslator': HTMLTranslator(), + '_tostring_method': 'html', + }, } @@ -39,8 +44,15 @@ def create_root_node(text, parser_cls, base_url=None): """Create root node for text using given parser class. """ body = text.strip().replace('\x00', '').encode('utf8') or b'' - parser = parser_cls(recover=True, encoding='utf8') - root = etree.fromstring(body, parser=parser, base_url=base_url) + if parser_cls == html5parser.HTMLParser: + try: + parser = parser_cls(namespaceHTMLElements=False) + root = parser.parse(body, useChardet=False, override_encoding='utf8').getroot() + except ValueError: + raise TypeError('HTML5parser does not support control characters') + else: + parser = parser_cls(recover=True, encoding='utf8') + root = etree.fromstring(body, parser=parser, base_url=base_url) if root is None: root = etree.fromstring(b'', parser=parser, base_url=base_url) return root @@ -158,7 +170,7 @@ class Selector(object): ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3 - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). + ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default). If ``type`` is ``None``, the selector defaults to ``"html"``. """ diff --git a/setup.py b/setup.py index 53f6a1c4..b616e03d 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,8 @@ def has_environment_marker_platform_impl_support(): 'w3lib>=1.19.0', 'lxml>=2.3', 'six>=1.5.2', - 'cssselect>=0.9' + 'cssselect>=0.9', + 'html5lib', ] extras_require = {} diff --git a/tests/html_parser.json b/tests/html_parser.json new file mode 100644 index 00000000..10c4e78f --- /dev/null +++ b/tests/html_parser.json @@ -0,0 +1,4 @@ +{ + "html_parser": "html", + "html5_parser": "html5" +} \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 9955decc..88758dbd 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,2 +1,3 @@ pytest pytest-cov +ddt \ No newline at end of file diff --git a/tests/test_selector.py b/tests/test_selector.py index e504166a..9a1b3a9e 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -4,20 +4,23 @@ import six import unittest import pickle +from ddt import ddt, file_data, data from parsel import Selector +@ddt class SelectorTestCase(unittest.TestCase): - sscls = Selector - def test_pickle_selector(self): - sel = self.sscls(text=u'
<html><body><p>some text</p></body></html>
') + @file_data('html_parser.json') + def test_pickle_selector(self, parser): + sel = self.sscls(text=u'
<html><body><p>some text</p></body></html>
', type=parser) self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) - def test_pickle_selector_list(self): - sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>') + @file_data('html_parser.json') + def test_pickle_selector_list(self, parser): + sel = self.sscls(text=u'<html><body><ul><li>1</li><li>2</li><li>3</li></ul></body></html>', type=parser) sel_list = sel.css('li') empty_sel_list = sel.css('p') self.assertIsInstance(sel_list, self.sscls.selectorlist_cls) @@ -25,10 +28,11 @@ def test_pickle_selector_list(self): self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list) - def test_simple_selection(self): + @file_data('html_parser.json') + def test_simple_selection(self, parser): """Simple selector tests""" body = u"
<p><input name='a' value='1'/><input name='b' value='2'/></p>
" - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) xl = sel.xpath('//input') self.assertEqual(2, len(xl)) @@ -40,18 +44,21 @@ def test_simple_selection(self): self.assertEqual([x.extract() for x in sel.xpath("//input[@name='a']/@name")], [u'a']) - self.assertEqual([x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")], - [u'12.0']) + self.assertEqual( + [x.extract() for x in sel.xpath("number(concat(//input[@name='a']/@value, //input[@name='b']/@value))")], + [u'12.0']) self.assertEqual(sel.xpath("concat('xpath', 'rules')").extract(), [u'xpathrules']) - self.assertEqual([x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")], - [u'12']) + self.assertEqual( + [x.extract() for x in sel.xpath("concat(//input[@name='a']/@value, //input[@name='b']/@value)")], + [u'12']) - def test_simple_selection_with_variables(self): + @file_data('html_parser.json') + def test_simple_selection_with_variables(self, parser): """Using XPath variables""" body = u"
<p><input name='a' value='1'/><input name='b' value='2'/></p>
" - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual([x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], [u'a']) @@ -78,11 +85,12 @@ def test_simple_selection_with_variables(self): tag="input", cnt=2, test=True).extract(), [u'1']) - def test_simple_selection_with_variables_escape_friendly(self): + @file_data('html_parser.json') + def test_simple_selection_with_variables_escape_friendly(self, parser): """Using XPath variables with quotes that would need escaping with string formatting""" body = u"""
<p>I'm mixing single and <input name='a' value='I say "Yeah!"'/>
        "double quotes" and I don't care :)</p>
""" - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) t = 'I say "Yeah!"' # naive string formatting with give something like: @@ -98,10 +106,11 @@ def test_simple_selection_with_variables_escape_friendly(self): self.assertRaises(ValueError, sel.xpath, "//p[normalize-space()='{}']//@name".format(lt)) self.assertEqual([x.extract() for x in sel.xpath("//p[normalize-space()=$lng]//@name", - lng=lt)], + lng=lt)], [u'a']) - def test_accessing_attributes(self): + @file_data('html_parser.json') + def test_accessing_attributes(self, parser): body = u""" @@ -113,7 +122,7 @@ def test_accessing_attributes(self): """ - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual({'lang': 'en', 'version': '1.0'}, sel.attrib) self.assertEqual({'id': 'some-list', 'class': 'list-cls'}, sel.css('ul')[0].attrib) @@ -129,9 +138,10 @@ def test_accessing_attributes(self): {'class': 'item-cls', 'id': 'list-item-3'}], [e.attrib for e in sel.css('li')]) - def test_representation_slice(self): + @file_data('html_parser.json') + def test_representation_slice(self, parser): body = u"
<p><input name='{}' value='\xa9'/></p>
".format(50 * 'b') - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) representation = "".format(40 * 'b') if six.PY2: @@ -142,14 +152,15 @@ def test_representation_slice(self): [representation] ) - def test_representation_unicode_query(self): + @file_data('html_parser.json') + def test_representation_unicode_query(self, parser): body = u"
<p><input name='{}' value='\xa9'/></p>
".format(50 * 'b') representation = '' if six.PY2: representation = "" - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual( [repr(it) for it in sel.xpath(u'//input[@value="\xa9"]/@value')], [representation] @@ -159,10 +170,11 @@ def test_check_text_argument_type(self): self.assertRaisesRegexp(TypeError, 'text argument should be of type', self.sscls, b'') - def test_extract_first(self): + @file_data('html_parser.json') + def test_extract_first(self, parser): """Test if extract_first() returns first element""" body = u'
<ul><li id="1">1</li><li id="2">2</li></ul>
' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//ul/li/text()').extract_first(), sel.xpath('//ul/li/text()').extract()[0]) @@ -175,41 +187,46 @@ def test_extract_first(self): self.assertEqual(sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None) - def test_extract_first_default(self): + @file_data('html_parser.json') + def test_extract_first_default(self, parser): """Test if extract_first() returns default value when no results found""" body = u'
<ul><li id="1">1</li><li id="2">2</li></ul>
' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//div/text()').extract_first(default='missing'), 'missing') - def test_selector_get_alias(self): + @file_data('html_parser.json') + def test_selector_get_alias(self, parser): """Test if get() returns extracted value on a Selector""" body = u'
<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>
' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//ul/li[position()>1]')[0].get(), u'
<li id="2">2</li>') self.assertEqual(sel.xpath('//ul/li[position()>1]/text()')[0].get(), u'2') - def test_selector_getall_alias(self): + @file_data('html_parser.json') + def test_selector_getall_alias(self, parser): """Test if get() returns extracted value on a Selector""" body = u'
<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>
    ' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertListEqual(sel.xpath('//ul/li[position()>1]')[0].getall(), [u'
<li id="2">2</li>']) self.assertListEqual(sel.xpath('//ul/li[position()>1]/text()')[0].getall(), [u'2']) - def test_selectorlist_get_alias(self): + @file_data('html_parser.json') + def test_selectorlist_get_alias(self, parser): """Test if get() returns first element for a selection call""" body = u'
<ul><li id="1">1</li><li id="2">2</li><li id="3">3</li></ul>
    ' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//ul/li').get(), u'
<li id="1">1</li>') self.assertEqual(sel.xpath('//ul/li/text()').get(), u'1') - def test_re_first(self): + @file_data('html_parser.json') + def test_re_first(self, parser): """Test if re_first() returns first matched element""" body = u'
<ul><li id="1">1</li><li id="2">2</li></ul>
    ' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//ul/li/text()').re_first(r'\d'), sel.xpath('//ul/li/text()').re(r'\d')[0]) @@ -235,20 +252,23 @@ def test_extract_first_default(self): self.assertEqual(sel.xpath('//div/text()').re_first(r'\w+', default='missing'), 'missing') self.assertEqual(sel.xpath('/ul/li/text()').re_first(r'\w+', default='missing'), 'missing') - def test_select_unicode_query(self): + @file_data('html_parser.json') + def test_select_unicode_query(self, parser): body = u"
<p><input name='\xa9' value='1'/></p>
    " - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath(u'//input[@name="\xa9"]/@value').extract(), [u'1']) - def test_list_elements_type(self): + @file_data('html_parser.json') + def test_list_elements_type(self, parser): """Test Selector returning the same type in selection methods""" text = u'
<p>test<p>
    ' - assert isinstance(self.sscls(text=text).xpath("//p")[0], self.sscls) - assert isinstance(self.sscls(text=text).css("p")[0], self.sscls) + assert isinstance(self.sscls(text=text, type=parser).xpath("//p")[0], self.sscls) + assert isinstance(self.sscls(text=text, type=parser).css("p")[0], self.sscls) - def test_boolean_result(self): + @file_data('html_parser.json') + def test_boolean_result(self, parser): body = u"
<p><input name='a' value='1'/><input name='b' value='2'/></p>
    " - xs = self.sscls(text=body) + xs = self.sscls(text=body, type=parser) self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), [u'1']) self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), [u'0']) @@ -272,9 +292,10 @@ def test_text_or_root_is_required(self): 'Selector needs either text or root argument', self.sscls) - def test_bool(self): + @file_data('html_parser.json') + def test_bool(self, parser): text = u'falsetrue' - hs = self.sscls(text=text, type='html') + hs = self.sscls(text=text, type=parser) falsish = hs.xpath('//a/@href')[0] self.assertEqual(falsish.extract(), u'') self.assertFalse(falsish) @@ -282,16 +303,18 @@ def test_bool(self): self.assertEqual(trueish.extract(), u'nonempty') self.assertTrue(trueish) - def test_slicing(self): + @file_data('html_parser.json') + def test_slicing(self, parser): text = u'
<div><p>1</p><p>2</p><p>3</p></div>
    ' - hs = self.sscls(text=text, type='html') + hs = self.sscls(text=text, type=parser) self.assertIsInstance(hs.css('p')[2], self.sscls) self.assertIsInstance(hs.css('p')[2:3], self.sscls.selectorlist_cls) self.assertIsInstance(hs.css('p')[:2], self.sscls.selectorlist_cls) self.assertEqual(hs.css('p')[2:3].extract(), [u'
<p>3</p>
    ']) self.assertEqual(hs.css('p')[1:3].extract(), [u'
<p>2</p>
    ', u'
<p>3</p>
    ']) - def test_nested_selectors(self): + @file_data('html_parser.json') + def test_nested_selectors(self, parser): """Nested selector tests""" body = u"""
    @@ -306,7 +329,7 @@ def test_nested_selectors(self):
    """ - x = self.sscls(text=body) + x = self.sscls(text=body, type=parser) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").extract(), ["
<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"]) @@ -316,7 +339,8 @@ def test_nested_selectors(self): ["
<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./li").extract(), []) - def test_selectorlist_getall_alias(self): + @file_data('html_parser.json') + def test_selectorlist_getall_alias(self, parser): """Nested selector tests using getall()""" body = u"""
    @@ -331,7 +355,7 @@ def test_selectorlist_getall_alias(self):
    """ - x = self.sscls(text=body) + x = self.sscls(text=body, type=parser) divtwo = x.xpath('//div[@class="two"]') self.assertEqual(divtwo.xpath("//li").getall(), ["
<li>one</li>", "<li>two</li>", "<li>four</li>", "<li>five</li>", "<li>six</li>"]) @@ -341,17 +365,19 @@ def test_selectorlist_getall_alias(self): ["
<li>four</li>", "<li>five</li>", "<li>six</li>"]) self.assertEqual(divtwo.xpath("./li").getall(), []) - def test_mixed_nested_selectors(self): + @file_data('html_parser.json') + def test_mixed_nested_selectors(self, parser): body = u'''
<body>
                    <div id=1>not<span>me</span></div>
                    <div class="dos"><p>text</p><a href='#'>foo</a></div>
               </body>
    ''' - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual(sel.xpath('//div[@id="1"]').css('span::text').extract(), [u'me']) self.assertEqual(sel.css('#1').xpath('./span/text()').extract(), [u'me']) - def test_dont_strip(self): - sel = self.sscls(text=u'
<div>fff: <a href="#">zzz</a></div>
    ') + @file_data('html_parser.json') + def test_dont_strip(self, parser): + sel = self.sscls(text=u'
<div>fff: <a href="#">zzz</a></div>
    ', type=parser) self.assertEqual(sel.xpath("//text()").extract(), [u'fff: ', u'zzz']) def test_namespaces_simple(self): @@ -437,37 +463,38 @@ def test_namespaces_multiple_adhoc(self): # "b" namespace being passed ad-hoc self.assertEqual(x.xpath("//b:Operation/text()", - namespaces={"b": "http://somens.com"}).extract()[0], 'hello') + namespaces={"b": "http://somens.com"}).extract()[0], 'hello') # "b" namespace declaration is not cached self.assertRaises(ValueError, x.xpath, "//xmlns:TestTag/@b:att") # "xmlns" is still defined self.assertEqual(x.xpath("//xmlns:TestTag/@b:att", - namespaces={"b": "http://somens.com"}).extract()[0], 'value') + namespaces={"b": "http://somens.com"}).extract()[0], 'value') # chained selectors still have knowledge of register_namespace() operations self.assertEqual(x.xpath("//p:SecondTestTag", - namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[0].extract(), '90') + namespaces={"p": "http://www.scrapy.org/product"}).xpath("./xmlns:price/text()")[ + 0].extract(), '90') # but chained selector don't know about parent ad-hoc declarations - self.assertRaises(ValueError,x.xpath("//p:SecondTestTag", - namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()") + self.assertRaises(ValueError, x.xpath("//p:SecondTestTag", + namespaces={"p": "http://www.scrapy.org/product"}).xpath, "p:name/text()") # ad-hoc declarations need repeats when chaining self.assertEqual(x.xpath("//p:SecondTestTag", - namespaces={"p": "http://www.scrapy.org/product"} - ).xpath("p:name/text()", - namespaces={"p": "http://www.scrapy.org/product"} - ).extract_first(), 'Dried Rose') + namespaces={"p": "http://www.scrapy.org/product"} + ).xpath("p:name/text()", + namespaces={"p": "http://www.scrapy.org/product"} + ).extract_first(), 'Dried Rose') # declaring several ad-hoc namespaces self.assertEqual(x.xpath("""string( //b:Operation /following-sibling::xmlns:TestTag /following-sibling::*//p:name)""", - namespaces={"b": "http://somens.com", - "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose') + namespaces={"b": "http://somens.com", + "p": "http://www.scrapy.org/product"}).extract_first(), 'Dried Rose') # "p" prefix is not cached from previous calls self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") @@ -481,7 +508,8 @@ def test_make_links_absolute(self): sel.root.make_links_absolute() self.assertEqual(u'http://example.com/file.html', sel.xpath('//a/@href').extract_first()) - def test_re(self): + @file_data('html_parser.json') + def test_re(self, parser): body = u"""
<div>Name: Mary
                    <ul>
                      <li>Name: John</li>
@@ -491,14 +519,14 @@ def test_re(self):
                    </ul>
                    Age: 20
                </div>
    """ - x = self.sscls(text=body) + x = self.sscls(text=body, type=parser) name_re = re.compile(r"Name: (\w+)") self.assertEqual(x.xpath("//ul/li").re(name_re), ["John", "Paul"]) self.assertEqual(x.xpath("//ul/li").re(r"Age: (\d+)"), ["10", "20"]) - + # Test named group, hit and miss x = self.sscls(text=u'foobar') self.assertEqual(x.re('(?Pfoo)'), ['foo']) @@ -508,10 +536,11 @@ def test_re(self): x = self.sscls(text=u'baz') self.assertEqual(x.re('(?Pfoo)|(?Pbaz)'), []) - def test_re_replace_entities(self): + @file_data('html_parser.json') + def test_re_replace_entities(self, parser): body = u"""""" - x = self.sscls(text=body) - + x = self.sscls(text=body, type=parser) + name_re = re.compile('{"foo":(.*)}') # by default, only & and < are preserved ; @@ -534,9 +563,10 @@ def test_re_replace_entities(self): self.assertEqual(x.xpath("//script/text()").re_first(name_re, replace_entities=False), expected) self.assertEqual(x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected) - def test_re_intl(self): + @file_data('html_parser.json') + def test_re_intl(self, parser): body = u'
<div>Evento: cumplea\xf1os</div>
    ' - x = self.sscls(text=body) + x = self.sscls(text=body, type=parser) self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), [u'cumplea\xf1os']) def test_selector_over_text(self): @@ -546,20 +576,23 @@ def test_selector_over_text(self): self.assertEqual(xs.extract(), u'lala') self.assertEqual(xs.xpath('.').extract(), [u'lala']) - def test_invalid_xpath(self): + @file_data('html_parser.json') + def test_invalid_xpath(self, parser): "Test invalid xpath raises ValueError with the invalid xpath" - x = self.sscls(text=u"") + x = self.sscls(text=u"", type=parser) xpath = "//test[@foo='bar]" self.assertRaisesRegexp(ValueError, re.escape(xpath), x.xpath, xpath) - def test_invalid_xpath_unicode(self): + @file_data('html_parser.json') + def test_invalid_xpath_unicode(self, parser): "Test *Unicode* invalid xpath raises ValueError with the invalid xpath" - x = self.sscls(text=u"") + x = self.sscls(text=u"", type=parser) xpath = u"//test[@foo='\u0431ar]" encoded = xpath if six.PY3 else xpath.encode('unicode_escape') self.assertRaisesRegexp(ValueError, re.escape(encoded), x.xpath, xpath) - def test_http_header_encoding_precedence(self): + @file_data('html_parser.json') + def test_http_header_encoding_precedence(self, parser): # u'\xa3' = pound symbol in unicode # u'\xc2\xa3' = pound symbol in utf-8 # u'\xa3' = pound symbol in latin-1 (iso-8859-1) @@ -567,20 +600,22 @@ def test_http_header_encoding_precedence(self): text = u''' \xa3''' - x = self.sscls(text=text) + x = self.sscls(text=text, type=parser) self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), - [u'\xa3']) + [u'\xa3']) - def test_empty_bodies_shouldnt_raise_errors(self): - self.sscls(text=u'').xpath('//text()').extract() + @file_data('html_parser.json') + def test_empty_bodies_shouldnt_raise_errors(self, parser): + self.sscls(text=u'', type=parser).xpath('//text()').extract() def test_bodies_with_comments_only(self): sel = self.sscls(text=u'', base_url='http://example.com') self.assertEqual(u'http://example.com', sel.root.base) - def test_null_bytes_shouldnt_raise_errors(self): + @file_data('html_parser.json') + def test_null_bytes_shouldnt_raise_errors(self, parser): text = u'pre\x00post' - self.sscls(text).xpath('//text()').extract() + self.sscls(text, type=parser).xpath('//text()').extract() def test_replacement_char_from_badly_encoded_body(self): # \xe9 alone isn't valid utf8 sequence @@ -588,8 +623,9 @@ def test_replacement_char_from_badly_encoded_body(self): self.assertEqual([u'an Jos\ufffd de'], self.sscls(text).xpath('//text()').extract()) - def test_select_on_unevaluable_nodes(self): - r = self.sscls(text=u'some text') + @file_data('html_parser.json') + def test_select_on_unevaluable_nodes(self, parser): + r = self.sscls(text=u'some text', type=parser) # Text node x1 = r.xpath('//text()') self.assertEqual(x1.extract(), [u'some text']) @@ -599,8 +635,9 @@ def test_select_on_unevaluable_nodes(self): self.assertEqual(x1.extract(), [u'big']) self.assertEqual(x1.xpath('.//text()').extract(), []) - def test_select_on_text_nodes(self): - r = self.sscls(text=u'
<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>
    ') + @file_data('html_parser.json') + def test_select_on_text_nodes(self, parser): + r = self.sscls(text=u'
<div><b>Options:</b>opt1</div><div><b>Other</b>opt2</div>
    ', type=parser) x1 = r.xpath("//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]") self.assertEqual(x1.extract(), [u'opt1']) @@ -615,12 +652,13 @@ def test_nested_select_on_text_nodes(self): x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x2.extract(), [u'Options:']) - def test_weakref_slots(self): + @file_data('html_parser.json') + def test_weakref_slots(self, parser): """Check that classes are using slots and are weak-referenceable""" - x = self.sscls(text=u'') + x = self.sscls(text=u'', type=parser) weakref.ref(x) assert not hasattr(x, '__dict__'), "%s does not use __slots__" % \ - x.__class__.__name__ + x.__class__.__name__ def test_remove_namespaces(self): xml = u""" @@ -648,7 +686,8 @@ def test_remove_attributes_namespaces(self): sel.remove_namespaces() self.assertEqual(len(sel.xpath("//link/@type")), 2) - def test_smart_strings(self): + @file_data('html_parser.json') + def test_smart_strings(self, parser): """Lxml smart strings return values""" class SmartStringsSelector(Selector): @@ -669,22 +708,22 @@ class SmartStringsSelector(Selector): # .getparent() is available for text nodes and attributes # only when smart_strings are on - x = self.sscls(text=body) + x = self.sscls(text=body, type=parser) li_text = x.xpath('//li/text()') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertFalse(any(map(lambda e: hasattr(e.root, 'getparent'), div_class))) - x = SmartStringsSelector(text=body) + x = SmartStringsSelector(text=body, type=parser) li_text = x.xpath('//li/text()') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), li_text))) div_class = x.xpath('//div/@class') self.assertTrue(all(map(lambda e: hasattr(e.root, 'getparent'), div_class))) def test_xml_entity_expansion(self): - malicious_xml = u''\ - ' ]>&xxe;' + malicious_xml = u'' \ + ' ]>&xxe;' sel = self.sscls(text=malicious_xml, type='xml') @@ -694,29 +733,72 @@ def test_configure_base_url(self): sel = self.sscls(text=u'nothing', base_url='http://example.com') self.assertEqual(u'http://example.com', sel.root.base) - def test_extending_selector(self): + @file_data('html_parser.json') + def test_extending_selector(self, parser): class MySelectorList(Selector.selectorlist_cls): pass class MySelector(Selector): selectorlist_cls = MySelectorList - sel = MySelector(text=u'
<html><div>foo</div></html>
    ') + sel = MySelector(text=u'
<html><div>foo</div></html>
    ', type=parser) self.assertIsInstance(sel.xpath('//div'), MySelectorList) self.assertIsInstance(sel.xpath('//div')[0], MySelector) self.assertIsInstance(sel.css('div'), MySelectorList) self.assertIsInstance(sel.css('div')[0], MySelector) - def test_replacement_null_char_from_body(self): + @data( + ['html', u'
<html><body><p>Grainy</p></body></html>
    '], + ['html5', u'
<html><head></head><body><p>Grainy</p></body></html>']) + def test_replacement_null_char_from_body(self, parser_and_expected): + parser, expected = parser_and_expected text = u'<html>\x00
<body><p>Grainy</p></body></html>
    ' - self.assertEqual(u'
<html><body><p>Grainy</p></body></html>
    ', - self.sscls(text).extract()) - + self.assertEqual(expected, + self.sscls(text, parser).extract()) + + @data('20 < 100', '120 > 100') + def test_characters_gt_and_lt(self, elem): + """HTML5 parser tests: greater and less than symbols work as expected.""" + body = u''' + + +
<div id="distance">{0}</div>
    + + ''' + + sel = self.sscls(text=body.format(elem), type='html5') + lt_res = sel.xpath('//div[@id="distance"]/text()').get() + self.assertEqual(lt_res, elem, ) + + @data(['html', '
    \n
  • two
  • \n
    '], + ['html5', '
']) + def test_complete_tags(self, parser_and_expected): + """HTML5 parser completes missing tags as expected.""" + body = u''' + + +
  • one
  • +
  • two
  • + + ''' + parser, expected = parser_and_expected + sel = self.sscls(text=body, type=parser) + res = sel.xpath('//div').get() + self.assertEqual(res, expected) + + def test_control_characters(self): + """HTML5parser can't parse sequence characters.""" + body = u'

    ' + self.assertRaisesRegexp(TypeError, 'HTML5parser does not support control characters', + self.sscls, body, 'html5') + + +@ddt class ExsltTestCase(unittest.TestCase): - sscls = Selector - def test_regexp(self): + @file_data('html_parser.json') + def test_regexp(self, parser): """EXSLT regular expression tests""" body = u"""
<p><input name='a' value='1'/><input name='b' value='2'/></p>
        <div class="links">
    @@ -726,7 +808,7 @@ def test_regexp(self): EXSLT match example """ - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) # re:test() self.assertEqual( @@ -736,26 +818,25 @@ def test_regexp(self): self.assertEqual( [x.extract() for x in sel.xpath( - r'//a[re:test(@href, "\.html$")]/text()')], + r'//a[re:test(@href, "\.html$")]/text()')], [u'first link', u'second link']) self.assertEqual( [x.extract() for x in sel.xpath( - '//a[re:test(@href, "first")]/text()')], + '//a[re:test(@href, "first")]/text()')], [u'first link']) self.assertEqual( [x.extract() for x in sel.xpath( - '//a[re:test(@href, "second")]/text()')], + '//a[re:test(@href, "second")]/text()')], [u'second link']) - # re:match() is rather special: it returns a node-set of nodes - #[u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', - #u'http', - #u'www.bayes.co.uk', - #u'', - #u'/xml/index.xml?/xml/utils/rechecker.xml'] + # [u'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml', + # u'http', + # u'www.bayes.co.uk', + # u'', + # u'/xml/index.xml?/xml/utils/rechecker.xml'] self.assertEqual( sel.xpath(r're:match(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+):\/\/([^/:]+)(:\d*)?([^# ]*)")/text()').extract(), @@ -771,7 +852,8 @@ def test_regexp(self): r'"(\w+)://(.+)(\.xml)", "","https://\2.html")').extract(), [u'https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html']) - def test_set(self): + @file_data('html_parser.json') + def test_set(self, parser): """EXSLT set manipulation tests""" # microdata example from http://schema.org/Event body = u""" @@ -801,7 +883,7 @@ def test_set(self): """ - sel = self.sscls(text=body) + sel = self.sscls(text=body, type=parser) self.assertEqual( sel.xpath('''//div[@itemtype="http://schema.org/Event"]
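
For reference, a minimal usage sketch of the new "html5" selector type added by this patch (assumes a parsel checkout with this change and html5lib installed; the sample markup and queries are illustrative, not taken from the test suite):

    from parsel import Selector

    broken = u'<div><p>unclosed paragraph<li>item</div>'

    # Existing behaviour: lxml-based parsing (type='html' is the default).
    lxml_sel = Selector(text=broken)
    # New behaviour: html5lib-based parsing with HTML5 error-recovery rules.
    html5_sel = Selector(text=broken, type='html5')

    # Both selectors expose the same XPath/CSS API; only the underlying
    # tree differs, so the extracted markup may not be identical.
    print(lxml_sel.css('li::text').get())
    print(html5_sel.css('li::text').get())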