diff --git a/parsel/selector.py b/parsel/selector.py
index 45c88d09..d9884698 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -6,6 +6,8 @@ from typing import Any, Dict, List, Optional, Mapping, Pattern, Union
 
 from lxml import etree, html
+from lxml.html import html5parser
+from lxml.html.html5parser import document_fromstring
 
 from .utils import flatten, iflatten, extract_regex, shorten
 from .csstranslator import HTMLTranslator, GenericTranslator
 
@@ -38,6 +40,12 @@ def __init__(self, *args, **kwargs) -> None:
         "_csstranslator": GenericTranslator(),
         "_tostring_method": "xml",
     },
+    # html5lib-backed parser; create_root_node() gives it its own code path.
+    "html5": {
+        "_parser": html5parser.HTMLParser,
+        "_csstranslator": HTMLTranslator(),
+        "_tostring_method": "html",
+    },
 }
 
 
@@ -55,10 +63,34 @@ def create_root_node(text, parser_cls, base_url=None):
     body = text.strip().replace("\x00", "").encode("utf8") or b"<html/>"
+    if parser_cls is html5parser.HTMLParser:
+        # The html5lib-backed parser is not an lxml parser: it takes no
+        # ``recover``/``encoding`` arguments and cannot be handed to
+        # ``etree.fromstring``, so dispatch on the class before building it.
+        try:
+            tree = parser_cls().parse(
+                body, useChardet=False, override_encoding="utf8"
+            )
+        except ValueError as exc:
+            raise TypeError(
+                "HTML5parser does not support control characters"
+            ) from exc
+        return tree.getroot()
     parser = parser_cls(recover=True, encoding="utf8")
     root = etree.fromstring(body, parser=parser, base_url=base_url)
     if root is None:
         root = etree.fromstring(b"<html/>", parser=parser, base_url=base_url)
     return root
 
 
+def selector_for_html5(response):
+    """Return a :class:`Selector` for ``response.text`` parsed as HTML5.
+
+    The document is parsed with ``lxml.html.html5parser`` and the
+    resulting tree is passed to the selector as ``root``; ``text`` is
+    left unset because a ready-made root is supplied instead.
+    """
+    root = document_fromstring(response.text)
+    return Selector(type="html", root=root)
+
+
 class SelectorList(List[_SelectorType]):
     """
@@ -225,7 +257,7 @@ class Selector:
 
     ``text`` is a `str`` object
 
-    ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
+    ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default).
     If ``type`` is ``None``, the selector defaults to ``"html"``.
 
     ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.