From 480d3855bab374ab59312deed79ad1185eef5782 Mon Sep 17 00:00:00 2001 From: aadityasinha-dotcom Date: Fri, 18 Mar 2022 15:54:28 +0530 Subject: [PATCH 1/2] html5 --- parsel/selector.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/parsel/selector.py b/parsel/selector.py index 45c88d09..fed6866e 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional, Mapping, Pattern, Union from lxml import etree, html +from lxml.html import html5parser from .utils import flatten, iflatten, extract_regex, shorten from .csstranslator import HTMLTranslator, GenericTranslator @@ -38,6 +39,10 @@ def __init__(self, *args, **kwargs) -> None: "_csstranslator": GenericTranslator(), "_tostring_method": "xml", }, + 'html5': {'_parser': html5parser.HTMLParser, + '_csstranslator': HTMLTranslator(), + '_tostring_method': 'html', + }, } @@ -55,6 +60,13 @@ def create_root_node(text, parser_cls, base_url=None): body = text.strip().replace("\x00", "").encode("utf8") or b"" parser = parser_cls(recover=True, encoding="utf8") root = etree.fromstring(body, parser=parser, base_url=base_url) + if parser == html5parser.HTMLParser: + try: + root = parser.parse(body, useChardet=False, override_encoding='utf8').getroot() + except ValueError: + raise TypeError('HTML5parser does not support control characters') + else: + root = etree.fromstring(body, parser=parser, base_url=base_url) if root is None: root = etree.fromstring(b"", parser=parser, base_url=base_url) return root @@ -225,7 +237,7 @@ class Selector: ``text`` is a `str`` object - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). + ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default). If ``type`` is ``None``, the selector defaults to ``"html"``. ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths. From 50069ac4ffa04b00706292e028604c80e6b4e390 Mon Sep 17 00:00:00 2001 From: aadityasinha-dotcom Date: Fri, 18 Mar 2022 16:03:47 +0530 Subject: [PATCH 2/2] selector for HTML5 --- parsel/selector.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parsel/selector.py b/parsel/selector.py index fed6866e..d9884698 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -7,6 +7,7 @@ from lxml import etree, html from lxml.html import html5parser +from lxml.html.html5parser import document_fromstring from .utils import flatten, iflatten, extract_regex, shorten from .csstranslator import HTMLTranslator, GenericTranslator @@ -71,6 +72,10 @@ def create_root_node(text, parser_cls, base_url=None): root = etree.fromstring(b"", parser=parser, base_url=base_url) return root +def selector_for_html5(response): + root = document_fromstring(response.text) + selector = Selector(response, type='html', root=root) + return selector class SelectorList(List[_SelectorType]): """