From 480d3855bab374ab59312deed79ad1185eef5782 Mon Sep 17 00:00:00 2001
From: aadityasinha-dotcom
Date: Fri, 18 Mar 2022 15:54:28 +0530
Subject: [PATCH 1/2] Add "html5" selector type backed by lxml's html5parser
---
parsel/selector.py | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/parsel/selector.py b/parsel/selector.py
index 45c88d09..fed6866e 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -6,6 +6,7 @@
from typing import Any, Dict, List, Optional, Mapping, Pattern, Union
from lxml import etree, html
+from lxml.html import html5parser
from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator
@@ -38,6 +39,10 @@ def __init__(self, *args, **kwargs) -> None:
"_csstranslator": GenericTranslator(),
"_tostring_method": "xml",
},
+ 'html5': {'_parser': html5parser.HTMLParser,
+ '_csstranslator': HTMLTranslator(),
+ '_tostring_method': 'html',
+ },
}
@@ -55,6 +60,13 @@ def create_root_node(text, parser_cls, base_url=None):
body = text.strip().replace("\x00", "").encode("utf8") or b""
parser = parser_cls(recover=True, encoding="utf8")
root = etree.fromstring(body, parser=parser, base_url=base_url)
+    # ``parser`` is an instance, so test it with isinstance; other parsers keep the root parsed just above.
+    # NOTE(review): parser_cls(recover=..., encoding=...) and etree.fromstring run before this check - confirm html5parser.HTMLParser accepts both.
+    if isinstance(parser, html5parser.HTMLParser):
+        try:
+            root = parser.parse(body, useChardet=False, override_encoding='utf8').getroot()
+        except ValueError as exc:
+            raise TypeError('HTML5parser does not support control characters') from exc
if root is None:
root = etree.fromstring(b"
", parser=parser, base_url=base_url)
return root
@@ -225,7 +237,7 @@ class Selector:
``text`` is a `str`` object
- ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
+ ``type`` defines the selector type, it can be ``"html"``, ``"xml"``, ``"html5"`` or ``None`` (default).
If ``type`` is ``None``, the selector defaults to ``"html"``.
``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
From 50069ac4ffa04b00706292e028604c80e6b4e390 Mon Sep 17 00:00:00 2001
From: aadityasinha-dotcom
Date: Fri, 18 Mar 2022 16:03:47 +0530
Subject: [PATCH 2/2] Add selector_for_html5 helper
---
parsel/selector.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/parsel/selector.py b/parsel/selector.py
index fed6866e..d9884698 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -7,6 +7,7 @@
from lxml import etree, html
from lxml.html import html5parser
+from lxml.html.html5parser import document_fromstring
from .utils import flatten, iflatten, extract_regex, shorten
from .csstranslator import HTMLTranslator, GenericTranslator
@@ -71,6 +72,10 @@ def create_root_node(text, parser_cls, base_url=None):
root = etree.fromstring(b"", parser=parser, base_url=base_url)
return root
+def selector_for_html5(response):
+    """Return a Selector over ``response.text`` parsed with lxml's html5parser."""
+    # Pass only the pre-parsed root: Selector's ``text`` argument must be a str, not a response.
+    return Selector(type='html', root=document_fromstring(response.text))
class SelectorList(List[_SelectorType]):
"""