diff --git a/docs/conf.py b/docs/conf.py index 71a46bb2..c08e0076 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,24 +21,24 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'notfound.extension', - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', + "notfound.extension", + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'Parsel' -copyright = '2015, Scrapy Project' +project = "Parsel" +copyright = "2015, Scrapy Project" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -51,28 +51,28 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" -suppress_warnings = ['epub.unknown_project_files'] +suppress_warnings = ["epub.unknown_project_files"] # -- Options for HTML output ------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) # here, relative to this directory. They are copied after the builtin # static files, so a file named "default.css" will overwrite the builtin # "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Output file base name for HTML help builder. -htmlhelp_basename = 'parseldoc' +htmlhelp_basename = "parseldoc" # -- Options for LaTeX output ------------------------------------------ @@ -83,9 +83,13 @@ # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'parsel.tex', - 'Parsel Documentation', - 'Scrapy Project', 'manual'), + ( + "index", + "parsel.tex", + "Parsel Documentation", + "Scrapy Project", + "manual", + ), ] @@ -94,9 +98,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'parsel', - 'Parsel Documentation', - ['Scrapy Project'], 1) + ("index", "parsel", "Parsel Documentation", ["Scrapy Project"], 1) ] @@ -106,21 +108,24 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'parsel', - 'Parsel Documentation', - 'Scrapy Project', - 'parsel', - 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "parsel", + "Parsel Documentation", + "Scrapy Project", + "parsel", + "One line description of project.", + "Miscellaneous", + ), ] # -- Options for the InterSphinx extension ------------------------------------ intersphinx_mapping = { - 'cssselect': ('https://cssselect.readthedocs.io/en/latest', None), - 'python': ('https://docs.python.org/3', None), - 'requests': ('https://requests.kennethreitz.org/en/latest', None), + "cssselect": ("https://cssselect.readthedocs.io/en/latest", None), + "python": ("https://docs.python.org/3", None), + "requests": ("https://requests.kennethreitz.org/en/latest", None), } @@ -128,8 +133,8 @@ # nitpicky = True # https://github.com/scrapy/cssselect/pull/110 nitpick_ignore = [ - ('py:class', 'cssselect.xpath.GenericTranslator'), - ('py:class', 'cssselect.xpath.HTMLTranslator'), - ('py:class', 'cssselect.xpath.XPathExpr'), - ('py:class', 'lxml.etree.XMLParser'), + ("py:class", "cssselect.xpath.GenericTranslator"), + ("py:class", "cssselect.xpath.HTMLTranslator"), + ("py:class", "cssselect.xpath.XPathExpr"), + ("py:class", "lxml.etree.XMLParser"), ] diff --git a/docs/conftest.py b/docs/conftest.py index 3f7f7513..fd932aca 100644 --- a/docs/conftest.py +++ b/docs/conftest.py @@ -2,10 +2,13 @@ from doctest import ELLIPSIS, NORMALIZE_WHITESPACE from sybil import Sybil + try: from sybil.parsers.codeblock import PythonCodeBlockParser except ImportError: - from sybil.parsers.codeblock import CodeBlockParser as PythonCodeBlockParser + from sybil.parsers.codeblock import ( + CodeBlockParser as PythonCodeBlockParser, + ) from sybil.parsers.doctest import DocTestParser from sybil.parsers.skip import skip @@ -13,21 +16,21 @@ def load_selector(filename, **kwargs): - input_path = os.path.join(os.path.dirname(__file__), '_static', filename) + input_path = os.path.join(os.path.dirname(__file__), "_static", filename) with open(input_path, encoding="utf-8") as input_file: return Selector(text=input_file.read(), **kwargs) def setup(namespace): - namespace['load_selector'] = load_selector + namespace["load_selector"] = load_selector pytest_collect_file = Sybil( parsers=[ DocTestParser(optionflags=ELLIPSIS | NORMALIZE_WHITESPACE), - PythonCodeBlockParser(future_imports=['print_function']), + PythonCodeBlockParser(future_imports=["print_function"]), skip, ], - pattern='*.rst', + pattern="*.rst", setup=setup, ).pytest() diff --git a/parsel/csstranslator.py b/parsel/csstranslator.py index c822d182..7aa1a25d 100644 --- a/parsel/csstranslator.py +++ b/parsel/csstranslator.py @@ -14,7 +14,9 @@ class XPathExpr(OriginalXPathExpr): @classmethod def from_xpath(cls, xpath, textnode=False, attribute=None): - x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition) + x = cls( + path=xpath.path, element=xpath.element, condition=xpath.condition + ) x.textnode = textnode x.attribute = attribute return x @@ -81,7 +83,9 @@ def xpath_attr_functional_pseudo_element(self, xpath, function): raise ExpressionError( f"Expected a single string or ident for ::attr(), got {function.arguments!r}" ) - return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value) + return XPathExpr.from_xpath( + xpath, attribute=function.arguments[0].value + ) def xpath_text_simple_pseudo_element(self, xpath): """Support selecting text nodes using ::text pseudo-element""" diff --git a/parsel/selector.py b/parsel/selector.py index 45c88d09..5709ba3d 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -6,6 +6,8 @@ from typing import Any, Dict, List, Optional, Mapping, Pattern, Union from lxml import etree, html +import json +from jsonpath_ng import parse as jsonpathParser from .utils import flatten, iflatten, extract_regex, shorten from .csstranslator import HTMLTranslator, GenericTranslator @@ -41,15 +43,6 @@ def __init__(self, *args, **kwargs) -> None: } -def _st(st: Optional[str]) -> str: - if st is None: - return "html" - elif st in _ctgroup: - return st - else: - raise ValueError(f"Invalid type: {st}") - - def create_root_node(text, parser_cls, base_url=None): """Create root node for text using given parser class.""" body = text.strip().replace("\x00", "").encode("utf8") or b"" @@ -93,7 +86,7 @@ def xpath( Call the ``.xpath()`` method for each element in this list and return their results flattened as another :class:`SelectorList`. - ``query`` is the same argument as the one in :meth:`Selector.xpath` + ``xpath`` is the same argument as the one in :meth:`Selector.xpath` ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict) for additional prefixes to those registered with ``register_namespace(prefix, uri)``. @@ -106,7 +99,9 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ return self.__class__( - flatten([x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self]) + flatten( + [x.xpath(xpath, namespaces=namespaces, **kwargs) for x in self] + ) ) def css(self, query: str) -> "SelectorList[_SelectorType]": @@ -130,7 +125,18 @@ def re( Passing ``replace_entities`` as ``False`` switches off these replacements. """ - return flatten([x.re(regex, replace_entities=replace_entities) for x in self]) + return flatten( + [x.re(regex, replace_entities=replace_entities) for x in self] + ) + + def jsonpath(self, query: str) -> "SelectorList[_SelectorType]": + """ + Call the ``.jsonpath`` method for each element in the list and return + their results flattened as another :class:`SelectorList`. + + ``query`` is the same argument as the one in :meth:`Selector.jsonpath` + """ + return self.__class__(flatten([x.jsonpath(query) for x in self])) @typing.overload def re_first( @@ -218,6 +224,16 @@ def remove(self) -> None: x.remove() +_NOTSET = object() + + +def _load_json_or_none(text): + try: + return json.loads(text) + except ValueError: + return None + + class Selector: """ :class:`Selector` allows you to select parts of an XML or HTML text using CSS @@ -225,7 +241,7 @@ class Selector: ``text`` is a `str`` object - ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default). + ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` , ``json`` or ``None`` (default). If ``type`` is ``None``, the selector defaults to ``"html"``. ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths. @@ -233,18 +249,14 @@ class Selector: """ __slots__ = [ - "text", "namespaces", "type", "_expr", "root", + "_text", "__weakref__", - "_parser", - "_csstranslator", - "_tostring_method", ] - _default_type: Optional[str] = None _default_namespaces = { "re": "http://exslt.org/regular-expressions", # supported in libxslt: @@ -263,34 +275,59 @@ def __init__( text: Optional[str] = None, type: Optional[str] = None, namespaces: Optional[Mapping[str, str]] = None, - root: Optional[Any] = None, + root: Optional[Any] = _NOTSET, base_url: Optional[str] = None, _expr: Optional[str] = None, ) -> None: - self.type = st = _st(type or self._default_type) - self._parser = _ctgroup[st]["_parser"] - self._csstranslator = _ctgroup[st]["_csstranslator"] - self._tostring_method = _ctgroup[st]["_tostring_method"] + if type not in ("html", "json", "text", "xml", None): + raise ValueError(f"Invalid type: {type}") + + self._text = text + + if text is None and root is _NOTSET: + raise ValueError("Selector needs either text or root argument") if text is not None: if not isinstance(text, str): msg = f"text argument should be of type str, got {text.__class__}" raise TypeError(msg) - root = self._get_root(text, base_url) - elif root is None: - raise ValueError("Selector needs either text or root argument") + + if text is not None: + if type in ("html", "xml", None): + self._load_lxml_root( + text, type=type or "html", base_url=base_url + ) + elif type == "json": + self.root = _load_json_or_none(text) + self.type = type + else: + self.root = text + self.type = type + else: + self.root = root + if type is None and isinstance(self.root, etree._Element): + type = "html" + self.type = type or "json" + + self._expr = _expr self.namespaces = dict(self._default_namespaces) if namespaces is not None: self.namespaces.update(namespaces) - self.root = root - self._expr = _expr + + def _load_lxml_root(self, text, type, base_url=None): + self.type = type + self.root = self._get_root(text, base_url) def __getstate__(self) -> Any: raise TypeError("can't pickle Selector objects") def _get_root(self, text: str, base_url: Optional[str] = None) -> Any: - return create_root_node(text, self._parser, base_url=base_url) + return create_root_node( + text, + _ctgroup[self.type]["_parser"], + base_url=base_url, + ) def xpath( self: _SelectorType, @@ -315,6 +352,12 @@ def xpath( selector.xpath('//a[href=$url]', url="http://www.example.com") """ + if self.type == "text": + self._load_lxml_root(self.root, type="html") + elif self.type not in ("html", "xml"): + raise ValueError( + f"Cannot use xpath on a Selector of type {repr(self.type)}" + ) try: xpathev = self.root.xpath except AttributeError: @@ -325,7 +368,10 @@ def xpath( nsp.update(namespaces) try: result = xpathev( - query, namespaces=nsp, smart_strings=self._lxml_smart_strings, **kwargs + query, + namespaces=nsp, + smart_strings=self._lxml_smart_strings, + **kwargs, ) except etree.XPathError as exc: raise ValueError(f"XPath error: {exc} in {query}") @@ -352,10 +398,41 @@ def css(self: _SelectorType, query: str) -> SelectorList[_SelectorType]: .. _cssselect: https://pypi.python.org/pypi/cssselect/ """ + if self.type == "text": + self._load_lxml_root(self.root, type="html") + elif self.type not in ("html", "xml"): + raise ValueError( + f"Cannot use css on a Selector of type {repr(self.type)}" + ) return self.xpath(self._css2xpath(query)) + def jsonpath( + self: _SelectorType, query: str, type=None + ) -> SelectorList[_SelectorType]: + """ + Apply the given JSONPath query and return a :class:`SelectorList` instance. + + ``query`` is a string containing the JSONPath query to apply. + """ + + if self.type == "json": + data = self.root + elif isinstance(self.root, str): + data = _load_json_or_none(self.root) + + jsonpath_expr = jsonpathParser(query) + result = [ + json.dumps(match.value) for match in jsonpath_expr.find(data) + ] + + def make_selector(x): + return self.__class__(text=x, _expr=query, type=type or "text") + + result = [make_selector(x) for x in result] + return self.selectorlist_cls(result) + def _css2xpath(self, query: str) -> Any: - return self._csstranslator.css_to_xpath(query) + return _ctgroup[self.type]["_csstranslator"].css_to_xpath(query) def re( self, regex: Union[str, Pattern[str]], replace_entities: bool = True @@ -372,7 +449,9 @@ def re( Passing ``replace_entities`` as ``False`` switches off these replacements. """ - return extract_regex(regex, self.get(), replace_entities=replace_entities) + return extract_regex( + regex, self.get(), replace_entities=replace_entities + ) @typing.overload def re_first( @@ -409,7 +488,8 @@ def re_first( replacements. """ return next( - iflatten(self.re(regex, replace_entities=replace_entities)), default + iflatten(self.re(regex, replace_entities=replace_entities)), + default, ) def get(self) -> str: @@ -417,10 +497,13 @@ def get(self) -> str: Serialize and return the matched nodes in a single string. Percent encoded content is unquoted. """ + if self.type in ("json", "text"): + return self.root + try: return etree.tostring( self.root, - method=self._tostring_method, + method=_ctgroup[self.type]["_tostring_method"], encoding="unicode", with_tail=False, ) @@ -504,6 +587,9 @@ def __bool__(self) -> bool: def __str__(self) -> str: data = repr(shorten(self.get(), width=40)) - return f"<{type(self).__name__} xpath={self._expr!r} data={data}>" + expr_field = "jsonpath" if self.type == "json" else "xpath" + return ( + f"<{type(self).__name__} {expr_field}={self._expr!r} data={data}>" + ) __repr__ = __str__ diff --git a/parsel/xpathfuncs.py b/parsel/xpathfuncs.py index 99b9c796..9e5c0a96 100644 --- a/parsel/xpathfuncs.py +++ b/parsel/xpathfuncs.py @@ -40,10 +40,14 @@ def has_class(context, *classes): """ if not context.eval_context.get("args_checked"): if not classes: - raise ValueError("XPath error: has-class must have at least 1 argument") + raise ValueError( + "XPath error: has-class must have at least 1 argument" + ) for c in classes: if not isinstance(c, str): - raise ValueError("XPath error: has-class arguments must be strings") + raise ValueError( + "XPath error: has-class arguments must be strings" + ) context.eval_context["args_checked"] = True node_cls = context.context_node.get("class") diff --git a/pylintrc b/pylintrc index 6135cb46..e49c393c 100644 --- a/pylintrc +++ b/pylintrc @@ -24,8 +24,10 @@ disable=bad-continuation, too-many-arguments, too-many-lines, too-many-public-methods, + too-many-branches, unidiomatic-typecheck, unused-argument, use-a-generator, wrong-import-order, - wrong-import-position + wrong-import-position, + unused-variable diff --git a/setup.py b/setup.py index 824b9e16..d2df1665 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,7 @@ "cssselect>=0.9", "lxml", "w3lib>=1.19.0", + "jsonpath_ng>=1.5.3", ], python_requires=">=3.6", license="BSD", diff --git a/tests/test_selector.py b/tests/test_selector.py index daf0f326..4afb247d 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -3,7 +3,6 @@ import unittest import pickle from typing import Any - from parsel import Selector, SelectorList from parsel.selector import ( CannotRemoveElementWithoutRoot, @@ -23,7 +22,9 @@ def assertIsSelectorList(self, value: Any) -> None: def test_pickle_selector(self) -> None: sel = self.sscls(text="

some text

") - self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel) + self.assertRaises( + TypeError, lambda s: pickle.dumps(s, protocol=2), sel + ) def test_pickle_selector_list(self) -> None: sel = self.sscls( @@ -33,7 +34,9 @@ def test_pickle_selector_list(self) -> None: empty_sel_list = sel.css("p") self.assertIsSelectorList(sel_list) self.assertIsSelectorList(empty_sel_list) - self.assertRaises(TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list) + self.assertRaises( + TypeError, lambda s: pickle.dumps(s, protocol=2), sel_list + ) self.assertRaises( TypeError, lambda s: pickle.dumps(s, protocol=2), empty_sel_list ) @@ -49,7 +52,8 @@ def test_simple_selection(self) -> None: self.assertIsSelector(x) self.assertEqual( - sel.xpath("//input").extract(), [x.extract() for x in sel.xpath("//input")] + sel.xpath("//input").extract(), + [x.extract() for x in sel.xpath("//input")], ) self.assertEqual( @@ -84,7 +88,10 @@ def test_simple_selection_with_variables(self) -> None: sel = self.sscls(text=body) self.assertEqual( - [x.extract() for x in sel.xpath("//input[@value=$number]/@name", number=1)], + [ + x.extract() + for x in sel.xpath("//input[@value=$number]/@name", number=1) + ], ["a"], ) self.assertEqual( @@ -97,18 +104,24 @@ def test_simple_selection_with_variables(self) -> None: self.assertEqual( sel.xpath( - "count(//input[@value=$number or @name=$letter])", number=2, letter="a" + "count(//input[@value=$number or @name=$letter])", + number=2, + letter="a", ).extract(), ["2.0"], ) # you can also pass booleans self.assertEqual( - sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=2, test=True).extract(), + sel.xpath( + "boolean(count(//input)=$cnt)=$test", cnt=2, test=True + ).extract(), ["1"], ) self.assertEqual( - sel.xpath("boolean(count(//input)=$cnt)=$test", cnt=4, test=True).extract(), + sel.xpath( + "boolean(count(//input)=$cnt)=$test", cnt=4, test=True + ).extract(), ["0"], ) self.assertEqual( @@ -138,11 +151,16 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: t = 'I say "Yeah!"' # naive string formatting with give something like: # ValueError: XPath error: Invalid predicate in //input[@value="I say "Yeah!""]/@name - self.assertRaises(ValueError, sel.xpath, f'//input[@value="{t}"]/@name') + self.assertRaises( + ValueError, sel.xpath, f'//input[@value="{t}"]/@name' + ) # with XPath variables, escaping is done for you self.assertEqual( - [x.extract() for x in sel.xpath("//input[@value=$text]/@name", text=t)], + [ + x.extract() + for x in sel.xpath("//input[@value=$text]/@name", text=t) + ], ["a"], ) lt = """I'm mixing single and "double quotes" and I don't care :)""" @@ -155,7 +173,9 @@ def test_simple_selection_with_variables_escape_friendly(self) -> None: self.assertEqual( [ x.extract() - for x in sel.xpath("//p[normalize-space()=$lng]//@name", lng=lt) + for x in sel.xpath( + "//p[normalize-space()=$lng]//@name", lng=lt + ) ], ["a"], ) @@ -179,7 +199,9 @@ def test_accessing_attributes(self) -> None: ) # for a SelectorList, bring the attributes of first-element only - self.assertEqual({"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib) + self.assertEqual( + {"id": "some-list", "class": "list-cls"}, sel.css("ul").attrib + ) self.assertEqual( {"class": "item-cls", "id": "list-item-1"}, sel.css("li").attrib ) @@ -199,7 +221,9 @@ def test_representation_slice(self) -> None: body = f"

" sel = self.sscls(text=body) - representation = f"" + representation = ( + f"" + ) self.assertEqual( [repr(it) for it in sel.xpath("//input/@name")], [representation] @@ -208,7 +232,9 @@ def test_representation_slice(self) -> None: def test_representation_unicode_query(self) -> None: body = f"

" - representation = "" + representation = ( + "" + ) sel = self.sscls(text=body) self.assertEqual( @@ -245,7 +271,8 @@ def test_extract_first(self) -> None: ) self.assertEqual( - sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), None + sel.xpath('/ul/li[@id="doesnt-exist"]/text()').extract_first(), + None, ) def test_extract_first_default(self) -> None: @@ -254,7 +281,8 @@ def test_extract_first_default(self) -> None: sel = self.sscls(text=body) self.assertEqual( - sel.xpath("//div/text()").extract_first(default="missing"), "missing" + sel.xpath("//div/text()").extract_first(default="missing"), + "missing", ) def test_selector_get_alias(self) -> None: @@ -265,7 +293,9 @@ def test_selector_get_alias(self) -> None: self.assertEqual( sel.xpath("//ul/li[position()>1]")[0].get(), '
  • 2
  • ' ) - self.assertEqual(sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2") + self.assertEqual( + sel.xpath("//ul/li[position()>1]/text()")[0].get(), "2" + ) def test_selector_getall_alias(self) -> None: """Test if get() returns extracted value on a Selector""" @@ -273,7 +303,8 @@ def test_selector_getall_alias(self) -> None: sel = self.sscls(text=body) self.assertListEqual( - sel.xpath("//ul/li[position()>1]")[0].getall(), ['
  • 2
  • '] + sel.xpath("//ul/li[position()>1]")[0].getall(), + ['
  • 2
  • '], ) self.assertListEqual( sel.xpath("//ul/li[position()>1]/text()")[0].getall(), ["2"] @@ -309,7 +340,8 @@ def test_re_first(self) -> None: self.assertEqual(sel.xpath("/ul/li/text()").re_first(r"\w+"), None) self.assertEqual( - sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r"\d"), None + sel.xpath('/ul/li[@id="doesnt-exist"]/text()').re_first(r"\d"), + None, ) self.assertEqual(sel.re_first(r'id="(\d+)'), "1") @@ -322,32 +354,42 @@ def test_extract_first_re_default(self) -> None: sel = self.sscls(text=body) self.assertEqual( - sel.xpath("//div/text()").re_first(r"\w+", default="missing"), "missing" + sel.xpath("//div/text()").re_first(r"\w+", default="missing"), + "missing", ) self.assertEqual( - sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), "missing" + sel.xpath("/ul/li/text()").re_first(r"\w+", default="missing"), + "missing", ) def test_select_unicode_query(self) -> None: body = "

    " sel = self.sscls(text=body) - self.assertEqual(sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"]) + self.assertEqual( + sel.xpath('//input[@name="\xa9"]/@value').extract(), ["1"] + ) def test_list_elements_type(self) -> None: """Test Selector returning the same type in selection methods""" text = "

    test

    " self.assertEqual( - type(self.sscls(text=text).xpath("//p")[0]), type(self.sscls(text=text)) + type(self.sscls(text=text).xpath("//p")[0]), + type(self.sscls(text=text)), ) self.assertEqual( - type(self.sscls(text=text).css("p")[0]), type(self.sscls(text=text)) + type(self.sscls(text=text).css("p")[0]), + type(self.sscls(text=text)), ) def test_boolean_result(self) -> None: body = "

    " xs = self.sscls(text=body) - self.assertEqual(xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"]) - self.assertEqual(xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"]) + self.assertEqual( + xs.xpath("//input[@name='a']/@name='a'").extract(), ["1"] + ) + self.assertEqual( + xs.xpath("//input[@name='a']/@name='n'").extract(), ["0"] + ) def test_differences_parsing_xml_vs_html(self) -> None: """Test that XML and HTML Selector's behave differently""" @@ -355,7 +397,8 @@ def test_differences_parsing_xml_vs_html(self) -> None: text = '

    Hello

    ' hs = self.sscls(text=text, type="html") self.assertEqual( - hs.xpath("//div").extract(), ['

    Hello

    '] + hs.xpath("//div").extract(), + ['

    Hello

    '], ) xs = self.sscls(text=text, type="xml") @@ -476,7 +519,9 @@ def test_mixed_nested_selectors(self) -> None: self.assertEqual( sel.xpath('//div[@id="1"]').css("span::text").extract(), ["me"] ) - self.assertEqual(sel.css("#1").xpath("./span/text()").extract(), ["me"]) + self.assertEqual( + sel.css("#1").xpath("./span/text()").extract(), ["me"] + ) def test_dont_strip(self) -> None: sel = self.sscls(text='
    fff: zzz
    ') @@ -543,22 +588,28 @@ def test_namespaces_multiple(self) -> None: """ x = self.sscls(text=body, type="xml") x.register_namespace( - "xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05" + "xmlns", + "http://webservices.amazon.com/AWSECommerceService/2005-10-05", ) x.register_namespace("p", "http://www.scrapy.org/product") x.register_namespace("b", "http://somens.com") self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) self.assertEqual(x.xpath("//b:Operation/text()").extract()[0], "hello") - self.assertEqual(x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value") + self.assertEqual( + x.xpath("//xmlns:TestTag/@b:att").extract()[0], "value" + ) self.assertEqual( x.xpath("//p:SecondTestTag/xmlns:price/text()").extract()[0], "90" ) self.assertEqual( - x.xpath("//p:SecondTestTag").xpath("./xmlns:price/text()")[0].extract(), + x.xpath("//p:SecondTestTag") + .xpath("./xmlns:price/text()")[0] + .extract(), "90", ) self.assertEqual( - x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], "iron" + x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], + "iron", ) def test_namespaces_multiple_adhoc(self) -> None: @@ -573,7 +624,8 @@ def test_namespaces_multiple_adhoc(self) -> None: """ x = self.sscls(text=body, type="xml") x.register_namespace( - "xmlns", "http://webservices.amazon.com/AWSECommerceService/2005-10-05" + "xmlns", + "http://webservices.amazon.com/AWSECommerceService/2005-10-05", ) self.assertEqual(len(x.xpath("//xmlns:TestTag")), 1) @@ -602,7 +654,8 @@ def test_namespaces_multiple_adhoc(self) -> None: # chained selectors still have knowledge of register_namespace() operations self.assertEqual( x.xpath( - "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"} + "//p:SecondTestTag", + namespaces={"p": "http://www.scrapy.org/product"}, ) .xpath("./xmlns:price/text()")[0] .extract(), @@ -613,7 +666,8 @@ def test_namespaces_multiple_adhoc(self) -> None: self.assertRaises( ValueError, x.xpath( - "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"} + "//p:SecondTestTag", + namespaces={"p": "http://www.scrapy.org/product"}, ).xpath, "p:name/text()", ) @@ -621,9 +675,13 @@ def test_namespaces_multiple_adhoc(self) -> None: # ad-hoc declarations need repeats when chaining self.assertEqual( x.xpath( - "//p:SecondTestTag", namespaces={"p": "http://www.scrapy.org/product"} + "//p:SecondTestTag", + namespaces={"p": "http://www.scrapy.org/product"}, + ) + .xpath( + "p:name/text()", + namespaces={"p": "http://www.scrapy.org/product"}, ) - .xpath("p:name/text()", namespaces={"p": "http://www.scrapy.org/product"}) .extract_first(), "Dried Rose", ) @@ -642,11 +700,14 @@ def test_namespaces_multiple_adhoc(self) -> None: ) # "p" prefix is not cached from previous calls - self.assertRaises(ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()") + self.assertRaises( + ValueError, x.xpath, "//p:SecondTestTag/xmlns:price/text()" + ) x.register_namespace("p", "http://www.scrapy.org/product") self.assertEqual( - x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], "iron" + x.xpath("//p:SecondTestTag/xmlns:material/text()").extract()[0], + "iron", ) def test_make_links_absolute(self) -> None: @@ -654,7 +715,8 @@ def test_make_links_absolute(self) -> None: sel = Selector(text=text, base_url="http://example.com") sel.root.make_links_absolute() self.assertEqual( - "http://example.com/file.html", sel.xpath("//a/@href").extract_first() + "http://example.com/file.html", + sel.xpath("//a/@href").extract_first(), ) def test_re(self) -> None: @@ -703,28 +765,37 @@ def test_re_replace_entities(self) -> None: # switching off replace_entities will preserve " also expected = '"bar & "baz""' self.assertEqual( - x.xpath("//script/text()").re(name_re, replace_entities=False), [expected] + x.xpath("//script/text()").re(name_re, replace_entities=False), + [expected], ) self.assertEqual( - x.xpath("//script")[0].re(name_re, replace_entities=False), [expected] + x.xpath("//script")[0].re(name_re, replace_entities=False), + [expected], ) self.assertEqual( - x.xpath("//script/text()").re_first(name_re, replace_entities=False), + x.xpath("//script/text()").re_first( + name_re, replace_entities=False + ), expected, ) self.assertEqual( - x.xpath("//script")[0].re_first(name_re, replace_entities=False), expected + x.xpath("//script")[0].re_first(name_re, replace_entities=False), + expected, ) def test_re_intl(self) -> None: body = "
    Evento: cumplea\xf1os
    " x = self.sscls(text=body) - self.assertEqual(x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"]) + self.assertEqual( + x.xpath("//div").re(r"Evento: (\w+)"), ["cumplea\xf1os"] + ) def test_selector_over_text(self) -> None: hs = self.sscls(text="lala") - self.assertEqual(hs.extract(), "lala") + self.assertEqual( + hs.extract(), "lala" + ) xs = self.sscls(text="lala", type="xml") self.assertEqual(xs.extract(), "lala") self.assertEqual(xs.xpath(".").extract(), ["lala"]) @@ -750,13 +821,17 @@ def test_http_header_encoding_precedence(self) -> None: \xa3""" x = self.sscls(text=text) - self.assertEqual(x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"]) + self.assertEqual( + x.xpath("//span[@id='blank']/text()").extract(), ["\xa3"] + ) def test_empty_bodies_shouldnt_raise_errors(self) -> None: self.sscls(text="").xpath("//text()").extract() def test_bodies_with_comments_only(self) -> None: - sel = self.sscls(text="", base_url="http://example.com") + sel = self.sscls( + text="", base_url="http://example.com" + ) self.assertEqual("http://example.com", sel.root.base) def test_null_bytes_shouldnt_raise_errors(self) -> None: @@ -782,7 +857,9 @@ def test_select_on_unevaluable_nodes(self) -> None: self.assertEqual(x1.xpath(".//text()").extract(), []) def test_select_on_text_nodes(self) -> None: - r = self.sscls(text="
    Options:opt1
    Otheropt2
    ") + r = self.sscls( + text="
    Options:opt1
    Otheropt2
    " + ) x1 = r.xpath( "//div/descendant::text()[preceding-sibling::b[contains(text(), 'Options')]]" ) @@ -796,7 +873,9 @@ def test_select_on_text_nodes(self) -> None: @unittest.skip("Text nodes lost parent node reference in lxml") def test_nested_select_on_text_nodes(self) -> None: # FIXME: does not work with lxml backend [upstream] - r = self.sscls(text="
    Options:opt1
    Otheropt2
    ") + r = self.sscls( + text="
    Options:opt1
    Otheropt2
    " + ) x1 = r.xpath("//div/descendant::text()") x2 = x1.xpath("./preceding-sibling::b[contains(text(), 'Options')]") self.assertEqual(x2.extract(), ["Options:"]) @@ -847,11 +926,19 @@ def test_remove_namespaces_embedded(self) -> None: self.assertEqual(len(sel.xpath("//stop")), 0) self.assertEqual(len(sel.xpath("./namespace::*")), 2) self.assertEqual( - len(sel.xpath("//f:link", namespaces={"f": "http://www.w3.org/2005/Atom"})), + len( + sel.xpath( + "//f:link", namespaces={"f": "http://www.w3.org/2005/Atom"} + ) + ), 2, ) self.assertEqual( - len(sel.xpath("//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"})), + len( + sel.xpath( + "//s:stop", namespaces={"s": "http://www.w3.org/2000/svg"} + ) + ), 2, ) sel.remove_namespaces() @@ -899,13 +986,19 @@ class SmartStringsSelector(Selector): li_text = x.xpath("//li/text()") self.assertFalse(any([hasattr(e.root, "getparent") for e in li_text])) div_class = x.xpath("//div/@class") - self.assertFalse(any([hasattr(e.root, "getparent") for e in div_class])) + self.assertFalse( + any([hasattr(e.root, "getparent") for e in div_class]) + ) smart_x = SmartStringsSelector(text=body) smart_li_text = smart_x.xpath("//li/text()") - self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_li_text])) + self.assertTrue( + all([hasattr(e.root, "getparent") for e in smart_li_text]) + ) smart_div_class = smart_x.xpath("//div/@class") - self.assertTrue(all([hasattr(e.root, "getparent") for e in smart_div_class])) + self.assertTrue( + all([hasattr(e.root, "getparent") for e in smart_div_class]) + ) def test_xml_entity_expansion(self) -> None: malicious_xml = ( @@ -943,7 +1036,8 @@ def extra_method(self) -> str: def test_replacement_null_char_from_body(self) -> None: text = "\x00

    Grainy

    " self.assertEqual( - "

    Grainy

    ", self.sscls(text).extract() + "

    Grainy

    ", + self.sscls(text).extract(), ) def test_remove_selector_list(self) -> None: @@ -1006,6 +1100,110 @@ def test_remove_root_element_selector(self) -> None: sel.css("body").remove() self.assertEqual(sel.get(), "") + def test_selector_init_with_etree_element(self): + from lxml import etree + + element = etree.Element("root") + sel = self.sscls(root=element) + self.assertEqual(sel.type, "html") + + def test_invalid_json(self) -> None: + invalid_json = "root" + sel = self.sscls(text=invalid_json, type="json") + self.assertEqual(sel.root, None) + + def test_invalid_selector_calls(self) -> None: + json = '{"attrib":value}' + sel = self.sscls(text=json, type="json") + + with self.assertRaises(ValueError): + sel.xpath("query") + with self.assertRaises(ValueError): + sel.css("query") + + def test_xpath_selector_on_type_text(self) -> None: + html = "" + sel = self.sscls(text=html, type="text") + + sel_list = sel.xpath("//html") + self.assertEqual(sel_list[0].type, "html") + + def test_css_selector_on_type_text(self) -> None: + html = "" + sel = self.sscls(text=html, type="text") + + sel_list = sel.css("html") + self.assertEqual(sel_list[0].type, "html") + + def test_jsonpath_selectors(self) -> None: + + json_data = """{ + "store": { + "book": [ + { + "category": "reference", + "author": "Nigel Rees", + "title": "Sayings of the Century", + "price": 8.95 + }, + { + "category": "fiction", + "author": "Herman Melville", + "title": "Moby Dick", + "isbn": "0-553-21311-3", + "price": 8.99 + }, + { + "category": "fiction", + "author": "J.R.R. Tolkien", + "title": "The Lord of the Rings", + "isbn": "0-395-19395-8", + "price": 22.99 + } + ], + "bicycle": { + "color": "red", + "price": 19.95 + } + }, + "expensive": 10 + }""" + + sel = self.sscls(text=json_data, type="json") + sel_list = sel.jsonpath("$..author") + + self.assertIsSelector(sel) + self.assertIsSelectorList(sel_list) + self.assertEqual( + sel_list.getall(), + ['"Nigel Rees"', '"Herman Melville"', '"J.R.R. Tolkien"'], + ) + + sel_list = sel.jsonpath("$..bicycle") + + self.assertIsSelectorList(sel_list) + self.assertEqual(sel_list.get(), '{"color": "red", "price": 19.95}') + + inner_lst = sel_list[0].jsonpath("$..color") + + self.assertIsSelectorList(inner_lst) + self.assertEqual(inner_lst.getall(), ['"red"']) + + sel_list = sel.jsonpath("$..book[*].title") + + self.assertIsSelectorList(sel_list) + self.assertEqual( + sel_list.getall(), + [ + '"Sayings of the Century"', + '"Moby Dick"', + '"The Lord of the Rings"', + ], + ) + + sel_list_empty = sel.jsonpath("$..contact") + self.assertEqual(sel_list_empty, []) + class ExsltTestCase(unittest.TestCase): @@ -1026,18 +1224,30 @@ def test_regexp(self) -> None: # re:test() self.assertEqual( sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]').extract(), - [x.extract() for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]')], + [ + x.extract() + for x in sel.xpath('//input[re:test(@name, "[A-Z]+", "i")]') + ], ) self.assertEqual( - [x.extract() for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()')], + [ + x.extract() + for x in sel.xpath(r'//a[re:test(@href, "\.html$")]/text()') + ], ["first link", "second link"], ) self.assertEqual( - [x.extract() for x in sel.xpath('//a[re:test(@href, "first")]/text()')], + [ + x.extract() + for x in sel.xpath('//a[re:test(@href, "first")]/text()') + ], ["first link"], ) self.assertEqual( - [x.extract() for x in sel.xpath('//a[re:test(@href, "second")]/text()')], + [ + x.extract() + for x in sel.xpath('//a[re:test(@href, "second")]/text()') + ], ["second link"], ) @@ -1067,7 +1277,9 @@ def test_regexp(self) -> None: r're:replace(//a[re:test(@href, "\.xml$")]/@href,' r'"(\w+)://(.+)(\.xml)", "","https://\2.html")' ).extract(), - ["https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html"], + [ + "https://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.html" + ], ) def test_set(self) -> None: diff --git a/tests/test_selector_csstranslator.py b/tests/test_selector_csstranslator.py index 1df3cfe4..c720006f 100644 --- a/tests/test_selector_csstranslator.py +++ b/tests/test_selector_csstranslator.py @@ -53,7 +53,10 @@ def test_attr_function(self): cases = [ ("::attr(name)", "descendant-or-self::*/@name"), ("a::attr(href)", "descendant-or-self::a/@href"), - ("a ::attr(img)", "descendant-or-self::a/descendant-or-self::*/@img"), + ( + "a ::attr(img)", + "descendant-or-self::a/descendant-or-self::*/@img", + ), ("a > ::attr(class)", "descendant-or-self::a/*/@class"), ] for css, xpath in cases: @@ -149,7 +152,9 @@ def setUp(self): self.sel = self.sscls(text=HTMLBODY) def x(self, *a, **kw): - return [v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip()] + return [ + v.strip() for v in self.sel.css(*a, **kw).extract() if v.strip() + ] def test_selector_simple(self): for x in self.sel.css("input"): @@ -165,10 +170,13 @@ def test_text_pseudo_element(self): self.assertEqual(self.x("#p-b2 ::text"), ["guy"]) self.assertEqual(self.x("#paragraph::text"), ["lorem ipsum text"]) self.assertEqual( - self.x("#paragraph ::text"), ["lorem ipsum text", "hi", "there", "guy"] + self.x("#paragraph ::text"), + ["lorem ipsum text", "hi", "there", "guy"], ) self.assertEqual(self.x("p::text"), ["lorem ipsum text"]) - self.assertEqual(self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"]) + self.assertEqual( + self.x("p ::text"), ["lorem ipsum text", "hi", "there", "guy"] + ) def test_attribute_function(self): self.assertEqual(self.x("#p-b2::attr(id)"), ["p-b2"]) @@ -181,7 +189,9 @@ def test_attribute_function(self): ) def test_nested_selector(self): - self.assertEqual(self.sel.css("p").css("b::text").extract(), ["hi", "guy"]) + self.assertEqual( + self.sel.css("p").css("b::text").extract(), ["hi", "guy"] + ) self.assertEqual( self.sel.css("div").css("area:last-child").extract(), [''], diff --git a/tests/test_utils.py b/tests/test_utils.py index 556892c1..e2bca559 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -46,7 +46,12 @@ def test_shorten(width, expected): True, ["October"], ], - [r"\w+\s*\d+\s*\,?\s*\d+", "October 25 2019", True, ["October 25 2019"]], + [ + r"\w+\s*\d+\s*\,?\s*\d+", + "October 25 2019", + True, + ["October 25 2019"], + ], [ r"^.*$", ""sometext" & "moretext"", diff --git a/tests/test_xml_attacks.py b/tests/test_xml_attacks.py index e9380745..45b0243a 100644 --- a/tests/test_xml_attacks.py +++ b/tests/test_xml_attacks.py @@ -8,7 +8,7 @@ from parsel import Selector -MiB_1 = 1024 ** 2 +MiB_1 = 1024**2 def _load(attack): diff --git a/tests/test_xpathfuncs.py b/tests/test_xpathfuncs.py index af710946..744472a9 100644 --- a/tests/test_xpathfuncs.py +++ b/tests/test_xpathfuncs.py @@ -17,13 +17,21 @@ def test_has_class_simple(self): ["First", "Second"], ) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')], ["Third"] + [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')], + ["Third"], ) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')], [] + [ + x.extract() + for x in sel.xpath('//p[has-class("foo","bar")]/text()') + ], + [], ) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')], + [ + x.extract() + for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()') + ], ["First"], ) @@ -45,7 +53,10 @@ def test_has_class_error_invalid_arg_type(self): """ sel = Selector(text=body) self.assertRaisesRegex( - ValueError, "has-class arguments must be strings", sel.xpath, "has-class(.)" + ValueError, + "has-class arguments must be strings", + sel.xpath, + "has-class(.)", ) def test_has_class_error_invalid_unicode(self): @@ -66,7 +77,8 @@ def test_has_class_unicode(self): """ sel = Selector(text=body) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("fóó")]/text()')], ["First"] + [x.extract() for x in sel.xpath('//p[has-class("fóó")]/text()')], + ["First"], ) def test_has_class_uppercase(self): @@ -75,7 +87,8 @@ def test_has_class_uppercase(self): """ sel = Selector(text=body) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], ["First"] + [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], + ["First"], ) def test_has_class_newline(self): @@ -85,7 +98,8 @@ def test_has_class_newline(self): """ sel = Selector(text=body) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], ["First"] + [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], + ["First"], ) def test_has_class_tab(self): @@ -94,7 +108,8 @@ def test_has_class_tab(self): """ sel = Selector(text=body) self.assertEqual( - [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], ["First"] + [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')], + ["First"], ) def test_set_xpathfunc(self): @@ -108,7 +123,10 @@ def myfunc(ctx): """ sel = Selector(text=body) self.assertRaisesRegex( - ValueError, "Unregistered function in myfunc", sel.xpath, "myfunc()" + ValueError, + "Unregistered function in myfunc", + sel.xpath, + "myfunc()", ) set_xpathfunc("myfunc", myfunc) @@ -117,5 +135,8 @@ def myfunc(ctx): set_xpathfunc("myfunc", None) self.assertRaisesRegex( - ValueError, "Unregistered function in myfunc", sel.xpath, "myfunc()" + ValueError, + "Unregistered function in myfunc", + sel.xpath, + "myfunc()", ) diff --git a/tox.ini b/tox.ini index 52c08e58..8dba8d30 100644 --- a/tox.ini +++ b/tox.ini @@ -39,7 +39,7 @@ commands = deps = black commands = - black --check {posargs:parsel tests conftest.py setup.py} + black --line-length=79 --check {posargs:parsel tests docs conftest.py setup.py} [docs] changedir = docs