From 3c0068843e965e34aa0abced4259a6b9b0bac856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Jul 2024 09:25:18 +0200 Subject: [PATCH] Initial implementation (#1) Co-authored-by: Mikhail Korobov --- .bandit.yml | 3 + .coveragerc | 3 + .flake8 | 4 + .github/workflows/test.yml | 2 +- .gitignore | 2 + .pre-commit-config.yaml | 12 + README.rst | 4 +- dist/form2request-0.0.0.tar.gz | Bin 4457 -> 0 bytes docs/api.rst | 6 +- docs/conf.py | 17 +- docs/usage.rst | 219 +++++++++- form2request/__init__.py | 2 + form2request/_base.py | 239 ++++++++++ pyproject.toml | 2 + tests/test_main.py | 776 ++++++++++++++++++++++++++++++++- tox.ini | 22 +- 16 files changed, 1300 insertions(+), 13 deletions(-) create mode 100644 .bandit.yml create mode 100644 .coveragerc delete mode 100644 dist/form2request-0.0.0.tar.gz create mode 100644 form2request/_base.py diff --git a/.bandit.yml b/.bandit.yml new file mode 100644 index 0000000..2237265 --- /dev/null +++ b/.bandit.yml @@ -0,0 +1,3 @@ +skips: +- B101 # assert_used, needed for mypy +exclude_dirs: ['tests'] diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..0856c03 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,3 @@ +[report] +exclude_lines = + if TYPE_CHECKING: diff --git a/.flake8 b/.flake8 index 9ee8f89..f06e676 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,10 @@ [flake8] +extend-select = TC, TC1 ignore = +max-line-length = 88 per-file-ignores = + # F401: Imported but unused + form2request/__init__.py:F401 # D100-D104: Missing docstring docs/conf.py:D100 tests/__init__.py:D104 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d2ca22..70870f8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,7 +41,7 @@ jobs: fail-fast: false matrix: python-version: ['3.12'] - tox-job: ["pre-commit", "mypy", "docs", "twinecheck"] + tox-job: ["pre-commit", "mypy", "docs", "doctest", "twinecheck"] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/.gitignore b/.gitignore index b53725c..bc020ed 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /.coverage /coverage.xml +/dist/ +/.tox/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 663563f..6d5a2de 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,8 +17,20 @@ repos: - flake8-debugger - flake8-docstrings - flake8-string-format + - flake8-type-checking - repo: https://github.com/asottile/pyupgrade rev: v3.16.0 hooks: - id: pyupgrade args: [--py38-plus] +- repo: https://github.com/pycqa/bandit + rev: 1.7.9 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.18.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.4.2 diff --git a/README.rst b/README.rst index 586477c..a6b7f0d 100644 --- a/README.rst +++ b/README.rst @@ -20,8 +20,8 @@ form2request .. description starts -``form2request`` is an AI-powered Python 3.8+ library to build HTTP requests -out of HTML forms. +``form2request`` is a Python 3.8+ library to build HTTP requests out of HTML +forms. .. description ends diff --git a/dist/form2request-0.0.0.tar.gz b/dist/form2request-0.0.0.tar.gz deleted file mode 100644 index f99bed94a6483bfd6e1f6ffaf4415a2e7b5c50d8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4457 zcmZ{kc{J4j_s10}W#89^tR=~wbx5IXkti9mWs8ZilrV|xvSdk?A$ziveaYBD$WV64 zzGj^fGh^oc`qt ztc`B*^2N?p+F|L#KO|C}W}mcr@D(llAjP6|8rE)eqDXrIfwt_Bzn+SG+42feE_e|FdG#8Y^@+2g7!P*x#D8SIK#L z|F-#Q2MS7Yr7%b?9k+aK?fGR0MREuged+uP-RbL@AK=~6UjYolc6En<&)x?Jmj~I! zKE5g8b?T!BR3}xdbwznl7r287C!Uo4PIQvBhy~RxXw71pWIm)x>C<&=c$ zJMP@gFP415-JMyx71b5pb*iu_`HEPHTT1GB4Lzc4RbUv=-@)W9{KBv)0;#jJ{baG* zB_M7^u+rdedzSY5v~T9$c{TGr%s$>-T~}Rvy?J(G=&AOcFiX#$+$s^(Pp;{5`te(= zWiB1pWBu6^^L}ev>JYSjge8=HHGLR)UZm}2IRd_|WA>3T@_4dktp7y=k00AB-!F}x zQA=Mvq2#*ht~Ts(D3EZ|FSYHE8n~5Lm%p9(Ea8TumH6@$wAgg;pkBw{BUPxj;hi(g`~=c~b%*5vKq%LwAVrs6v_-HsBy!R2O!&NGo6 zH=VE+NY~}Co24cPEE`8k2=QAoA^Jc1Q9MtYu5DEUo&PtTXrO@4#(o^Shehu-TWJ=X28TG%b{@a}@LX!$fx0?UB z=P0zcL7y@t_3?$s8Q1evtNo+Tvy|@qwtNzWE)PMa@a~+7I8$ zl-~A>8e5IeTe_*m)7<;+087msptW=kjMaWWS5`zYi=26&lI(hT!)-NB?+deNXZd(- zRc4ih^iIS#ncZHH1aLhdHs+l$z)n)ol@_DhWr4a?&%VY3K8hVD3f~LS&aif)b<$#b zK{}gFnbKp;Tr0lj@eRi3Qb$7^8hE*l=K^eWHlY0v%m(OCV2c1MW|6o46XJRA0Q>G7 zckvh+e?mNxh>fdqAw{?m^;Yth&1t`KC!L6{B|OE1n8vQf7n)Tt@EPZCCM)N3OJj1`l=_s-9NkmgI`l@LaERI4 zXx={*i~C?!QR)0hD84XwoYdh?$_NX5c{PLK?N-;}(JxEo>4+q?EAL#>3#;^xT~QCM z<8TYeN>ecrNRVu}>`7}(>psZwNYdlaC!sKtx$2-s$F@1GIvF1MsvK*fVpA@WVI3Z3 zjZd}LeZ*+zCgE}t3+|e?m@W9y=!-o6Xrqwv@-(L}v#HRz^XC_N6pR$Vrg-mL;u)x+)raqc=qcYRw-z1LVyw6h= z!&~3!_C6d&=CVKgP#4?Hqs(LC?K01q-0+0mZBI22^zgWXwLUHm5_EzksOP@30ee0C zGwQ;G4gG}d$EGl zucuuUVZ@X%l@MAwPnN}Foy)<6mkt$C7k3}xu?$<75u)8AU}u($Y5+UkYv?QhZmj@P zi$xm=z5$XTr1DnImP+Pj^}yQ%@CHgpJ-t4E?OqD%3eblV$lxszPDa0jSS|`$HfQH; zKb;&G1hISQLaBKIJQ3tWcFMqo4EtxLKn6Gpv>K>t6bQhxt} zEdmFqW(^JGit(SGY?65LVJoOMf^?l zsn#D`yF85%yVaiEltRSD3tqNKBvYnP^I@Ti=zD?h!+d5z#j~438H}-=IVfeb=s8AB zo)c>hO7?Kt3O5b{{C=7}IpK!`KlJVfB8wI!Q z4Pp0>5<1WR?FD*^XtS@;Q9wliNRCrD_(AnAa8Y`Z1m86!Q!p)3B7dENx+#1LXHlQD zmPDmlCM7V1n*Ff-;GXp->$+M|0ZuioHGvW~uMk`lwGYSf@rpP!_d$+s&Q^5jn?0?x z6>;#Qkxf-d%4|s4iGFWXQJhio}Tq{NyqaV!qEJ7iebJ+(@ zmRb>fwRGkhmoiUukV{f;a>K1hZg~^(U1wRsjOJ!^nvW6Ei-VPH=xfXGO6FQB`UL!q zBY(fWB4c>y^d9rmCJ^>eVTU-NCvIir)pLMc}tJvp!+x{6S!N$g~pl~Nu! z(rt$w$S4H!7S?L^yxcpZ6h+@|b`ozun9=E}Unf_NhOR6 zu(NnXk6_ily@H%R|GeXvHri0dOAR;iTjJ|pE)Ow7fyE=1G0^DTuNuv;Zubbuw?%$N z0?QL?6dFunGzG*^U>JZ3{#U~&=wzU<#A@BMCDsFsj_g9n)P$E=Q9Ec`W>7!1B_ZAu z*@CReY~L>O>kjEk9LJ2m52zwM@@Ps0JK*0u?S?-e1Dtcg% zjQg_=)>qLyJ`GS9dDd{LeU>l?9B|;$(ivZ>e)}p~6zEgY2nuWqT>>1Zv(pCwms}*} zKP(Fgz`>j2AE2b1r^&1!ec2V%XYr(J%m5kb2Hm5J1Bnk!xq)m}jwO!UmL&C1*Tvdc9 z`@huk@7d`9WEW6{Ii+zztBXXt0wRWk5C!^A&3@|u7DUp9=M{=p1jPB?&fAI}+R<;@ zY31Tx45o|3ekd|8#XcTxM||0fCMFQE7L;^+6o>)@&1}VwB(!}TaEJou7Jz&QOg0lJ zQ0yFu@F3KhDV8#O!9i;Qz5r?$(ErFo59+^?m4{pwumy1xbj;I?Nq|!A0?y%KusAB< zV%FxxC8jyFtwjM1*dxgrGSL6E`cG4TuOr(52(~PBb41Kb(CFaZ z2g&j{1wozWSDgUh-o-rTDF;8Ke5pWSn?c2>yfUltXv7^}d$Wq)o_x6QbLqLCqA>l= zE8D7-)^A_ZPPMK;`@i%r$rLx`IHcc|+hn|N6jd(M!XaMJp@4f~C0#m#v+Rq_(O7SMBft~w{z2HCxn&0vr2^>9wkVR4_FI)L~1=srJl;>*1 zWa7O@Kd}XGzC9?Mx@(#pau=`BU&s<@ff)DJwL0$qL?-$2lvZ1z-qHh^K2`r*n`80J z|CYdm1Kt3(07ieU1v^u$V(=pn_NquEgf?Ut{7D4;tU&$eYtAE^^@<9brGW9Amek{V z4=Ro;Jef8+HdRTLHNNL*yh^a_V!77MV(TM&Wrs(x5J%PDIB^)6l1=v5cJ8waoh_3! zM;lN*V(^ja?MfcmM z(7y_1W_B~j2q8UdeEWUmzSoDsehR&J8AIrXorI-Ex_5)^MZDPKQV0w&Z+wvLJ!JLb z`n7!J;O5orR#QVkDmnAvEw1(`Cmx&M86HPt>&^MMi+93TkK5{2Gky?R>{NGYVCI-h z|F?(gUDzQIfpB5K+<_?j^-ISH1aCtJfob!Ofa+beI3?)1+2Wzg5Vlrc53n6n=7do> z4<~)v>UFe^;gNDnrt$f~a%AYmktq?H&7bw(kfu?3K#z=fG%XO^;(wC#)<~7)p`3Ze z_=Ry+_hgq_mhokZ_|l)r@S_k{kG<{3kBptR@+;x-5VZ9JIY-3JCqZX141;>;frBJL zj=cJa8}uL6X~|SgF9*#P9`C%#zyVCj?kX6jV&LjKZJ)ZIAHbH-$O!QDG#OO*64u-L4HR$MIig#vhQV6}$fNZep{gDVnz)=z&du|FO0>mIt zUM)taSAzIofRR$!I|lIA_rblJR~okeV5e0cQWI)X?(z2*p_rq@=59aj4Sv5f;o5)0 z-b2Mk0jqYPeq#Y`mNfFz#$3A%alJJ*V!(F?iYYhJgH-9aD8FIk_ z1pjAHS~7G9&L_Wsu!anO|Fd2_`GCxADjY66{rDvJ`{Rtfmmjn}pD*TtU_%~cW*XjO GH2(v8Z(#WV diff --git a/docs/api.rst b/docs/api.rst index 29c1176..18f8d4b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -2,4 +2,8 @@ API reference ============= -… +.. autofunction:: form2request.form2request + +.. autoclass:: form2request.Request + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index acc9c71..f2435e9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,11 +9,24 @@ html_theme = "sphinx_rtd_theme" -autodoc_member_order = "groupwise" - intersphinx_disabled_reftypes = [] intersphinx_mapping = { "lxml": ("https://lxml.de/apidoc/", None), "parsel": ("https://parsel.readthedocs.io/en/stable", None), "python": ("https://docs.python.org/3", None), + "scrapy": ("https://docs.scrapy.org/en/latest", None), } + +nitpick_ignore = [ + *( + ("py:class", cls) + for cls in ( + # https://github.com/sphinx-doc/sphinx/issues/11225 + "FormdataType", + "FormElement", + "HtmlElement", + "Selector", + "SelectorList", + ) + ), +] diff --git a/docs/usage.rst b/docs/usage.rst index d9237b5..08abe3a 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,4 +2,221 @@ Usage ===== -… +:ref:`Given an HTML form
`: + +.. _parsel-example: + +>>> from parsel import Selector +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +You can use :func:`~form2request.form2request` to generate form submission +request data: + +>>> from form2request import form2request +>>> req = form2request(form) +>>> req +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +:func:`~form2request.form2request` does not make requests, but you can use its +output to build requests with any HTTP client software, e.g. with the requests_ +library: + +.. _requests: https://requests.readthedocs.io/en/latest/ + +.. _requests-example: + +>>> import requests +>>> requests.request(req.method, req.url, headers=req.headers, data=req.body) # doctest: +SKIP + + +:func:`~form2request.form2request` supports :ref:`user-defined form data +`, :ref:`choosing a specific submit button (or none) `, and +:ref:`overriding form attributes `. + + +.. _form: + +Getting a form +============== + +:func:`~form2request.form2request` requires an HTML form object. You can get +one using :doc:`parsel `, as :ref:`seen above `, +or you can use :doc:`lxml `: + +.. _fromstring-example: + +>>> from lxml.html import fromstring +>>> root = fromstring(html, base_url="https://example.com") +>>> form = root.xpath("//form")[0] + +If you use a library or framework based on :doc:`parsel ` or +:doc:`lxml `, chances are they also let you get a form object. For +example, when using a :doc:`Scrapy ` response: + +>>> from scrapy.http import TextResponse +>>> response = TextResponse("https://example.com", body=html) +>>> form = response.css("form") + +Here are some examples of XPath expressions that can be useful to get a form +using parsel’s :meth:`Selector.xpath ` or +lxml’s :meth:`HtmlElement.xpath `: + +- To find a form by one of its attributes, such as ``id`` or ``name``, use + ``//form[@=""]``. For example, to find ``
`, ``#`` + (e.g. ``#foo``) finds by ``id``, and ``[=""]`` (e.g. + ``[name=foo]`` or ``[name="foo bar"]``) finds by any other attribute. + +- To find a form by index, by order of appearance in the HTML code, use + ``(//form)[n]``, where ``n`` is a 1-based index. For example, to find the + 2nd form, use ``(//form)[2]``. + +If you prefer, you could use the XPath of an element inside the form, and then +visit parent elements until you reach the form element. For example: + +.. code-block:: python + + element = root.xpath('//input[@name="zip_code"]')[0] + while True: + if element.tag == "form": + break + element = element.getparent() + form = element + + +.. _data: + +Setting form data +================= + +While there are forms made entirely of hidden fields, like :ref:`the one above +`, most often you will work with forms that expect +user-defined data: + +>>> html = b"""""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +Use the ``data`` parameter of :func:`~form2request.form2request`, to define +the corresponding data: + +>>> form2request(form, {"foo": "bar"}) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +You may sometimes find forms where more than one field has the same ``name`` +attribute: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +To specify values for all same-name fields, instead of a dictionary, use an +iterable of key-value tuples: + +>>> form2request(form, (("foo", "bar"), ("foo", "baz"))) +Request(url='https://example.com?foo=bar&foo=baz', method='GET', headers=[], body=b'') + +.. _remove-data: + +Sometimes, you might want to prevent a value from a field from being included +in the generated request data. For example, because the field is removed or +disabled through JavaScript, or because the field or a parent element has the +``disabled`` attribute (currently not supported by form2request): + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +To remove a field value, set it to ``None``: + +>>> form2request(form, {"foo": None}) +Request(url='https://example.com', method='GET', headers=[], body=b'') + + +.. _click: + +Choosing a submit button +======================== + +When an HTML form is submitted, the way form submission is triggered has an +impact on the resulting request data. + +Given a submit button with ``name`` and ``value`` attributes: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +If you submit the form by clicking that button, those attributes are included +in the request data, which is what :func:`~form2request.form2request` does +by default: + +>>> form2request(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +However, sometimes it is possible to submit a form without clicking a submit +button, even when there is such a button. In such cases, the button data should +not be part of the request data. For such cases, set ``click`` to ``False``: + +>>> form2request(form, click=False) +Request(url='https://example.com', method='GET', headers=[], body=b'') + +You may also find forms with more than one submit button: + +>>> html = b"""
""" +>>> selector = Selector(body=html, base_url="https://example.com") +>>> form = selector.css("form") + +By default, :func:`~form2request.form2request` clicks the first submit button: + +>>> form2request(form) +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +To change that, set ``click`` to the element that should be clicked: + +>>> submit_baz = form.css("[value=baz]") +>>> form2request(form, click=submit_baz) +Request(url='https://example.com?foo=baz', method='GET', headers=[], body=b'') + + +.. _override: + +Overriding form attributes +========================== + +You can override the method_ and enctype_ attributes of a form: + +.. _enctype: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/form#enctype +.. _method: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/form#method + +>>> form2request(form, method="POST", enctype="text/plain") +Request(url='https://example.com', method='POST', headers=[('Content-Type', 'text/plain')], body=b'foo=bar') + + +.. _request: + +Using request data +================== + +The output of :func:`~form2request.form2request`, +:class:`~form2request.Request`, is a simple request data container: + +>>> req = form2request(form) +>>> req +Request(url='https://example.com?foo=bar', method='GET', headers=[], body=b'') + +While :func:`~form2request.form2request` does not make requests, you can use +its output request data to build an actual request with any HTTP client +software, like the requests_ library (see an example :ref:`above +`) or the :doc:`Scrapy ` web scraping +framework: + +.. _Scrapy: https://docs.scrapy.org/en/latest/ + +>>> from scrapy import Request +>>> Request(req.url, method=req.method, headers=req.headers, body=req.body) + diff --git a/form2request/__init__.py b/form2request/__init__.py index 4908450..9868802 100644 --- a/form2request/__init__.py +++ b/form2request/__init__.py @@ -1 +1,3 @@ """Build HTTP requests out of HTML forms.""" + +from ._base import Request, form2request diff --git a/form2request/_base.py b/form2request/_base.py new file mode 100644 index 0000000..ceeb8f7 --- /dev/null +++ b/form2request/_base.py @@ -0,0 +1,239 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple, Union +from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit + +from parsel import Selector, SelectorList +from w3lib.html import strip_html5_whitespace + +if TYPE_CHECKING: + from lxml.html import FormElement # nosec + from lxml.html import HtmlElement # nosec + +FormdataVType = Union[str, Iterable[str]] +FormdataKVType = Tuple[str, FormdataVType] +FormdataType = Optional[Union[Dict[str, FormdataVType], Iterable[FormdataKVType]]] + + +def _parsel_to_lxml(element: HtmlElement | Selector | SelectorList) -> HtmlElement: + if isinstance(element, SelectorList): + element = element[0] + if isinstance(element, Selector): + element = element.root + return element + + +def _enctype( + form: FormElement, click_element: HtmlElement | None, enctype: None | str +) -> str: + if enctype: + enctype = enctype.lower() + if enctype not in {"application/x-www-form-urlencoded", "text/plain"}: + raise ValueError( + f"The specified form enctype ({enctype!r}) is not supported " + f"for forms with the POST method." + ) + elif click_element is not None and ( + enctype := (click_element.get("formenctype") or "").lower() + ): + if enctype == "multipart/form-data": + raise NotImplementedError( + f"{click_element} has formenctype set to {enctype!r}, which " + f"form2request does not currently support for forms with the " + f"POST method." + ) + elif ( + enctype := (form.get("enctype") or "").lower() + ) and enctype == "multipart/form-data": + raise NotImplementedError( + f"{form} has enctype set to {enctype!r}, which form2request does " + f"not currently support for forms with the POST method." + ) + return enctype + + +def _url(form: FormElement, click_element: HtmlElement | None) -> str: + if form.base_url is None: + raise ValueError(f"{form} has no base_url set.") + action = ( + click_element.get("formaction") if click_element is not None else None + ) or form.get("action") + if action is None: + return form.base_url + return urljoin(form.base_url, strip_html5_whitespace(action)) + + +USER = object() + + +def _method( + form: FormElement, click_element: HtmlElement | None, method: None | str +) -> str: + if method: + method_src = USER + else: + if click_element is not None: + method = click_element.get("formmethod") + if method: + method_src = click_element + else: + method = form.method + assert method is not None # lxml’s form.method is always filled + method_src = form + method = method.upper() + if method_src is USER and method not in {"GET", "POST"}: + raise ValueError(f"The specified form method ({method!r}) is not supported.") + if method == "DIALOG": + if method_src is click_element: + raise NotImplementedError( + f"Found unsupported form method {method!r} in the formmethod " + f"attribute of the submission button." + ) + raise NotImplementedError(f"Found unsupported form method {method!r}.") + if method not in {"GET", "POST"}: + method = "GET" + return method + + +def _click_element( + form: FormElement, click: None | bool | HtmlElement +) -> HtmlElement | None: + if click is False: + return None + if click is None or click is True: + clickables = list( + form.xpath( + 'descendant::input[re:test(@type, "^(submit|image)$", "i")]' + '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]', + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) + ) + if not clickables: + if click: + raise ValueError( + f"No clickable elements found in form {form}. Set click=False or " + f"point it to the element to be clicked." + ) + else: + return None + click = clickables[0] + else: + click = _parsel_to_lxml(click) + return click + + +def _data( + form: FormElement, data: FormdataType, click_element: HtmlElement | None +) -> list[tuple[str, str]]: + data = data or {} + if click_element is not None and (name := click_element.get("name")): + click_data = (name, click_element.get("value")) + if isinstance(data, dict): + data = dict(data) + data[click_data[0]] = click_data[1] + else: + data = list(data) + data.append(click_data) + keys = dict(data or ()).keys() + if not data: + data = [] + inputs = form.xpath( + "descendant::textarea" + "|descendant::select" + "|descendant::input[not(@type) or @type[" + ' not(re:test(., "^(?:submit|image|reset)$", "i"))' + " and (../@checked or" + ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]', + namespaces={"re": "http://exslt.org/regular-expressions"}, + ) + values: list[FormdataKVType] = [ + (k, "" if v is None else v) + for k, v in ((e.name, e.value) for e in inputs) + if k and k not in keys + ] + items = data.items() if isinstance(data, dict) else data + values.extend((k, v) for k, v in items if v is not None) + return [ + (k, v) + for k, vs in values + for v in ([vs] if isinstance(vs, (str, bytes)) else vs) + ] + + +@dataclass +class Request: + """HTTP request data.""" + + url: str + method: str + headers: list[tuple[str, str]] + body: bytes + + +def form2request( + form: FormElement | Selector | SelectorList, + data: FormdataType = None, + *, + click: None | bool | HtmlElement = None, + method: None | str = None, + enctype: None | str = None, +) -> Request: + """Return request data for an HTML form submission. + + *form* must be an instance of :class:`parsel.selector.Selector` or + :class:`parsel.selector.SelectorList` that points to an HTML form, or an + instance of :class:`lxml.html.FormElement`. + + *data* should be either a dictionary or a list of 2-item tuples indicating + the key-value pairs to include in the request as submission data. Keys with + ``None`` as value exclude matching form fields. + + *click* can be any of: + + - ``None`` (default): the first submission element of the form (e.g. a + submit button) is used to build a request for a click-based + form submission. + + If no submission elements are found, the request is built for a + non-click-based form submission, i.e. a form submission triggered by a + non-click event, such as pressing the Enter key while the focus is in + a single-line text input field of the form. + + - ``True`` behaves like ``None``, but raises a :exc:`ValueError` + exception if no submission element is found in the form. + + - ``False`` builds a request for a non-click-based form submission. + + - A submit button of *form*, to build a request for a form submission + based on the clicking of that button. + + On forms with multiple submit buttons, specifying the right button here + may be necessary. + + *method* and *enctype* may be used to override matching form attributes. + """ + form = _parsel_to_lxml(form) + click_element = _click_element(form, click) + url = _url(form, click_element) + method = _method(form, click_element, method) + headers = [] + body = "" + data = _data(form, data, click_element) + if method == "GET": + url = urlunsplit(urlsplit(url)._replace(query=urlencode(data, doseq=True))) + else: + assert method == "POST" + enctype = _enctype(form, click_element, enctype) + if enctype == "text/plain": + headers = [("Content-Type", "text/plain")] + body = "\n".join(f"{k}={v}" for k, v in data) + else: + headers = [("Content-Type", "application/x-www-form-urlencoded")] + body = urlencode(data, doseq=True) + return Request( + url=url, + method=method, + headers=headers, + body=body.encode(), + ) diff --git a/pyproject.toml b/pyproject.toml index a566415..8088639 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,8 @@ classifiers = [ requires-python = ">=3.8" dependencies = [ "lxml >= 4.4.1", + "parsel >= 1.8.1", + "w3lib >= 1.19.0", ] [project.urls] diff --git a/tests/test_main.py b/tests/test_main.py index e6ef5c8..82c9788 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,2 +1,774 @@ -def test_main(): - assert True +import pytest +from lxml.html import fromstring +from parsel import Selector + +from form2request import Request, form2request + + +@pytest.mark.parametrize( + ("base_url", "html", "kwargs", "expected"), + ( + # Empty form. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Hidden field. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data not defined by any form field. + # We need to support this, for example, to make it easy to deal with + # forms that may have fields injected with JavaScript. + ( + "https://example.com", + b"""
""", + {"data": {"a": "b"}}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data setting a value for a form field. + ( + "https://example.com", + b"""
""", + {"data": {"a": "b"}}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field. + # Also needed for JavaScript use cases. + ( + "https://example.com", + b"""
""", + {"data": {"a": "c"}}, + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # User data with None as value not present in the form is ignored. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data setting a value from a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # User data overriding the value of a form field to None removes that + # value. + ( + "https://example.com", + b"""
""", + {"data": {"a": None}}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # Form field with an unset value. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=", + "GET", + [], + b"", + ), + ), + # User data as an iterable of key-value tuples. + ( + "https://example.com", + b"""
""", + {"data": (("a", "b"), ("a", "c"))}, + Request( + "https://example.com?a=b&a=c", + "GET", + [], + b"", + ), + ), + # A submit button is “clicked” by default, i.e. its attributes are + # taken into account. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can disable the clicking of any submit button. + ( + "https://example.com", + b"""
""", + {"click": False}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ), + # You can force the clicking of the first submit button. + ( + "https://example.com", + b"""
""", + {"click": True}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # Forcing the clicking of the first submit button will trigger a + # ValueError if there are no submit buttons. + ( + "https://example.com", + b"""
""", + {"click": True}, + ValueError, + ), + # If there are 2 or more submit buttons, the first one is used by + # default. + ( + "https://example.com", + b"""
+
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # You can force a specific submit button to be used. + ( + "https://example.com", + b"""
+
""", + {"click": './/*[@value="c"]'}, + Request( + "https://example.com?a=c", + "GET", + [], + b"", + ), + ), + # Supported enctypes are application/x-www-form-urlencoded (default) + # and text/plain. Unknown enctypes are treated as the default one. + *( + ( + "https://example.com", + f"""
""".encode(), + {}, + Request( + "https://example.com", + "GET", + [], + b"", + ), + ) + for enctype in ( + "", + "application/x-www-form-urlencoded", + "text/plain", + "foo", + ) + ), + # multipart/form-data raises a NotImplementedError exception when the + # method is POST. + ( + "https://example.com", + b"""
""", + {}, + NotImplementedError, + ), + # multipart/form-data does work when method is GET (default). + ( + "https://example.com", + b"""
+
""", + {}, + Request( + "https://example.com?a=b", + "GET", + [], + b"", + ), + ), + # The formenctype from the submit button is taken into account, even if + # it has an unknown value. + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com", + "POST", + [("Content-Type", "application/x-www-form-urlencoded")], + b"", + ), + ), + ( + "https://example.com", + b"""
""", + {}, + Request( + "https://example.com", + "POST", + [("Content-Type", "application/x-www-form-urlencoded")], + b"", + ), + ), + ( + "https://example.com", + b"""
+
""", + {}, + NotImplementedError, + ), + # enctype may be overridden, in which case it raises ValueError for + # both unknown and unsupported values when method is POST. + ( + "https://example.com", + b"""
""", + {"enctype": "multipart/form-data"}, + ValueError, + ), + ( + "https://example.com", + b"""
""", + {"enctype": "a"}, + ValueError, + ), + # Only submit buttons are detected as such. + *( + ( + "https://example.com", + f"""
{button}