Skip to content

Commit

Permalink
Merge pull request #70 from zytedata/zyte-parsers-gtin
Browse files Browse the repository at this point in the history
Add gtin_processor.
  • Loading branch information
kmike authored Jan 12, 2024
2 parents b195b93 + 75fb11f commit bb10fd5
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 5 deletions.
4 changes: 4 additions & 0 deletions docs/usage/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ All ``descriptionHtml`` fields have the following processor enabled:

.. autofunction:: zyte_common_items.processors.description_html_processor

All ``gtin`` fields have the following processor enabled:

.. autofunction:: zyte_common_items.processors.gtin_processor

All ``price`` fields have the following processor enabled:

.. autofunction:: zyte_common_items.processors.price_processor
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"itemadapter>=0.8.0",
"price-parser>=0.3.4",
"web-poet>=0.14.0",
"zyte-parsers>=0.3.0",
"zyte-parsers>=0.4.0",
],
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
55 changes: 54 additions & 1 deletion tests/test_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
from price_parser import Price
from web_poet import HttpResponse, field
from zyte_parsers import Breadcrumb as zp_Breadcrumb
from zyte_parsers import Gtin as zp_Gtin
from zyte_parsers import extract_breadcrumbs

from zyte_common_items import BasePage, Breadcrumb, ProductPage
from zyte_common_items import BasePage, Breadcrumb, Gtin, ProductPage
from zyte_common_items.processors import (
_format_price,
brand_processor,
breadcrumbs_processor,
gtin_processor,
)

breadcrumbs_html = """
Expand Down Expand Up @@ -174,3 +176,54 @@ def brand(self):
)
def test_format_price(input_value, expected_value):
assert _format_price(Price.fromstring(input_value)) == expected_value


# Shared fixtures for test_gtin: a hyphenated ISBN-13 string, the same value
# wrapped in minimal HTML, and the single-item result expected from either.
gtin_str = "978-1-933624-34-1"
gtin_html = f"<span>{gtin_str}</span>"
gtin_expected = [Gtin("isbn13", "9781933624341")]


@pytest.mark.parametrize(
    "input_value,expected_value",
    [
        # Inputs expected to produce no GTIN at all.
        (None, None),
        ([], None),
        ("foo", None),
        (Selector(text="<html></html>"), None),
        (SelectorList([]), None),
        # Already-processed output passes through unchanged.
        (gtin_expected, gtin_expected),
        # Markup inputs: lxml element and parsel selector.
        (fromstring(gtin_html), gtin_expected),
        (Selector(text=gtin_html), gtin_expected),
        # Iterable of zyte_parsers Gtin objects.
        (
            [
                zp_Gtin("isbn13", "9781933624341"),
                zp_Gtin("isbn13", "9780525555360"),
            ],
            [
                Gtin("isbn13", "9781933624341"),
                Gtin("isbn13", "9780525555360"),
            ],
        ),
        # Plain string, then an iterable of strings.
        (gtin_str, gtin_expected),
        (
            [
                "978-1-933624-34-1",
                "97-805-25555-360",
            ],
            [
                Gtin("isbn13", "9781933624341"),
                Gtin("isbn13", "9780525555360"),
            ],
        ),
    ],
)
def test_gtin(input_value, expected_value):
    """Check ``gtin_processor`` when used as the ``out`` processor of a field.

    A throwaway page class exposes ``input_value`` through a ``gtin`` field
    whose output goes through ``gtin_processor``; reading the field must give
    ``expected_value``.
    """

    class _PageUnderTest(BasePage):
        @field(out=[gtin_processor])
        def gtin(self):
            return input_value

    instance = _PageUnderTest("http://www.example.com/blog/")  # type: ignore[arg-type]
    assert instance.gtin == expected_value
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ deps =
itemadapter==0.8.0
price-parser==0.3.4
web-poet==0.14.0
zyte-parsers==0.3.0
zyte-parsers==0.4.0

[testenv:docs]
changedir = docs
Expand Down
2 changes: 2 additions & 0 deletions zyte_common_items/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
breadcrumbs_processor,
description_html_processor,
description_processor,
gtin_processor,
price_processor,
simple_price_processor,
)
Expand Down Expand Up @@ -313,6 +314,7 @@ class Processors(BasePage.Processors):
breadcrumbs = [breadcrumbs_processor]
description = [description_processor]
descriptionHtml = [description_html_processor]
gtin = [gtin_processor]
price = [price_processor]
regularPrice = [simple_price_processor]

Expand Down
47 changes: 45 additions & 2 deletions zyte_common_items/processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@
from price_parser import Price
from web_poet.mixins import ResponseShortcutsMixin
from zyte_parsers import Breadcrumb as zp_Breadcrumb
from zyte_parsers import extract_brand_name, extract_breadcrumbs, extract_price

from zyte_parsers import Gtin as zp_Gtin
from zyte_parsers import (
extract_brand_name,
extract_breadcrumbs,
extract_gtin,
extract_price,
)

from . import Gtin
from .items import Breadcrumb


Expand Down Expand Up @@ -187,3 +194,39 @@ def description_processor(value: Any, page: Any) -> Any:
page._description_node = cleaned_node
page._description_str = cleaned_text
return cleaned_text


def gtin_processor(
    value: Union[SelectorList, Selector, HtmlElement, str], page: Any
) -> Any:
    """Convert the data into a list of :class:`~zyte_common_items.Gtin` objects if possible.

    Supported inputs are :class:`str`, :class:`~parsel.selector.Selector`,
    :class:`~parsel.selector.SelectorList`, :class:`~lxml.html.HtmlElement`, an
    iterable of :class:`str` and an iterable of :class:`zyte_parsers.Gtin`
    objects.

    Inputs (or iterable items) from which no GTIN can be extracted are
    skipped. Items of an iterable that are neither :class:`str` nor
    :class:`zyte_parsers.Gtin` are kept in the result unchanged. If the
    resulting list is empty, ``None`` is returned instead.

    Other inputs are returned as is.

    :param value: The raw field value to convert.
    :param page: The page object the field belongs to; not used by this
        processor.
    :return: A non-empty list of results, ``None`` when nothing was found in
        a supported input, or *value* itself when its type is unsupported.
    """

    def _from_zp_gtin(zp_value: zp_Gtin) -> Gtin:
        return Gtin(type=zp_value.type, value=zp_value.value)

    results = []
    if isinstance(value, SelectorList):
        for sel in value:
            if result := extract_gtin(sel):
                results.append(_from_zp_gtin(result))
    # ``str`` is itself iterable, so it must be handled before the generic
    # ``Iterable`` branch below.
    elif isinstance(value, (Selector, HtmlElement, str)):
        if result := extract_gtin(value):
            results.append(_from_zp_gtin(result))
    elif isinstance(value, Iterable):
        for item in value:
            if isinstance(item, zp_Gtin):
                results.append(_from_zp_gtin(item))
            elif isinstance(item, str):
                # extract_gtin() may find nothing; skip such strings instead
                # of failing on the missing result, as the branches above do.
                if result := extract_gtin(item):
                    results.append(_from_zp_gtin(result))
            else:
                results.append(item)
    else:
        return value
    return results or None

0 comments on commit bb10fd5

Please sign in to comment.