Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for custom attr results to DropLowProbabilityItemPipeline. #125

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
46 changes: 37 additions & 9 deletions tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@

import pytest

from zyte_common_items import Article, ArticleListPage, Product, ProductNavigation
from zyte_common_items import (
Article,
ArticleListPage,
CustomAttributes,
CustomAttributesMetadata,
CustomAttributesValues,
Product,
ProductNavigation,
)
from zyte_common_items.pipelines import DropLowProbabilityItemPipeline

scrapy = pytest.importorskip("scrapy") # noqa
Expand Down Expand Up @@ -109,7 +117,7 @@ def test_get_threshold_for_item(


@pytest.mark.parametrize(
"items, item_proba, threshold, expected_stats_calls, expected_return",
"items, item_proba, threshold, expected_stats_calls",
[
(
[
Expand All @@ -126,7 +134,6 @@ def test_get_threshold_for_item(
("drop_low_probability_item/kept/Product", 1),
("drop_low_probability_item/kept/Article", 1),
],
True,
),
(
[
Expand All @@ -143,7 +150,6 @@ def test_get_threshold_for_item(
("drop_low_probability_item/kept/Product", 1),
("drop_low_probability_item/kept/Article", 1),
],
True,
),
(
[
Expand All @@ -160,13 +166,31 @@ def test_get_threshold_for_item(
("drop_low_probability_item/dropped/Product", 1),
("drop_low_probability_item/dropped/Article", 1),
],
None,
),
(
[
{
"product": MagicMock(spec=Product(url="http://example.com")),
"customAttributes": MagicMock(
spec=CustomAttributes(
values=CustomAttributesValues({"foo": "bar"}),
metadata=CustomAttributesMetadata(),
)
),
},
],
0.01,
0.1,
[
("drop_low_probability_item/processed", 1),
("drop_low_probability_item/processed/Product", 1),
("drop_low_probability_item/dropped", 1),
("drop_low_probability_item/dropped/Product", 1),
],
),
],
)
def test_process_item(
items, item_proba, threshold, expected_stats_calls, expected_return
):
def test_process_item(items, item_proba, threshold, expected_stats_calls):
mock_crawler = MagicMock(spec=["spider", "stats"])

pipeline = DropLowProbabilityItemPipeline(mock_crawler)
Expand All @@ -176,7 +200,11 @@ def test_process_item(
mock_get_threshold_for_item.return_value = threshold

for item in items:
item.get_probability.return_value = item_proba
if isinstance(item, dict):
real_item = item["product"]
else:
real_item = item
real_item.get_probability.return_value = item_proba
try:
returned_item = pipeline.process_item(item, mock_crawler.spider)
except scrapy.exceptions.DropItem as e:
Expand Down
19 changes: 15 additions & 4 deletions zyte_common_items/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from copy import deepcopy
from warnings import warn

from zyte_common_items import ae
from zyte_common_items import CustomAttributes, ae


class AEPipeline:
Expand Down Expand Up @@ -118,9 +118,20 @@ def get_item_name(self, item):
def process_item(self, item, spider):
from scrapy.exceptions import DropItem

item_name = self.get_item_name(item)
item_proba = item.get_probability()
threshold = self.get_threshold_for_item(item, spider)
if isinstance(item, dict):
# support for custom attrs
for item_type, item_instance in item.items():
if item_type is not CustomAttributes:
real_item = item_instance
break
else:
return item
else:
real_item = item

item_name = self.get_item_name(real_item)
item_proba = real_item.get_probability()
threshold = self.get_threshold_for_item(real_item, spider)

self.stats.inc_value("drop_low_probability_item/processed")
self.stats.inc_value(f"drop_low_probability_item/processed/{item_name}")
Expand Down
Loading