Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid incorrect currency when no price available #20

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
35 changes: 29 additions & 6 deletions price_parser/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import itertools
import re
import string
from typing import Callable, Optional, Pattern, List, Tuple
Expand Down Expand Up @@ -56,8 +57,20 @@ def fromstring(cls, price: Optional[str],


def or_regex(symbols: List[str]) -> Pattern:
""" Return a regex which matches any of ``symbols`` """
return re.compile('|'.join(re.escape(s) for s in symbols))
""" Return a regex which matches any of ``symbols`` surrounded by some special characters """
left_tokens = [r"^", r"\s+?", r"\d+?"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure about the non-greedy here, what's the result of this test case?

    Example(None, '  Prices   in   EUR  ',
            'EUR', None, None),
    Example(None, '13800   ₶  ',
            '₶', '13800', 13800)
            '₶', '13800', 13800),

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That should not be a problem, since their match is not captured. But I guess it’s worth adding a test for it.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both tests are passing. I included them in the test suite just in case...

right_tokens = [r"$", r"\s+?", r"\d+?", r"[^a-zA-Z0-9]+?"]

return re.compile(
"|".join(
[
left + "({})".format(re.escape(s)) + right
for left, right in itertools.product(left_tokens, right_tokens)
for s in symbols
]
)
)
Comment on lines +61 to +72
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this can be simplified a bit:

symbols_pattern = '|'.join(re.escape(s) for s in symbols)
template = r'(?:^|\b|(?<=\d)){}(?:(?=\d)|\b|$)'
return re.compile(template.format(symbols_pattern))

If I’m not wrong, this should make some other changes below unnecessary, since we are no longer using a capture group.

And since this function is now doing more than simply an OR regex, but also imposing word boundaries specific to currency symbols (it allows digits as word boundaries), maybe we need to deprecate the existing function and provide a new, _private one for the new logic.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tested and it didn't work in all examples.
I don't consider simple any code that auto generate regexes. Your code may work but it will require more effort on it to solve all test cases and we could try to improve it, but for now, I believe that solving the existing issues is more interesting for the project.




# If one of these symbols is found either in price or in currency,
Expand Down Expand Up @@ -96,16 +109,21 @@ def or_regex(symbols: List[str]) -> Pattern:
DOLLAR_CODES = [k for k in CURRENCY_CODES if k.endswith('D')]
_DOLLAR_REGEX = re.compile(
r'''
\b
(\b
(?:{}) # currency code like NZD
(?=
\$? # dollar sign to ignore if attached to the currency code
(?:[\W\d]|$) # not a letter
)
))
'''.format('|'.join(re.escape(k) for k in DOLLAR_CODES)),
re.VERBOSE,
)

OTHER_PARTICULAR_REGEXES = [
# HT is the French abbreviation for "Hors Tax" (tax not added to the price)
# and it may appear after € currency symbol
r"(€)HT+?",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can’t the same be achieved by adding '€HT' to SAFE_CURRENCY_SYMBOLS?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

€HT is not a currency symbol (only is) , so it won't work just including there. It is a more specific case.

]

# Other common currency symbols: 3-letter codes, less safe abbreviations
OTHER_CURRENCY_SYMBOLS_SET = (
Expand All @@ -115,7 +133,7 @@ def or_regex(symbols: List[str]) -> Pattern:
CURRENCY_NATIONAL_SYMBOLS +

# even if they appear in text, currency is likely to be rouble
['р', 'Р']
['р', 'Р'],
)
- set(SAFE_CURRENCY_SYMBOLS) # already handled
- {'-', 'XXX'} # placeholder values
Expand All @@ -125,6 +143,7 @@ def or_regex(symbols: List[str]) -> Pattern:
key=len, reverse=True)

_search_dollar_code = _DOLLAR_REGEX.search
_search_other_particular_regexes = re.compile("|".join(OTHER_PARTICULAR_REGEXES), re.VERBOSE).search
_search_safe_currency = or_regex(SAFE_CURRENCY_SYMBOLS).search
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search

Expand All @@ -140,6 +159,7 @@ def extract_currency_symbol(price: Optional[str],
(_search_safe_currency, currency_hint),
(_search_unsafe_currency, price),
(_search_unsafe_currency, currency_hint),
(_search_other_particular_regexes, price),
]

if currency_hint and '$' in currency_hint:
Expand All @@ -151,7 +171,10 @@ def extract_currency_symbol(price: Optional[str],
for meth, attr in methods:
m = meth(attr) if attr else None
if m:
return m.group(0)
groups = [match for match in m.groups() if match is not None]
assert groups

return groups.pop()

return None

Expand Down
26 changes: 19 additions & 7 deletions tests/test_price_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,21 @@ def __eq__(self, other):
Example(None, '12,000원',
'원', '12,000', 12000),
Example(None, '3,500円',
'円', '3,500', 3500)
'円', '3,500', 3500),
Example(None, 'EUROPE',
None, None, None),
Example(None, 'NEUROLOGY PRICES',
None, None, None),
Example(None, 'Prices in EUR',
'EUR', None, None),
Example(None, 'Prices in EUR for all products',
'EUR', None, None),
Example(None, 'EUR is the selected currency',
'EUR', None, None),
Example(None, ' Prices in EUR ',
'EUR', None, None),
Example(None, '13800 ₶ ',
'₶', '13800', 13800)
]


Expand Down Expand Up @@ -1919,6 +1933,10 @@ def __eq__(self, other):
'CHF', '19.90', 19.90),
Example('', '530,42 Zł',
'Zł', '530,42', 530.42),
Example('>', 'См. цену в прайсе',
None, None, None),
Example('Купить', 'Печная труба',
None, None, None),
]


Expand All @@ -1941,12 +1959,6 @@ def __eq__(self, other):
Example('Cuneo', '61.858 L', # Romanian New Leu
'L', '61.858', 61858),

# "р" / "руб" is detected as currency
Example('>', 'См. цену в прайсе',
None, None, None),
Example('Купить', 'Печная труба',
None, None, None),

# dates
Example(None, 'July, 2004',
None, None, None),
Expand Down