-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Avoid incorrect currency when no price available #20
base: master
Are you sure you want to change the base?
Changes from all commits
a57854b
32c025b
e90794d
1240656
c6d410c
b847644
b19a770
d14a509
d09418f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
# -*- coding: utf-8 -*- | ||
import itertools | ||
import re | ||
import string | ||
from typing import Callable, Optional, Pattern, List, Tuple | ||
|
@@ -56,8 +57,20 @@ def fromstring(cls, price: Optional[str], | |
|
||
|
||
def or_regex(symbols: List[str]) -> Pattern: | ||
""" Return a regex which matches any of ``symbols`` """ | ||
return re.compile('|'.join(re.escape(s) for s in symbols)) | ||
""" Return a regex which matches any of ``symbols`` surrounded by some special characters """ | ||
left_tokens = [r"^", r"\s+?", r"\d+?"] | ||
right_tokens = [r"$", r"\s+?", r"\d+?", r"[^a-zA-Z0-9]+?"] | ||
|
||
return re.compile( | ||
"|".join( | ||
[ | ||
left + "({})".format(re.escape(s)) + right | ||
for left, right in itertools.product(left_tokens, right_tokens) | ||
for s in symbols | ||
] | ||
) | ||
) | ||
Comment on lines
+61
to
+72
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I feel like this can be simplified a bit: symbols_pattern = '|'.join(re.escape(s) for s in symbols)
template = r'(?:^|\b|(?<=\d)){}(?:(?=\d)|\b|$)'
return re.compile(template.format(symbols_pattern)) If I’m not wrong, this should make some other changes below unnecessary, since we are no longer using a capture group. And since this function is now doing more than simply an OR regex, but also imposing word boundaries specific to currency symbols (it allows digits as word boundaries), maybe we need to deprecate the existing function and provide a new, _private one for the new logic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tested and it didn't work in all examples. |
||
|
||
|
||
|
||
# If one of these symbols is found either in price or in currency, | ||
|
@@ -96,16 +109,21 @@ def or_regex(symbols: List[str]) -> Pattern: | |
DOLLAR_CODES = [k for k in CURRENCY_CODES if k.endswith('D')] | ||
_DOLLAR_REGEX = re.compile( | ||
r''' | ||
\b | ||
(\b | ||
(?:{}) # currency code like NZD | ||
(?= | ||
\$? # dollar sign to ignore if attached to the currency code | ||
(?:[\W\d]|$) # not a letter | ||
) | ||
)) | ||
'''.format('|'.join(re.escape(k) for k in DOLLAR_CODES)), | ||
re.VERBOSE, | ||
) | ||
|
||
OTHER_PARTICULAR_REGEXES = [ | ||
# HT is the French abbreviation for "Hors Tax" (tax not added to the price) | ||
# and it may appear after € currency symbol | ||
r"(€)HT+?", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can’t the same be achieved by adding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
] | ||
|
||
# Other common currency symbols: 3-letter codes, less safe abbreviations | ||
OTHER_CURRENCY_SYMBOLS_SET = ( | ||
|
@@ -115,7 +133,7 @@ def or_regex(symbols: List[str]) -> Pattern: | |
CURRENCY_NATIONAL_SYMBOLS + | ||
|
||
# even if they appear in text, currency is likely to be rouble | ||
['р', 'Р'] | ||
['р', 'Р'], | ||
) | ||
- set(SAFE_CURRENCY_SYMBOLS) # already handled | ||
- {'-', 'XXX'} # placeholder values | ||
|
@@ -125,6 +143,7 @@ def or_regex(symbols: List[str]) -> Pattern: | |
key=len, reverse=True) | ||
|
||
_search_dollar_code = _DOLLAR_REGEX.search | ||
_search_other_particular_regexes = re.compile("|".join(OTHER_PARTICULAR_REGEXES), re.VERBOSE).search | ||
_search_safe_currency = or_regex(SAFE_CURRENCY_SYMBOLS).search | ||
_search_unsafe_currency = or_regex(OTHER_CURRENCY_SYMBOLS).search | ||
|
||
|
@@ -140,6 +159,7 @@ def extract_currency_symbol(price: Optional[str], | |
(_search_safe_currency, currency_hint), | ||
(_search_unsafe_currency, price), | ||
(_search_unsafe_currency, currency_hint), | ||
(_search_other_particular_regexes, price), | ||
] | ||
|
||
if currency_hint and '$' in currency_hint: | ||
|
@@ -151,7 +171,10 @@ def extract_currency_symbol(price: Optional[str], | |
for meth, attr in methods: | ||
m = meth(attr) if attr else None | ||
if m: | ||
return m.group(0) | ||
groups = [match for match in m.groups() if match is not None] | ||
assert groups | ||
|
||
return groups.pop() | ||
|
||
return None | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure about the non-greedy here, what's the result of this test case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That should not be a problem, since their match is not captured. But I guess it’s worth adding a test for it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both tests are passing. I included them in the test suite just in case...