Handle keys that contain keywords such as "or", "and" or "with"
 * tokenize() now handles two cases: whether the Licensing was created
   with or without symbols. In the first case, the automaton-based
   tokenizer is used; otherwise, a plain regex-based splitter is used
   and stricter constraints are enforced on license symbols: they
   cannot contain spaces and only a trailing "+" is recognized as
   "or later".

Link: #9
Signed-off-by: Philippe Ombredanne <[email protected]>
pombredanne committed Apr 18, 2017
1 parent cd209a7 commit 6587a8d
Showing 3 changed files with 523 additions and 323 deletions.
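For illustration (not part of the commit), a minimal sketch of the two
tokenization paths described in the message above; the license keys used
here are invented examples::

    >>> from license_expression import Licensing, LicenseSymbol

    >>> # no known symbols: tokenize() falls back to the regex-based
    >>> # splitter(), so keys cannot contain spaces and a trailing "+"
    >>> # is read as "or later"
    >>> simple = Licensing()
    >>> tokens = list(simple.tokenize('gpl-2.0+ and mit'))

    >>> # known symbols: tokenize() scans with the Aho-Corasick automaton
    >>> # and can match keys that embed a keyword, e.g. the "or" in
    >>> # "gpl-2.0-or-later"
    >>> known = Licensing([LicenseSymbol('gpl-2.0-or-later'), LicenseSymbol('mit')])
    >>> tokens = list(known.tokenize('gpl-2.0-or-later and mit'))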
README.rst (6 changes: 3 additions & 3 deletions)
@@ -61,14 +61,14 @@ For example::
 
     >>> from license_expression import Licensing, LicenseSymbol
     >>> licensing = Licensing()
-    >>> expression = ' GPL-2.0 or LGPL 2.1 and mit '
+    >>> expression = ' GPL-2.0 or LGPL-2.1 and mit '
     >>> parsed = licensing.parse(expression)
-    >>> expected = 'GPL-2.0 OR (LGPL 2.1 AND mit)'
+    >>> expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)'
     >>> assert expected == parsed.render('{symbol.key}')
 
     >>> expected = [
     ...     LicenseSymbol('GPL-2.0'),
-    ...     LicenseSymbol('LGPL 2.1'),
+    ...     LicenseSymbol('LGPL-2.1'),
     ...     LicenseSymbol('mit')
     ... ]
     >>> assert expected == licensing.license_symbols(expression)
src/license_expression/__init__.py (126 changes: 106 additions & 20 deletions)
@@ -62,6 +62,10 @@
 from boolean.boolean import PARSE_UNKNOWN_TOKEN
 from boolean.boolean import ParseError
 from boolean.boolean import TOKEN_SYMBOL
+from boolean.boolean import TOKEN_AND
+from boolean.boolean import TOKEN_OR
+from boolean.boolean import TOKEN_LPAR
+from boolean.boolean import TOKEN_RPAR
 
 from license_expression._pyahocorasick import Trie as Scanner
 from license_expression._pyahocorasick import Output
@@ -103,14 +107,16 @@ class ExpressionError(Exception):
 TOKEN_OR_LATER = 11
 
 # actual keyword types
+KW_LPAR = Keyword('(', TOKEN_LPAR)
+KW_RPAR = Keyword(')', TOKEN_RPAR)
 _KEYWORDS = [
-    Keyword('and', boolean.TOKEN_AND),
-    Keyword('or', boolean.TOKEN_OR),
-    Keyword('(', boolean.TOKEN_LPAR),
-    Keyword(')', boolean.TOKEN_RPAR),
-    Keyword('with', TOKEN_WITH),
+    Keyword(' and ', TOKEN_AND),
+    Keyword(' or ', TOKEN_OR),
+    KW_LPAR,
+    KW_RPAR,
+    Keyword(' with ', TOKEN_WITH),
     Keyword('+', TOKEN_OR_LATER),
-    Keyword('or later', TOKEN_OR_LATER),
+    Keyword(' or later', TOKEN_OR_LATER),
 ]
 
 KEYWORDS = tuple(kw.value for kw in _KEYWORDS)
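For illustration (not part of the diff): because the keywords above are now
declared with surrounding spaces, a known key that merely embeds "or",
"and" or "with" as a substring is no longer split apart. A hedged sketch,
assuming the invented keys below are declared as known symbols::

    >>> from license_expression import Licensing, LicenseSymbol
    >>> l = Licensing([LicenseSymbol('gpl-2.0-or-later'), LicenseSymbol('mit')])
    >>> # the "or" inside "gpl-2.0-or-later" is not space-surrounded, so
    >>> # the " or " keyword cannot match inside the key: it stays whole
    >>> keys = l.license_keys(l.parse('gpl-2.0-or-later and mit'))
    >>> assert keys == ['gpl-2.0-or-later', 'mit']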
@@ -123,13 +129,13 @@ class Licensing(boolean.BooleanAlgebra):
     For example:
 
     >>> l = Licensing()
-    >>> expr = l.parse(" GPL-2.0 or LGPL 2.1 and mit ")
-    >>> expected = 'GPL-2.0 OR (LGPL 2.1 AND mit)'
+    >>> expr = l.parse(" GPL-2.0 or LGPL-2.1 and mit ")
+    >>> expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)'
     >>> assert expected == expr.render('{symbol.key}')
 
     >>> expected = [
     ...     LicenseSymbol('GPL-2.0'),
-    ...     LicenseSymbol('LGPL 2.1'),
+    ...     LicenseSymbol('LGPL-2.1'),
     ...     LicenseSymbol('mit')
     ... ]
     >>> assert expected == l.license_symbols(expr)
@@ -174,6 +180,8 @@ def __init__(self, symbols=tuple(), quiet=True):
             raise ValueError('\n'.join(warns + errors))
 
         # mapping of known symbols used for parsing and resolution as (key, symbol)
+        # TODO: inject lpar, rpar and surrounding spaces, before and after,
+        # e.g. "(sym)" "(sym " "sym)" " sym "
         self.known_symbols = {symbol.key: symbol for symbol in symbols}
 
         # Aho-Corasick automaton-based Scanner used for expression tokenizing
@@ -283,8 +291,8 @@ def license_keys(self, expression, unique=True, **kwargs):
         For example:
 
         >>> l = Licensing()
-        >>> expr = ' GPL-2.0 and mit or later with blabla and mit or LGPL 2.1 and mit and mit or later with GPL-2.0'
-        >>> expected = ['GPL-2.0', 'mit', 'blabla', 'LGPL 2.1']
+        >>> expr = ' GPL-2.0 and mit+ with blabla and mit or LGPL-2.1 and mit and mit+ with GPL-2.0'
+        >>> expected = ['GPL-2.0', 'mit', 'blabla', 'LGPL-2.1']
         >>> assert expected == l.license_keys(l.parse(expr))
         """
         symbols = self.license_symbols(expression, unique=False, decompose=True, **kwargs)
@@ -349,9 +357,9 @@ def parse(self, expression, validate=False, strict=False, **kwargs):
         the YYY symbol has `is_exception` set to False.
 
         For example:
-        >>> expression = 'EPL 1.0 and Apache 1.1 OR GPL 2.0 with Classpath exception'
+        >>> expression = 'EPL-1.0 and Apache-1.1 OR GPL-2.0 with Classpath-exception'
         >>> parsed = Licensing().parse(expression)
-        >>> expected = '(EPL 1.0 AND Apache 1.1) OR GPL 2.0 WITH Classpath exception'
+        >>> expected = '(EPL-1.0 AND Apache-1.1) OR GPL-2.0 WITH Classpath-exception'
         >>> assert expected == parsed.render(template='{symbol.key}')
         """
         if expression is None:
@@ -406,12 +414,16 @@ def tokenize(self, expression, strict=False):
 
         If `strict` is True, additional exceptions will be raised in an expression
         such as "XXX with ZZZ" if the XXX symbol has `is_exception` set to True or the
-        YYY symbol has `is_exception` set to False.
+        ZZZ symbol has `is_exception` set to False.
         """
-        scanner = self.get_scanner()
+        if self.known_symbols:
+            # scan with an automaton, recognize whole symbols+keywords or only keywords
+            scanner = self.get_scanner()
+            results = scanner.scan(expression)
+        else:
+            # scan with a simple regex-based splitter
+            results = splitter(expression)
 
-        # scan with an automaton, recognize whole symbols+keywords or only keywords
-        results = scanner.scan(expression)
         results = strip_and_skip_spaces(results)
         results = merge_or_later_results(results)
         result_groups = group_results_for_with_subexpression(results)
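For illustration (not part of the diff), the practical difference between
the two branches above, sketched with an invented spaced key::

    >>> from license_expression import Licensing, LicenseSymbol
    >>> # without known symbols, the splitter breaks "LGPL 2.1" on the
    >>> # space into two separate symbol tokens (plus "and" and "mit")
    >>> assert len(list(Licensing().tokenize('LGPL 2.1 and mit'))) == 4
    >>> # with the key declared, the automaton keeps "LGPL 2.1" whole
    >>> l = Licensing([LicenseSymbol('LGPL 2.1'), LicenseSymbol('mit')])
    >>> assert len(list(l.tokenize('LGPL 2.1 and mit'))) == 3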
@@ -471,9 +483,10 @@ def tokenize(self, expression, strict=False):
                     TOKEN_SYMBOL, string, start, PARSE_INVALID_EXPRESSION)
 
             # this is an "A with B" sequence of three results
-            lic_res, _WITH, exc_res = group
+            lic_res, WITH, exc_res = group
             pos = lic_res.start
-            token_string = ' '.join([t.string for t in group])
+            WITHs = ' ' + WITH.string.strip() + ' '
+            token_string = ''.join([lic_res.string, WITHs, exc_res.string])
 
             # licenses
             lic_out = lic_res.output
@@ -511,7 +524,7 @@
                 raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start,
                                  PARSE_INVALID_SYMBOL)
 
-            if strict and not exc_sym.is_exception:
+            if strict and self.known_symbols and not exc_sym.is_exception:
                 raise ParseError(TOKEN_SYMBOL, exc_res.string, exc_res.start,
                                  PARSE_INVALID_SYMBOL_AS_EXCEPTION)
 
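For illustration (not part of the diff): the strict check above now only
applies when the Licensing has known symbols, since without them a
symbol's `is_exception` cannot be known. A hedged sketch::

    >>> from license_expression import Licensing, LicenseSymbol
    >>> l = Licensing([LicenseSymbol('gpl-2.0'), LicenseSymbol('mit')])
    >>> # "mit" is not an exception symbol, so a strict parse of a WITH
    >>> # clause that uses it as one should raise a ParseError
    >>> failed = False
    >>> try:
    ...     parsed = l.parse('gpl-2.0 with mit', strict=True)
    ... except Exception:
    ...     failed = True
    >>> assert failed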
@@ -1189,3 +1202,76 @@ def validate_symbols(symbols, validate_keys=False, _keywords=KEYWORDS):
             errors.append('Duplicated or empty aliases ignored for license key: %(dupeal)r.' % locals())
 
     return warnings, errors
+
+
+_splitter = re.compile(r'''
+    (?P<symbol>[^\s\(\)]+)
+    |
+    (?P<space>\s+)
+    |
+    (?P<lpar>\()
+    |
+    (?P<rpar>\))
+    ''',
+    re.VERBOSE | re.MULTILINE | re.UNICODE
+).finditer
+
+
+def splitter(expression):
+    """
+    Return an iterable of Result describing each token given an
+    expression unicode string.
+
+    This is a simpler tokenizer used when the Licensing does not have
+    known symbols. The split is done on spaces and parens. Anything else
+    is either a keyword token or a license symbol.
+    """
+    if not expression:
+        return
+
+    if not isinstance(expression, str):
+        raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE)
+
+    # mapping of lowercase token strings to a keyword token
+    TOKENS = {
+        'and': Keyword(value='and', type=TOKEN_AND),
+        'or': Keyword(value='or', type=TOKEN_OR),
+        'with': Keyword(value='with', type=TOKEN_WITH),
+    }
+    KW_PLUS = Keyword(value='+', type=TOKEN_OR_LATER)
+
+    for match in _splitter(expression):
+        if not match:
+            continue
+
+        start, end = match.span()
+        # convert the exclusive end offset to an inclusive position
+        end = end - 1
+        mgd = match.groupdict()
+
+        space = mgd.get('space')
+        if space:
+            yield Result(start, end, space, None)
+
+        lpar = mgd.get('lpar')
+        if lpar:
+            yield Result(start, end, lpar, Output(lpar, KW_LPAR))
+
+        rpar = mgd.get('rpar')
+        if rpar:
+            yield Result(start, end, rpar, Output(rpar, KW_RPAR))
+
+        token_or_sym = mgd.get('symbol')
+        if not token_or_sym:
+            continue
+
+        token = TOKENS.get(token_or_sym.lower())
+        if token:
+            yield Result(start, end, token_or_sym, Output(token_or_sym, token))
+        elif token_or_sym.endswith('+') and token_or_sym != '+':
+            # split a trailing "+" into its own "or later" keyword token
+            val = token_or_sym[:-1]
+            sym = LicenseSymbol(key=val)
+            yield Result(start, end - 1, val, Output(val, sym))
+            yield Result(end, end, '+', Output('+', KW_PLUS))
+        else:
+            sym = LicenseSymbol(key=token_or_sym)
+            yield Result(start, end, token_or_sym, Output(token_or_sym, sym))
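For illustration (not part of the diff), how splitter() treats spaces,
keywords and a trailing "+", assuming the Result fields used above
(start, end, string, output)::

    >>> results = list(splitter('mit+ and gpl-2.0'))
    >>> # spaces yield Results with a None output; "mit+" yields two
    >>> # Results: the "mit" symbol and a separate "+" or-later keyword
    >>> [r.string for r in results]
    ['mit', '+', ' ', 'and', ' ', 'gpl-2.0']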
