From f9e73aa90134d3f208b5269e131b7453d1375011 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Tue, 18 Dec 2018 21:45:38 +0530 Subject: [PATCH 01/10] tools: Add pygments import script Currently, we can retrieve the regex patterns for the required tokens of all languages not found in the coAST schema. Closes #96 --- tools/pygments_import.py | 87 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 tools/pygments_import.py diff --git a/tools/pygments_import.py b/tools/pygments_import.py new file mode 100644 index 0000000..23fa044 --- /dev/null +++ b/tools/pygments_import.py @@ -0,0 +1,87 @@ +import os.path +import yaml +from collections import defaultdict +from itertools import chain + +import pygments +from pygments.token import Token +from pygments.lexers import get_all_lexers, get_lexer_by_name + +LANGUAGE_FOLD = os.path.abspath("/data/Language") + + +def get_coast_aliases(): + for lang_file in os.listdir(LANGUAGE_FOLD): + with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: + lang = yaml.load(f) + possible_names = set((lang['identifier'], lang_file.rstrip('.yaml'))) + possible_names.update(lang.get('aliases', [])) + if 'full_name' in lang: + possible_names.add(lang['full_name']) + yield list(possible_names) + + +def get_existing_lexers(): + for aliases in get_coast_aliases(): + for alias in aliases: + try: + yield get_lexer_by_name(alias) + break + except pygments.util.ClassNotFound: + pass + else: + print("No lexer for ", aliases[0]) + + +def find_new_lexers(): + known_lexers = list(get_existing_lexers()) + all_lexers = [get_lexer_by_name(lexer[1][0]) for lexer in get_all_lexers()] + print("Number of known lexers:", len(known_lexers)) + print("Number of total pygments lexers:", len(all_lexers)) + for lexer in all_lexers: + if not any(known_lexer.name == lexer.name + for known_lexer in known_lexers): + yield lexer + + +def get_lexer_patterns(lexer, required_token_types=[]): + patterns = defaultdict(list) + if not hasattr(lexer, 'tokens'): + print('Skipping {}: no tokens'.format(lexer.name)) + return patterns + + # no need to handle each section separately + for token in chain(*lexer.tokens.values()): + if not isinstance(token, tuple) or len(token) != 2: + continue + re_pattern, token_type = token + current_token_type = None + if not required_token_types: + current_token_type = token_type + for super_type in required_token_types: + assert super_type in Token + if token_type in super_type: + current_token_type = super_type + break + if not current_token_type: + continue + if isinstance(re_pattern, pygments.lexer.words): + re_pattern = re_pattern.get() + if not isinstance(re_pattern, str): + print("Invalid re_pattern for ", token_type, re_pattern) + continue + patterns[current_token_type].append(re_pattern) + return patterns + + +def process_pygments(): + required_token_types = [Token.Comment, Token.Keyword] + new_lexers = list(find_new_lexers()) + print("Number of unknown lexers:", len(new_lexers)) + lexer_patterns = {} + for lexer in new_lexers: + patterns = get_lexer_patterns(lexer, required_token_types) + if patterns: + lexer_patterns[lexer] = patterns + print("Number of new patterns:", len(lexer_patterns)) + print(lexer_patterns) From e7b6db4cafbfe0e9f837702401d92a740837b348 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Fri, 21 Dec 2018 22:57:40 +0530 Subject: [PATCH 02/10] tools: Add keywords extraction logic for pygments Many edge cases are yet to be covered. 
For now, the script simply skips over all the languages for which it was unable to parse the patterns properly. --- tools/pygments_import.py | 159 ++++++++++++++++++++++++++++++++++----- 1 file changed, 140 insertions(+), 19 deletions(-) diff --git a/tools/pygments_import.py b/tools/pygments_import.py index 23fa044..c26f81d 100644 --- a/tools/pygments_import.py +++ b/tools/pygments_import.py @@ -2,12 +2,15 @@ import yaml from collections import defaultdict from itertools import chain +import re import pygments from pygments.token import Token from pygments.lexers import get_all_lexers, get_lexer_by_name -LANGUAGE_FOLD = os.path.abspath("/data/Language") + +LANGUAGE_FOLD = os.path.abspath("../data/Language") # Remove '../' for travis +REQUIRED_TOKEN_TYPES = (Token.Keyword, ) def get_coast_aliases(): @@ -21,7 +24,7 @@ def get_coast_aliases(): yield list(possible_names) -def get_existing_lexers(): +def get_coast_lexers(): for aliases in get_coast_aliases(): for alias in aliases: try: @@ -30,11 +33,11 @@ def get_existing_lexers(): except pygments.util.ClassNotFound: pass else: - print("No lexer for ", aliases[0]) + print("No lexer for", aliases[0]) -def find_new_lexers(): - known_lexers = list(get_existing_lexers()) +def get_new_lexers(): + known_lexers = list(get_coast_lexers()) all_lexers = [get_lexer_by_name(lexer[1][0]) for lexer in get_all_lexers()] print("Number of known lexers:", len(known_lexers)) print("Number of total pygments lexers:", len(all_lexers)) @@ -44,7 +47,7 @@ def find_new_lexers(): yield lexer -def get_lexer_patterns(lexer, required_token_types=[]): +def get_lexer_patterns(lexer, required_token_types=()): patterns = defaultdict(list) if not hasattr(lexer, 'tokens'): print('Skipping {}: no tokens'.format(lexer.name)) @@ -67,21 +70,139 @@ def get_lexer_patterns(lexer, required_token_types=[]): continue if isinstance(re_pattern, pygments.lexer.words): re_pattern = re_pattern.get() - if not isinstance(re_pattern, str): - print("Invalid re_pattern for ", token_type, re_pattern) - continue patterns[current_token_type].append(re_pattern) return patterns +def clean_pattern(pattern): + """Remove unecessary parts from the regex pattern.""" + pattern = pattern.replace('?:', '').replace('\\b', '') + return pattern.replace('\\\\', '\\').replace('\\s+', ' ') + + +def split_on_paren(re_pattern): + """ Split the pattern into three parts, one enclosed by the outermost + parentheses, one to the left of opening paren, and one to the right.""" + parts = [part for part in re.split(r'(\W)', re_pattern) if part] + try: + left_ind = parts.index('(') + except ValueError: + left_ind = 0 + try: + right_ind = -parts[::-1].index(')') + except ValueError: + right_ind = 0 + prefix = ''.join(parts[:left_ind]) + middle = ''.join(parts[left_ind:right_ind or len(parts)]) + suffix = ''.join(parts[right_ind:]) if right_ind else '' + if any(c in suffix or c in prefix for c in '(|)'): + return '', re_pattern, '' + return prefix, middle, suffix + + +def get_subparts(re_pattern, depth=0): + """ Break down the pattern into smaller parts, due to '|'""" + + if not re_pattern: + return [] + if re_pattern[0] == '(' and re_pattern[-1] == ')': + re_pattern = re_pattern[1:-1] + parts = [part for part in re.split(r'(\W)', re_pattern) if part] + sub_parts = [] + prev_end = 0 + open_paren_count = 0 + + # Handle '|' metacharacter, match either of the two + for index, part in enumerate(parts): + if part == '(': + open_paren_count += 1 + elif part == ')': + open_paren_count -= 1 + elif part == '|' and open_paren_count == 
depth: + sub_parts.append(''.join(parts[prev_end:index])) + prev_end = index + 1 + sub_parts.append(''.join(parts[prev_end:])) + + # Handle '?' metacharacter, either 0 or 1 match + for index, sub_part in enumerate(sub_parts): + if sub_part.endswith(')?'): + prefix, middle, suffix = split_on_paren(sub_part) + sub_parts[index] = prefix + middle[1:-1] + sub_parts.append(prefix) + + # Expand '[]' metachars, match any one char inside + sub_parts_removed = [] # parts to be removed + for index, sub_part in enumerate(sub_parts): + if sub_part.startswith('[') and sub_part.endswith(']'): + prefix, middle, suffix = split_on_paren(sub_part[1:-1]) + parts = [] + if not prefix and not suffix: + parts = middle + else: + prefix.split() + [middle] + suffix.split() + for part in parts: + sub_parts.append(part) + sub_parts_removed.append(index) + + # remove original subpart, which contains [...] + for ind, to_be_removed in enumerate(sub_parts_removed): + del sub_parts[to_be_removed - ind] + + return sub_parts + + +def extract_keywords(re_pattern): + """ Recursively parse the regex pattern to find all the possible + strings that may match the pattern.""" + if not re_pattern: + return [''] + + sub_parts = get_subparts(re_pattern) + if len(sub_parts) == 1 and sub_parts[0] == re_pattern: + prefix, middle, suffix = split_on_paren(re_pattern) + if not suffix and not prefix: # no further splitting is possible + return sub_parts + + keywords = [] + for part in sub_parts: + prefix, middle, suffix = split_on_paren(part) + for keyword in extract_keywords(middle): + keywords.append(prefix + keyword + suffix) + return keywords + + +def convert_to_keywords(lexer_patterns): + lexer_keywords = defaultdict(list) + success = True + for pattern_type, patterns in lexer_patterns.items(): + for pattern in patterns: + keywords = extract_keywords(clean_pattern(pattern)) + if any(any(c in keyword for c in '(|)?') for keyword in keywords): + success = False + lexer_keywords[pattern_type].append(keywords) + return success, lexer_keywords + + def process_pygments(): - required_token_types = [Token.Comment, Token.Keyword] - new_lexers = list(find_new_lexers()) - print("Number of unknown lexers:", len(new_lexers)) - lexer_patterns = {} - for lexer in new_lexers: - patterns = get_lexer_patterns(lexer, required_token_types) - if patterns: - lexer_patterns[lexer] = patterns - print("Number of new patterns:", len(lexer_patterns)) - print(lexer_patterns) + all_keywords = {} + improper = {} + for lexer in get_new_lexers(): + patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) + if not patterns: + print('Skipping {}: no required tokens'.format(lexer.name)) + continue + success, keywords = convert_to_keywords(patterns) + if success: + all_keywords[lexer.name] = keywords + else: + improper[lexer.name] = (keywords, patterns) + + print("Found new keywords for {} languages.".format(len(all_keywords))) + print(*all_keywords.keys(), sep='\n') + print("Couldn't extract keywords for {} languages".format(len(improper))) + for lexer, data in improper.items(): + print(lexer, data[0], data[1], sep='\n\t') + return all_keywords, improper + + +all_keywords, improper = process_pygments() From 05173cc57495553eda89a283fa66547e55acbfe1 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Sun, 23 Dec 2018 21:58:00 +0530 Subject: [PATCH 03/10] tools: pygments: Verify extracted keyword using regex A simple way to check the correctness of the keyword, is to ensure that it matches the regex pattern that it was extracted from. 
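As a rough illustration of that check (a standalone sketch, not the script's exact code; the sample pattern and candidate keywords below are made up), each extracted keyword can be matched back against the compiled pattern it came from, rejecting partial matches:

    import re

    def verify_keywords(pattern, keywords):
        """Return the extracted keywords that fail to fully match their source pattern."""
        try:
            compiled = re.compile(pattern)
        except re.error:
            # If the raw pattern does not compile, fall back to a literal comparison.
            compiled = re.compile(re.escape(pattern))
        rejected = []
        for keyword in keywords:
            m = compiled.match(keyword)
            # A keyword only counts as valid if the whole string matches,
            # not just a prefix of it.
            if m is None or m.group() != keyword:
                rejected.append(keyword)
        return rejected

    # Hypothetical pattern in the Pygments keyword style; the last two
    # candidates are bogus (a truncated word and leftover metacharacter debris).
    pattern = r'\b(if|elif|else|while)\b'
    candidates = ['if', 'elif', 'else', 'while', 'whil', 'if)']
    print(verify_keywords(pattern, candidates))  # -> ['whil', 'if)']

Requiring a full-string match rather than a prefix match is what catches candidates that still carry stray '(', '|', ')' or '?' characters from the extraction step.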
--- tools/pygments_import.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/tools/pygments_import.py b/tools/pygments_import.py index c26f81d..1683a23 100644 --- a/tools/pygments_import.py +++ b/tools/pygments_import.py @@ -75,14 +75,26 @@ def get_lexer_patterns(lexer, required_token_types=()): def clean_pattern(pattern): - """Remove unecessary parts from the regex pattern.""" - pattern = pattern.replace('?:', '').replace('\\b', '') - return pattern.replace('\\\\', '\\').replace('\\s+', ' ') + """Unescape and remove unecessary parts from the regex pattern.""" + REPLACEMENTS = { + '?:': '', + '\\b': '', + '\\s+': ' ', + '\\s*': '', + '\\*': '*', + '\\-': '-', + '\\.': '.', + '\\?': '?', + '\\\\': '\\' + } + for orig, repl in REPLACEMENTS.items(): + pattern = pattern.replace(orig, repl) + return pattern def split_on_paren(re_pattern): - """ Split the pattern into three parts, one enclosed by the outermost - parentheses, one to the left of opening paren, and one to the right.""" + """Split the pattern into three parts, one enclosed by the outermost + parentheses, one to the left of opening paren, and one to the right.""" parts = [part for part in re.split(r'(\W)', re_pattern) if part] try: left_ind = parts.index('(') @@ -101,7 +113,7 @@ def split_on_paren(re_pattern): def get_subparts(re_pattern, depth=0): - """ Break down the pattern into smaller parts, due to '|'""" + """Break down the pattern into smaller parts, due to '|'""" if not re_pattern: return [] @@ -152,8 +164,8 @@ def get_subparts(re_pattern, depth=0): def extract_keywords(re_pattern): - """ Recursively parse the regex pattern to find all the possible - strings that may match the pattern.""" + """Recursively parse the regex pattern to find all the possible + strings that may match the pattern.""" if not re_pattern: return [''] @@ -176,8 +188,9 @@ def convert_to_keywords(lexer_patterns): success = True for pattern_type, patterns in lexer_patterns.items(): for pattern in patterns: + compiled = re.compile(pattern) keywords = extract_keywords(clean_pattern(pattern)) - if any(any(c in keyword for c in '(|)?') for keyword in keywords): + if any(compiled.match(keyword) is None for keyword in keywords): success = False lexer_keywords[pattern_type].append(keywords) return success, lexer_keywords @@ -198,7 +211,7 @@ def process_pygments(): improper[lexer.name] = (keywords, patterns) print("Found new keywords for {} languages.".format(len(all_keywords))) - print(*all_keywords.keys(), sep='\n') + print(*all_keywords.keys(), sep='\n' if len(all_keywords) < 10 else ', ') print("Couldn't extract keywords for {} languages".format(len(improper))) for lexer, data in improper.items(): print(lexer, data[0], data[1], sep='\n\t') From 896c3e48922782f352aed8d82cad72a44b02a103 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Tue, 1 Jan 2019 21:18:50 +0530 Subject: [PATCH 04/10] tools: Break up pygments import script --- tools/pygments-import/main.py | 108 ++++++++++++++++++ .../parser.py} | 105 ++--------------- 2 files changed, 116 insertions(+), 97 deletions(-) create mode 100644 tools/pygments-import/main.py rename tools/{pygments_import.py => pygments-import/parser.py} (55%) diff --git a/tools/pygments-import/main.py b/tools/pygments-import/main.py new file mode 100644 index 0000000..2189f5e --- /dev/null +++ b/tools/pygments-import/main.py @@ -0,0 +1,108 @@ +import os.path +import yaml +from collections import defaultdict +from itertools import chain + +import pygments +from pygments.token import 
Token +from pygments.lexers import get_all_lexers, get_lexer_by_name + +from parser import convert_to_keywords + +REQUIRED_TOKEN_TYPES = (Token.Keyword, ) + + +def read_coast_langs(): + # Remove '../../' for travis + LANGUAGE_FOLD = os.path.abspath("../../data/Language") + + for lang_file in os.listdir(LANGUAGE_FOLD): + with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: + yield lang_file, yaml.load(f) + + +coast_langs = dict(read_coast_langs()) + + +def get_coast_aliases(): + for lang_file, lang in coast_langs.items(): + possible_names = set((lang['identifier'], lang_file.rstrip('.yaml'))) + possible_names.update(lang.get('aliases', [])) + if 'full_name' in lang: + possible_names.add(lang['full_name']) + yield list(possible_names) + + +def get_coast_lexers(): + for aliases in get_coast_aliases(): + for alias in aliases: + try: + yield get_lexer_by_name(alias) + break + except pygments.util.ClassNotFound: + pass + else: + print("No lexer for", aliases[0]) + + +def get_new_lexers(): + known_lexers = list(get_coast_lexers()) + all_lexers = [get_lexer_by_name(lexer[1][0]) for lexer in get_all_lexers()] + print("Number of known lexers:", len(known_lexers)) + print("Number of total pygments lexers:", len(all_lexers)) + for lexer in all_lexers: + if not any(known_lexer.name == lexer.name + for known_lexer in known_lexers): + yield lexer + + +def get_lexer_patterns(lexer, required_token_types=()): + patterns = defaultdict(list) + if not hasattr(lexer, 'tokens'): + # print('Skipping {}: no tokens'.format(lexer.name)) + return patterns + + # no need to handle each section separately + for token in chain(*lexer.tokens.values()): + if not isinstance(token, tuple) or len(token) != 2: + continue + re_pattern, token_type = token + current_token_type = None + if not required_token_types: + current_token_type = token_type + for super_type in required_token_types: + assert super_type in Token + if token_type in super_type: + current_token_type = super_type + break + if not current_token_type: + continue + if isinstance(re_pattern, pygments.lexer.words): + re_pattern = re_pattern.get() + patterns[current_token_type].append(re_pattern) + return patterns + + +def process_pygments(): + all_keywords = {} + improper = {} + for lexer in get_new_lexers(): + patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) + if not patterns: + # print('Skipping {}: no required tokens'.format(lexer.name)) + continue + success, keywords = convert_to_keywords(patterns) + if success: + all_keywords[lexer.name] = keywords + else: + improper[lexer.name] = (keywords, patterns) + + print("Found new keywords for {} languages.".format(len(all_keywords))) + print(*all_keywords.keys(), sep='\n' if len(all_keywords) < 10 else ', ') + print("Couldn't extract keywords for {} languages".format(len(improper))) + for lexer, data in improper.items(): + print(lexer, data[0], data[1], sep='\n\t') + return all_keywords, improper + + +process_pygments() diff --git a/tools/pygments_import.py b/tools/pygments-import/parser.py similarity index 55% rename from tools/pygments_import.py rename to tools/pygments-import/parser.py index 1683a23..0e89fba 100644 --- a/tools/pygments_import.py +++ b/tools/pygments-import/parser.py @@ -1,82 +1,11 @@ -import os.path -import yaml from collections import defaultdict -from itertools import chain import re -import pygments -from pygments.token import Token -from pygments.lexers import get_all_lexers, get_lexer_by_name - - -LANGUAGE_FOLD = os.path.abspath("../data/Language") # Remove '../' for travis 
-REQUIRED_TOKEN_TYPES = (Token.Keyword, ) - - -def get_coast_aliases(): - for lang_file in os.listdir(LANGUAGE_FOLD): - with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: - lang = yaml.load(f) - possible_names = set((lang['identifier'], lang_file.rstrip('.yaml'))) - possible_names.update(lang.get('aliases', [])) - if 'full_name' in lang: - possible_names.add(lang['full_name']) - yield list(possible_names) - - -def get_coast_lexers(): - for aliases in get_coast_aliases(): - for alias in aliases: - try: - yield get_lexer_by_name(alias) - break - except pygments.util.ClassNotFound: - pass - else: - print("No lexer for", aliases[0]) - - -def get_new_lexers(): - known_lexers = list(get_coast_lexers()) - all_lexers = [get_lexer_by_name(lexer[1][0]) for lexer in get_all_lexers()] - print("Number of known lexers:", len(known_lexers)) - print("Number of total pygments lexers:", len(all_lexers)) - for lexer in all_lexers: - if not any(known_lexer.name == lexer.name - for known_lexer in known_lexers): - yield lexer - - -def get_lexer_patterns(lexer, required_token_types=()): - patterns = defaultdict(list) - if not hasattr(lexer, 'tokens'): - print('Skipping {}: no tokens'.format(lexer.name)) - return patterns - - # no need to handle each section separately - for token in chain(*lexer.tokens.values()): - if not isinstance(token, tuple) or len(token) != 2: - continue - re_pattern, token_type = token - current_token_type = None - if not required_token_types: - current_token_type = token_type - for super_type in required_token_types: - assert super_type in Token - if token_type in super_type: - current_token_type = super_type - break - if not current_token_type: - continue - if isinstance(re_pattern, pygments.lexer.words): - re_pattern = re_pattern.get() - patterns[current_token_type].append(re_pattern) - return patterns - def clean_pattern(pattern): """Unescape and remove unecessary parts from the regex pattern.""" REPLACEMENTS = { + '^': '', '?:': '', '\\b': '', '\\s+': ' ', @@ -85,6 +14,13 @@ def clean_pattern(pattern): '\\-': '-', '\\.': '.', '\\?': '?', + '\\+': '+', + '\\$': '$', + '\\!': '!', + '\\=': '=', + '\\<': '<', + '\\>': '>', + '\\ ': ' ', '\\\\': '\\' } for orig, repl in REPLACEMENTS.items(): @@ -194,28 +130,3 @@ def convert_to_keywords(lexer_patterns): success = False lexer_keywords[pattern_type].append(keywords) return success, lexer_keywords - - -def process_pygments(): - all_keywords = {} - improper = {} - for lexer in get_new_lexers(): - patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) - if not patterns: - print('Skipping {}: no required tokens'.format(lexer.name)) - continue - success, keywords = convert_to_keywords(patterns) - if success: - all_keywords[lexer.name] = keywords - else: - improper[lexer.name] = (keywords, patterns) - - print("Found new keywords for {} languages.".format(len(all_keywords))) - print(*all_keywords.keys(), sep='\n' if len(all_keywords) < 10 else ', ') - print("Couldn't extract keywords for {} languages".format(len(improper))) - for lexer, data in improper.items(): - print(lexer, data[0], data[1], sep='\n\t') - return all_keywords, improper - - -all_keywords, improper = process_pygments() From 0cba7a95bf36cdff7376682c3a7d368d6b71f6d6 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Thu, 3 Jan 2019 15:53:30 +0530 Subject: [PATCH 05/10] pygments: Use words.words instead of words.get --- tools/pygments-import/main.py | 12 +++++++----- tools/pygments-import/parser.py | 5 ++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git 
a/tools/pygments-import/main.py b/tools/pygments-import/main.py index 2189f5e..e7e321b 100644 --- a/tools/pygments-import/main.py +++ b/tools/pygments-import/main.py @@ -22,6 +22,8 @@ def read_coast_langs(): coast_langs = dict(read_coast_langs()) +pygments_lexers = {lexer[1][0]: get_lexer_by_name(lexer[1][0]) + for lexer in get_all_lexers()} def get_coast_aliases(): @@ -47,10 +49,9 @@ def get_coast_lexers(): def get_new_lexers(): known_lexers = list(get_coast_lexers()) - all_lexers = [get_lexer_by_name(lexer[1][0]) for lexer in get_all_lexers()] print("Number of known lexers:", len(known_lexers)) - print("Number of total pygments lexers:", len(all_lexers)) - for lexer in all_lexers: + print("Number of total pygments lexers:", len(pygments_lexers)) + for lexer in pygments_lexers.values(): if not any(known_lexer.name == lexer.name for known_lexer in known_lexers): yield lexer @@ -78,8 +79,9 @@ def get_lexer_patterns(lexer, required_token_types=()): if not current_token_type: continue if isinstance(re_pattern, pygments.lexer.words): - re_pattern = re_pattern.get() - patterns[current_token_type].append(re_pattern) + patterns[current_token_type].extend(re_pattern.words) + else: + patterns[current_token_type].append(re_pattern) return patterns diff --git a/tools/pygments-import/parser.py b/tools/pygments-import/parser.py index 0e89fba..a0e731f 100644 --- a/tools/pygments-import/parser.py +++ b/tools/pygments-import/parser.py @@ -124,7 +124,10 @@ def convert_to_keywords(lexer_patterns): success = True for pattern_type, patterns in lexer_patterns.items(): for pattern in patterns: - compiled = re.compile(pattern) + try: + compiled = re.compile(pattern) + except re.sre_compile.error: + compiled = re.compile(re.escape(pattern)) keywords = extract_keywords(clean_pattern(pattern)) if any(compiled.match(keyword) is None for keyword in keywords): success = False From a47c3732afc7a2d55b42b33a2d916d323ca654a2 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Wed, 16 Jan 2019 12:58:10 +0530 Subject: [PATCH 06/10] pygments: Update coast lang defs from lexers --- tools/pygments-import/main.py | 178 +++++++++++++++++++++----------- tools/pygments-import/parser.py | 21 ++-- 2 files changed, 130 insertions(+), 69 deletions(-) diff --git a/tools/pygments-import/main.py b/tools/pygments-import/main.py index e7e321b..a893884 100644 --- a/tools/pygments-import/main.py +++ b/tools/pygments-import/main.py @@ -1,66 +1,96 @@ -import os.path +import re import yaml +import os.path from collections import defaultdict from itertools import chain +from packaging.version import Version, InvalidVersion import pygments from pygments.token import Token -from pygments.lexers import get_all_lexers, get_lexer_by_name +from pygments.lexers import get_all_lexers, find_lexer_class_by_name from parser import convert_to_keywords REQUIRED_TOKEN_TYPES = (Token.Keyword, ) +# Remove '../../' for travis +LANGUAGE_FOLD = os.path.abspath("../../data/Language") -def read_coast_langs(): - # Remove '../../' for travis - LANGUAGE_FOLD = os.path.abspath("../../data/Language") - - for lang_file in os.listdir(LANGUAGE_FOLD): - with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: - yield lang_file, yaml.load(f) +# Read coast language definitions +coast_langs = {} +for lang_file in os.listdir(LANGUAGE_FOLD): + with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: + coast_langs[lang_file.rstrip('.yaml')] = yaml.load(f) - -coast_langs = dict(read_coast_langs()) -pygments_lexers = {lexer[1][0]: get_lexer_by_name(lexer[1][0]) +# # Get all pygments lexers 
+pygments_lexers = {find_lexer_class_by_name(lexer[1][0]) for lexer in get_all_lexers()} -def get_coast_aliases(): - for lang_file, lang in coast_langs.items(): - possible_names = set((lang['identifier'], lang_file.rstrip('.yaml'))) - possible_names.update(lang.get('aliases', [])) - if 'full_name' in lang: - possible_names.add(lang['full_name']) - yield list(possible_names) - - -def get_coast_lexers(): - for aliases in get_coast_aliases(): - for alias in aliases: +def get_coast_lang_lexers(lang_filename, lang): + def gen_versioned_names(name, version): + try: + version = Version(version) + # print(version.) + except InvalidVersion: + return + yield name + " " + str(version.release[0]) + yield name + str(version.release[0]) + yield name + " " + version.base_version + yield name + version.base_version + + possible_names = {lang_filename, lang['identifier']} + possible_names.update(lang.get('aliases', [])) + if 'full_name' in lang: + possible_names.add(lang['full_name']) + + lexers = set() + + for alias in possible_names: + try: + lex = find_lexer_class_by_name(alias) + except pygments.util.ClassNotFound: + pass + else: + lexers.add(lex) + break + else: + print("No lexer for", lang_filename) + return + + for version in lang.get('versions', '').split(', '): + for versioned_name in gen_versioned_names(lex.name, version): try: - yield get_lexer_by_name(alias) - break + versioned_lex = find_lexer_class_by_name(versioned_name) except pygments.util.ClassNotFound: pass - else: - print("No lexer for", aliases[0]) + else: + lexers.add(versioned_lex) + break + return tuple(lexers) -def get_new_lexers(): - known_lexers = list(get_coast_lexers()) - print("Number of known lexers:", len(known_lexers)) - print("Number of total pygments lexers:", len(pygments_lexers)) - for lexer in pygments_lexers.values(): - if not any(known_lexer.name == lexer.name - for known_lexer in known_lexers): - yield lexer +coast_lexers = {lang_file: get_coast_lang_lexers(lang_file, lang) + for lang_file, lang in coast_langs.items()} + + +def update_coast_def(lang, lexer): + lexer_data = extract_lexer_data(lexer) + + def update_list(param): + lex_keywords = set(lexer_data.get(param, [])) + lang_keywords = set(lang.get(param, [])) + lang_keywords.update(lex_keywords) + if lang_keywords: + lang[param] = list(sorted(lang_keywords)) + + update_list('keywords') def get_lexer_patterns(lexer, required_token_types=()): patterns = defaultdict(list) if not hasattr(lexer, 'tokens'): - # print('Skipping {}: no tokens'.format(lexer.name)) + print('Skipping {}: no tokens'.format(lexer.name)) return patterns # no need to handle each section separately @@ -85,26 +115,58 @@ def get_lexer_patterns(lexer, required_token_types=()): return patterns -def process_pygments(): - all_keywords = {} - improper = {} - for lexer in get_new_lexers(): - patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) - if not patterns: - # print('Skipping {}: no required tokens'.format(lexer.name)) - continue - success, keywords = convert_to_keywords(patterns) +def extract_lexer_data(lexer): + data = {} + patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) + if patterns.get(Token.Keyword): + success, keywords = convert_to_keywords(patterns[Token.Keyword]) if success: - all_keywords[lexer.name] = keywords + data['keywords'] = list(chain(*keywords)) + return data + + +def detect_versioned_langs(name1, name2): + realname1, version1 = parse_lang_name(name1) + realname2, version2 = parse_lang_name(name2) + if version1 is None and version2 is None: + return False + 
elif realname1 == realname2: + return True, version1, version2 + return False + + +def parse_lang_name(name): + """Identify the version number of a language from its name.""" + try: + realname, version = name.rsplit(maxsplit=1) + version = Version(version) + return realname, version + except (ValueError, InvalidVersion): # look for trailing numbers + match = re.match(r"(?P.*?)(?P[0-9.]+)$", name) + if match: + realname, version = match.groupdict().values() + try: + version = Version(version) + return realname, version + except InvalidVersion: + return name, None else: - improper[lexer.name] = (keywords, patterns) - - print("Found new keywords for {} languages.".format(len(all_keywords))) - print(*all_keywords.keys(), sep='\n' if len(all_keywords) < 10 else ', ') - print("Couldn't extract keywords for {} languages".format(len(improper))) - for lexer, data in improper.items(): - print(lexer, data[0], data[1], sep='\n\t') - return all_keywords, improper - - -process_pygments() + return name, None + + +if __name__ == '__main__': + import json + import difflib + c = coast_langs['Python'] + s = json.dumps(c, indent=4).splitlines(keepends=True) + l = get_coast_lang_lexers('Python', c) + update_coast_def(c, l[0]) + b = json.dumps(c, indent=4).splitlines(keepends=True) + print(*difflib.unified_diff(s, b)) + + # lexer_product = ((x, y) for i, x in enumerate(pygments_lexers) + # for y in pygments_lexers[i + 1:]) + # versioned_lexers = list( + # filter(lambda x: detect_versioned_langs(x[0].name, x[1].name), + # lexer_product)) + # print(versioned_lexers) diff --git a/tools/pygments-import/parser.py b/tools/pygments-import/parser.py index a0e731f..c169878 100644 --- a/tools/pygments-import/parser.py +++ b/tools/pygments-import/parser.py @@ -120,16 +120,15 @@ def extract_keywords(re_pattern): def convert_to_keywords(lexer_patterns): - lexer_keywords = defaultdict(list) + lexer_keywords = [] success = True - for pattern_type, patterns in lexer_patterns.items(): - for pattern in patterns: - try: - compiled = re.compile(pattern) - except re.sre_compile.error: - compiled = re.compile(re.escape(pattern)) - keywords = extract_keywords(clean_pattern(pattern)) - if any(compiled.match(keyword) is None for keyword in keywords): - success = False - lexer_keywords[pattern_type].append(keywords) + for pattern in lexer_patterns: + try: + compiled = re.compile(pattern) + except re.sre_compile.error: + compiled = re.compile(re.escape(pattern)) + keywords = extract_keywords(clean_pattern(pattern)) + if any(compiled.match(keyword) is None for keyword in keywords): + success = False + lexer_keywords.append(keywords) return success, lexer_keywords From 9bc0f90c13174d33c54e15d63a13853e804d6dc2 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Wed, 6 Mar 2019 10:52:04 +0530 Subject: [PATCH 07/10] pygments-parser: Raise exception on failure --- tools/pygments-import/parser.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tools/pygments-import/parser.py b/tools/pygments-import/parser.py index c169878..109f2eb 100644 --- a/tools/pygments-import/parser.py +++ b/tools/pygments-import/parser.py @@ -1,7 +1,13 @@ -from collections import defaultdict import re +class ParseException(Exception): + """Represents failed conversion""" + + def __init__(self, keywords): + self.keywords = keywords # partially parsed keywords + + def clean_pattern(pattern): """Unescape and remove unecessary parts from the regex pattern.""" REPLACEMENTS = { @@ -10,6 +16,7 @@ def clean_pattern(pattern): '\\b': 
'', '\\s+': ' ', '\\s*': '', + '\\S+': '', '\\*': '*', '\\-': '-', '\\.': '.', @@ -21,6 +28,10 @@ def clean_pattern(pattern): '\\<': '<', '\\>': '>', '\\ ': ' ', + '\\)': ')', + '\\|': '|', + '\\[': '[', + '\\]': ']', '\\\\': '\\' } for orig, repl in REPLACEMENTS.items(): @@ -82,6 +93,9 @@ def get_subparts(re_pattern, depth=0): sub_parts_removed = [] # parts to be removed for index, sub_part in enumerate(sub_parts): if sub_part.startswith('[') and sub_part.endswith(']'): + if any(e in sub_part for e in ('a-z', 'A-Z', '0-9')): + sub_parts_removed.append(index) + continue prefix, middle, suffix = split_on_paren(sub_part[1:-1]) parts = [] if not prefix and not suffix: @@ -115,7 +129,7 @@ def extract_keywords(re_pattern): for part in sub_parts: prefix, middle, suffix = split_on_paren(part) for keyword in extract_keywords(middle): - keywords.append(prefix + keyword + suffix) + keywords.append((prefix + keyword + suffix).strip()) return keywords @@ -128,7 +142,12 @@ def convert_to_keywords(lexer_patterns): except re.sre_compile.error: compiled = re.compile(re.escape(pattern)) keywords = extract_keywords(clean_pattern(pattern)) - if any(compiled.match(keyword) is None for keyword in keywords): - success = False - lexer_keywords.append(keywords) - return success, lexer_keywords + for keyword in keywords: + m = compiled.match(keyword) + if m is None or m.group() != keyword: + success = False + break + lexer_keywords.extend(keywords) + if not success: + raise ParseException(lexer_keywords) + return lexer_keywords From 02388005d469f582d6818f016b4fd22da482cef2 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Wed, 6 Mar 2019 10:53:50 +0530 Subject: [PATCH 08/10] pygments: Overhaul processing logic --- tools/pygments-import/main.py | 272 ++++++++++++++++++++-------------- 1 file changed, 160 insertions(+), 112 deletions(-) diff --git a/tools/pygments-import/main.py b/tools/pygments-import/main.py index a893884..9ecd8e7 100644 --- a/tools/pygments-import/main.py +++ b/tools/pygments-import/main.py @@ -1,17 +1,18 @@ +import inspect +import os import re -import yaml -import os.path from collections import defaultdict -from itertools import chain - +from itertools import chain, groupby +import yaml from packaging.version import Version, InvalidVersion import pygments -from pygments.token import Token +from pygments.token import Token, Name from pygments.lexers import get_all_lexers, find_lexer_class_by_name -from parser import convert_to_keywords -REQUIRED_TOKEN_TYPES = (Token.Keyword, ) +from parser import convert_to_keywords, ParseException + +REQUIRED_TOKEN_TYPES = (Token.Keyword, Name.Builtin) # Remove '../../' for travis LANGUAGE_FOLD = os.path.abspath("../../data/Language") @@ -22,75 +23,30 @@ with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: coast_langs[lang_file.rstrip('.yaml')] = yaml.load(f) -# # Get all pygments lexers -pygments_lexers = {find_lexer_class_by_name(lexer[1][0]) - for lexer in get_all_lexers()} - - -def get_coast_lang_lexers(lang_filename, lang): - def gen_versioned_names(name, version): - try: - version = Version(version) - # print(version.) 
- except InvalidVersion: - return - yield name + " " + str(version.release[0]) - yield name + str(version.release[0]) - yield name + " " + version.base_version - yield name + version.base_version - - possible_names = {lang_filename, lang['identifier']} - possible_names.update(lang.get('aliases', [])) - if 'full_name' in lang: - possible_names.add(lang['full_name']) - - lexers = set() - - for alias in possible_names: - try: - lex = find_lexer_class_by_name(alias) - except pygments.util.ClassNotFound: - pass - else: - lexers.add(lex) - break - else: - print("No lexer for", lang_filename) - return - for version in lang.get('versions', '').split(', '): - for versioned_name in gen_versioned_names(lex.name, version): +def parse_lang_name(name): + """Identify the version number of a language if present in its name.""" + try: + realname, version = name.rsplit(maxsplit=1) + version = Version(version) + return realname, version + except (ValueError, InvalidVersion): # look for trailing numbers + match = re.match(r"(?P.*?)(?P[0-9.]+)$", name) + if match: + realname, version = match.groupdict().values() try: - versioned_lex = find_lexer_class_by_name(versioned_name) - except pygments.util.ClassNotFound: - pass - else: - lexers.add(versioned_lex) - break - return tuple(lexers) - - -coast_lexers = {lang_file: get_coast_lang_lexers(lang_file, lang) - for lang_file, lang in coast_langs.items()} - - -def update_coast_def(lang, lexer): - lexer_data = extract_lexer_data(lexer) - - def update_list(param): - lex_keywords = set(lexer_data.get(param, [])) - lang_keywords = set(lang.get(param, [])) - lang_keywords.update(lex_keywords) - if lang_keywords: - lang[param] = list(sorted(lang_keywords)) - - update_list('keywords') + version = Version(version) + return realname, version + except InvalidVersion: + return name, None + else: + return name, None def get_lexer_patterns(lexer, required_token_types=()): patterns = defaultdict(list) if not hasattr(lexer, 'tokens'): - print('Skipping {}: no tokens'.format(lexer.name)) + # print('Skipping {}: no tokens'.format(lexer.name)) return patterns # no need to handle each section separately @@ -117,56 +73,148 @@ def get_lexer_patterns(lexer, required_token_types=()): def extract_lexer_data(lexer): data = {} + + data['name'] = lexer.name + + aliases = getattr(lexer, 'aliases', []) + + if len(aliases) < 10: + data['aliases'] = aliases + else: # dirty workaround for powershell + print("Too many aliases in", data['name']) + data['aliases'] = [] + + if not data['name'].replace(' ', '').isalnum(): + print(data['name'], data['aliases'], inspect.getsourcefile(lexer)) + + filenames = getattr(lexer, 'filenames', []) + data['extensions'] = [] + data['filenames'] = [] + for name in filenames: + if '*.' 
in name: + ext = name[name.rfind('.') + 1:] + m = re.match(r'(\w*)\[(\d+)\]$', ext) + if m: + name, versions = m.groups() + if not name: + continue + data['versions'] = [] + for ver in versions: + data['extensions'].append(name + ver) + data['versions'].append(Version(ver)) + else: + data['extensions'].append(ext) + elif '*' not in name: + data['filenames'].append(name) + patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) - if patterns.get(Token.Keyword): - success, keywords = convert_to_keywords(patterns[Token.Keyword]) - if success: - data['keywords'] = list(chain(*keywords)) + keyword_patterns = patterns.get(Token.Keyword) + if keyword_patterns: + try: + keywords = convert_to_keywords(keyword_patterns) + except ParseException as e: + # print("Keyword parsing failed for", lexer.name) + # print('Before parse:', keyword_patterns) + # print('After parse :', e.keywords) + data['keywords'] = [] + else: + data['keywords'] = sorted(keywords) + return data -def detect_versioned_langs(name1, name2): - realname1, version1 = parse_lang_name(name1) - realname2, version2 = parse_lang_name(name2) - if version1 is None and version2 is None: - return False - elif realname1 == realname2: - return True, version1, version2 - return False +def merge_lists(list1, list2): + return list(sorted(set(list1).union(list2))) -def parse_lang_name(name): - """Identify the version number of a language from its name.""" - try: - realname, version = name.rsplit(maxsplit=1) - version = Version(version) - return realname, version - except (ValueError, InvalidVersion): # look for trailing numbers - match = re.match(r"(?P.*?)(?P[0-9.]+)$", name) - if match: - realname, version = match.groupdict().values() - try: - version = Version(version) - return realname, version - except InvalidVersion: - return name, None +def merge_dict_list(dest, src, param): + if param not in src: + return + dest[param] = merge_lists(dest.get(param, []), src[param]) + + +def merge_versioned_lexers(name, lexers): + final_data = {'name': name, 'versions': set()} + for lex in lexers: + lexer_data = extract_lexer_data(lex) + merge_dict_list(final_data, lexer_data, 'aliases') + merge_dict_list(final_data, lexer_data, 'filenames') + merge_dict_list(final_data, lexer_data, 'extensions') + merge_dict_list(final_data, lexer_data, 'keywords') + ver = parse_lang_name(lexer_data['name'])[1] + if ver: + final_data['versions'].add(ver) + final_data['versions'] = list(sorted(final_data['versions'])) + return final_data + + +def process_lexers(): + pygments_lexers = [find_lexer_class_by_name(lexer[1][0]) + for lexer in get_all_lexers()] + + grouped_lexers = defaultdict(set) # group versioned lexers by name + for name, group in groupby(pygments_lexers, + lambda lex: parse_lang_name(lex.name)[0]): + grouped_lexers[name].update(group) + + for name, lexers in grouped_lexers.items(): + if len(lexers) == 1: + yield extract_lexer_data(lexers.pop()) else: - return name, None + yield merge_versioned_lexers(name, lexers) + + +def get_coast_lang(lexer): + # In case the coast lang identifier matches exactly with the lexer name + lang = coast_langs.get(lexer['name']) + if lang is not None: + return lang + + for lang in coast_langs.values(): + full_name = lang.get('full_name', '').lower() + if (lexer['name'] == full_name or full_name in lexer['aliases'] or + lang['identifier'].lower() in lexer['aliases']): + return lang + + +def update_coast_def(lang, lexer_data): + if not lang.get('identifier'): + lang['identifier'] = lexer_data['name'] + if not 
lexer_data['name'].replace(' ', '').isalnum(): + lang['identifier'] = lang['identifier'].replace( + '+', 'Plus').replace('/', '-') + + def update_list(param): + lex_words = set(lexer_data.get(param, [])) + lang_words = set(lang.get(param, [])) + if lex_words - lang_words: + lang_words.update(lex_words) + # print("Updated {} for {}".format(param, lang['identifier'])) + # print("\tBefore:", lang.get(param)) + lang[param] = list(sorted(lang_words)) + # print("\tAfter: ", lang[param]) + + update_list('aliases') + update_list('extensions') + update_list('filenames') + update_list('keywords') + + for alias in lang.get('aliases', []): + if alias.lower() == lexer_data['name'].lower(): + lang['aliases'].remove(alias) + + +def main(): + for lex_data in process_lexers(): + if '+' in lex_data['name'] and not lex_data.get('keywords'): + continue + coast_def = get_coast_lang(lex_data) + if not coast_def: + # print("New lexer found:", lex_data['name']) + coast_def = {} + update_coast_def(coast_def, lex_data) + print(coast_def) if __name__ == '__main__': - import json - import difflib - c = coast_langs['Python'] - s = json.dumps(c, indent=4).splitlines(keepends=True) - l = get_coast_lang_lexers('Python', c) - update_coast_def(c, l[0]) - b = json.dumps(c, indent=4).splitlines(keepends=True) - print(*difflib.unified_diff(s, b)) - - # lexer_product = ((x, y) for i, x in enumerate(pygments_lexers) - # for y in pygments_lexers[i + 1:]) - # versioned_lexers = list( - # filter(lambda x: detect_versioned_langs(x[0].name, x[1].name), - # lexer_product)) - # print(versioned_lexers) + main() From 2603dc2d31a4d177e11b665bccdfa2862f1c3a80 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Sun, 10 Mar 2019 11:06:42 +0530 Subject: [PATCH 09/10] pygments: Save definitions to YAML files --- tools/pygments-import/main.py | 106 ++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 38 deletions(-) diff --git a/tools/pygments-import/main.py b/tools/pygments-import/main.py index 9ecd8e7..648f068 100644 --- a/tools/pygments-import/main.py +++ b/tools/pygments-import/main.py @@ -1,27 +1,50 @@ -import inspect import os import re -from collections import defaultdict +from collections import defaultdict, OrderedDict from itertools import chain, groupby import yaml +from yaml.dumper import SafeDumper +from yaml.loader import SafeLoader from packaging.version import Version, InvalidVersion import pygments from pygments.token import Token, Name from pygments.lexers import get_all_lexers, find_lexer_class_by_name - from parser import convert_to_keywords, ParseException +DEFAULT_MAPPING_TAG = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG + + +class YAMLLoader(SafeLoader): + """Custom loader for YAML to preserve ordering of keys.""" + + def construct_ordered_dict(self, node): + return OrderedDict(self.construct_pairs(node)) + + +class YAMLDumper(SafeDumper): + """Custom dumper for YAML to match coAST style""" + + def increase_indent(self, flow=False, indentless=False): + return super().increase_indent(flow, False) + + def represent_ordered_dict(self, data): + return self.represent_dict(data.items()) + + +YAMLLoader.add_constructor( + DEFAULT_MAPPING_TAG, YAMLLoader.construct_ordered_dict) +YAMLDumper.add_representer(OrderedDict, YAMLDumper.represent_ordered_dict) REQUIRED_TOKEN_TYPES = (Token.Keyword, Name.Builtin) -# Remove '../../' for travis LANGUAGE_FOLD = os.path.abspath("../../data/Language") # Read coast language definitions coast_langs = {} for lang_file in os.listdir(LANGUAGE_FOLD): + name = 
lang_file[:lang_file.rfind('.yaml')] with open(os.path.join(LANGUAGE_FOLD, lang_file)) as f: - coast_langs[lang_file.rstrip('.yaml')] = yaml.load(f) + coast_langs[name] = yaml.load(f, Loader=YAMLLoader) def parse_lang_name(name): @@ -46,7 +69,7 @@ def parse_lang_name(name): def get_lexer_patterns(lexer, required_token_types=()): patterns = defaultdict(list) if not hasattr(lexer, 'tokens'): - # print('Skipping {}: no tokens'.format(lexer.name)) + print('Skipping {}: no tokens'.format(lexer.name)) return patterns # no need to handle each section separately @@ -77,6 +100,10 @@ def extract_lexer_data(lexer): data['name'] = lexer.name aliases = getattr(lexer, 'aliases', []) + try: + aliases.remove(data['name'].lower()) + except ValueError: + pass if len(aliases) < 10: data['aliases'] = aliases @@ -84,9 +111,6 @@ def extract_lexer_data(lexer): print("Too many aliases in", data['name']) data['aliases'] = [] - if not data['name'].replace(' ', '').isalnum(): - print(data['name'], data['aliases'], inspect.getsourcefile(lexer)) - filenames = getattr(lexer, 'filenames', []) data['extensions'] = [] data['filenames'] = [] @@ -113,34 +137,30 @@ def extract_lexer_data(lexer): try: keywords = convert_to_keywords(keyword_patterns) except ParseException as e: - # print("Keyword parsing failed for", lexer.name) - # print('Before parse:', keyword_patterns) - # print('After parse :', e.keywords) + print("Keyword parsing failed for", lexer.name) + print('Before parse:', keyword_patterns) + print('After parse :', e.keywords) data['keywords'] = [] else: - data['keywords'] = sorted(keywords) + data['keywords'] = list(sorted(set(keywords))) return data -def merge_lists(list1, list2): - return list(sorted(set(list1).union(list2))) - - -def merge_dict_list(dest, src, param): - if param not in src: - return - dest[param] = merge_lists(dest.get(param, []), src[param]) - - def merge_versioned_lexers(name, lexers): final_data = {'name': name, 'versions': set()} for lex in lexers: lexer_data = extract_lexer_data(lex) - merge_dict_list(final_data, lexer_data, 'aliases') - merge_dict_list(final_data, lexer_data, 'filenames') - merge_dict_list(final_data, lexer_data, 'extensions') - merge_dict_list(final_data, lexer_data, 'keywords') + + def update_list(param): + data = set(final_data.get(param, [])).union( + lexer_data.get(param, [])) + final_data[param] = list(sorted(data)) + + update_list('aliases') + update_list('filenames') + update_list('extensions') + update_list('keywords') ver = parse_lang_name(lexer_data['name'])[1] if ver: final_data['versions'].add(ver) @@ -164,6 +184,15 @@ def process_lexers(): yield merge_versioned_lexers(name, lexers) +def filter_lexer(lex_data): + """Determine if the lexer data should be added to coast definitions.""" + return not lex_data.get('keywords') and ( + any(word in lex_data['name'] for word in ('+', 'Template', 'ANTLR')) or + (len(lex_data.get('aliases', [])) < 2 and + len(lex_data.get('extensions', [])) < 2) + ) + + def get_coast_lang(lexer): # In case the coast lang identifier matches exactly with the lexer name lang = coast_langs.get(lexer['name']) @@ -189,31 +218,32 @@ def update_list(param): lang_words = set(lang.get(param, [])) if lex_words - lang_words: lang_words.update(lex_words) - # print("Updated {} for {}".format(param, lang['identifier'])) - # print("\tBefore:", lang.get(param)) + print("Updated {} for {}".format(param, lang['identifier'])) + print("\tBefore:", lang.get(param)) lang[param] = list(sorted(lang_words)) - # print("\tAfter: ", lang[param]) + print("\tAfter: 
", lang[param]) update_list('aliases') update_list('extensions') update_list('filenames') update_list('keywords') - for alias in lang.get('aliases', []): - if alias.lower() == lexer_data['name'].lower(): - lang['aliases'].remove(alias) + +def write_yaml(coast_def): + file_name = coast_def['identifier'] + '.yaml' + with open(os.path.join(LANGUAGE_FOLD, file_name), 'w') as f: + yaml.dump(coast_def, f, allow_unicode=True, + default_flow_style=False, Dumper=YAMLDumper) def main(): - for lex_data in process_lexers(): - if '+' in lex_data['name'] and not lex_data.get('keywords'): - continue + for lex_data in filter(lambda l: not filter_lexer(l), process_lexers()): coast_def = get_coast_lang(lex_data) if not coast_def: - # print("New lexer found:", lex_data['name']) coast_def = {} + coast_def = OrderedDict(coast_def) update_coast_def(coast_def, lex_data) - print(coast_def) + write_yaml(coast_def) if __name__ == '__main__': From 64665394be8e1e9482662ef8c77d46655f2da5a1 Mon Sep 17 00:00:00 2001 From: iamkroot Date: Tue, 24 Dec 2019 11:34:47 +0530 Subject: [PATCH 10/10] pygments: Use sre_yield to generate regexes --- tools/pygments-import/main.py | 13 +-- tools/pygments-import/parser.py | 171 ++++++-------------------------- 2 files changed, 32 insertions(+), 152 deletions(-) diff --git a/tools/pygments-import/main.py b/tools/pygments-import/main.py index 648f068..3ad834f 100644 --- a/tools/pygments-import/main.py +++ b/tools/pygments-import/main.py @@ -10,7 +10,7 @@ from pygments.token import Token, Name from pygments.lexers import get_all_lexers, find_lexer_class_by_name -from parser import convert_to_keywords, ParseException +from parser import convert_to_keywords DEFAULT_MAPPING_TAG = yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG @@ -134,15 +134,8 @@ def extract_lexer_data(lexer): patterns = get_lexer_patterns(lexer, REQUIRED_TOKEN_TYPES) keyword_patterns = patterns.get(Token.Keyword) if keyword_patterns: - try: - keywords = convert_to_keywords(keyword_patterns) - except ParseException as e: - print("Keyword parsing failed for", lexer.name) - print('Before parse:', keyword_patterns) - print('After parse :', e.keywords) - data['keywords'] = [] - else: - data['keywords'] = list(sorted(set(keywords))) + keywords = convert_to_keywords(keyword_patterns) + data['keywords'] = list(sorted(set(keywords))) return data diff --git a/tools/pygments-import/parser.py b/tools/pygments-import/parser.py index 109f2eb..f9f0654 100644 --- a/tools/pygments-import/parser.py +++ b/tools/pygments-import/parser.py @@ -1,153 +1,40 @@ import re - - -class ParseException(Exception): - """Represents failed conversion""" - - def __init__(self, keywords): - self.keywords = keywords # partially parsed keywords +import sre_yield + +REPLACEMENTS = { + '?:': '', + '\\s': ' ', + '\\t': ' ', + '\\r': ' ', + '\\n': ' ', + '\\!': '!', + '\\=': '=', + '\\<': '<', + '\\>': '>', + '\\b': '', + '\\w': '' +} def clean_pattern(pattern): """Unescape and remove unecessary parts from the regex pattern.""" - REPLACEMENTS = { - '^': '', - '?:': '', - '\\b': '', - '\\s+': ' ', - '\\s*': '', - '\\S+': '', - '\\*': '*', - '\\-': '-', - '\\.': '.', - '\\?': '?', - '\\+': '+', - '\\$': '$', - '\\!': '!', - '\\=': '=', - '\\<': '<', - '\\>': '>', - '\\ ': ' ', - '\\)': ')', - '\\|': '|', - '\\[': '[', - '\\]': ']', - '\\\\': '\\' - } + pattern = re.sub(r'(?