forked from monperrus/crawler-user-agents
-
Notifications
You must be signed in to change notification settings - Fork 0
/
validate.py
103 lines (83 loc) · 3.86 KB
/
validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
Validate JSON to ensure that patterns all work
"""
from __future__ import print_function
import json
import re
from collections import Counter
from jsonschema import validate
# JSON Schema describing crawler-user-agents.json: a list of crawler entries.
#
# Per-entry properties:
#   pattern        (required) regular expression, stored as a string
#   instances      (required) array of values the pattern is checked against
#   url            (optional) string
#   description    (optional) string
#   addition_date  (optional) string
#   depends_on     (optional) array; allows an instance to match twice
JSON_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "required": ["pattern", "instances"],
        "properties": {
            "pattern": {"type": "string"},
            "instances": {"type": "array"},
            "url": {"type": "string"},
            "description": {"type": "string"},
            "addition_date": {"type": "string"},
            "depends_on": {"type": "array"},
        },
    },
}
def _check_duplicate_patterns(json_data):
    """Raise ValueError if a pattern occurs more than once, exactly or ignoring case."""
    # exact duplicates are checked first so they get the more specific message
    exact_counts = Counter(entry['pattern'] for entry in json_data)
    for pattern, count in exact_counts.most_common():
        if count > 1:
            raise ValueError('Pattern {!r} appears {} times'.format(pattern,
                                                                    count))

    lowered_counts = Counter(entry['pattern'].lower() for entry in json_data)
    for pattern, count in lowered_counts.most_common():
        if count > 1:
            raise ValueError('Pattern {!r} is duplicated {} times with different capitalization'
                             .format(pattern, count))


def _check_unescaped_slashes(json_data):
    """Raise ValueError if a pattern contains a '/' not preceded by a backslash."""
    # A negative lookbehind (rather than "[^\\]/") also catches a slash that is
    # the very first character of the pattern.
    unescaped_slash = re.compile(r'(?<!\\)/')
    for entry in json_data:
        pattern = entry['pattern']
        if unescaped_slash.search(pattern):
            raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern))


def _check_instances(json_data):
    """Check every entry's properties and that its instances match only its own pattern.

    Side effect: entries without a 'depends_on' key get an empty list for it.
    """
    allowed_keys = set(JSON_SCHEMA['items']['properties'])
    # compile each pattern once instead of on every search in the nested loops
    compiled = [(entry['pattern'], re.compile(entry['pattern']))
                for entry in json_data]

    for entry, (pattern, regex) in zip(json_data, compiled):
        # canonicalize entry
        depends_on = entry.setdefault('depends_on', [])

        # check that we have only the right properties (not handled by default
        # in module jsonschema); raised explicitly instead of using `assert`
        # so the check is not stripped when Python runs with -O
        if not set(entry).issubset(allowed_keys):
            raise AssertionError("the entry contains unknown properties")

        instances = entry.get('instances')
        if not instances:
            continue

        # check that there is no duplicate
        if len(instances) != len(set(instances)):
            raise Exception("duplicate instances in "+pattern)

        for instance in instances:
            if not regex.search(instance):
                raise ValueError('Pattern {!r} misses instance {!r}'
                                 .format(pattern, instance))

            # contract: we want to avoid that instances are matched twice
            # exceptions are handled with the special metadata "depends_on"
            for other_pattern, other_regex in compiled:
                if other_pattern == pattern or other_pattern in depends_on:
                    continue
                if other_regex.search(instance):
                    raise ValueError('{} is matched by both {} and {}'
                                     .format(instance, pattern, other_pattern))


def _check_pattern_subsets(json_data):
    """Raise ValueError if one pattern, used as a regex, matches another pattern's text."""
    # patterns are known to be unique at this point, so comparing the pattern
    # strings distinguishes entries just as comparing the entries would
    patterns = [entry['pattern'] for entry in json_data]
    for outer in patterns:
        outer_regex = re.compile(outer, re.IGNORECASE)
        for inner in patterns:
            if inner != outer and outer_regex.search(inner):
                raise ValueError('Pattern {!r} is a subset of {!r}'
                                 .format(inner, outer))


def main():
    """Validate 'crawler-user-agents.json' from the current working directory.

    Runs, in order: JSON Schema validation, duplicate-pattern checks, the
    unescaped-slash check, the per-instance matching checks, the non-empty
    check and the pattern-subset check. Prints 'Validation passed' when all
    of them succeed.

    Raises:
        ValueError: duplicated pattern, unescaped slash, an instance missed by
            its own pattern, an instance matched by a second pattern not
            listed in 'depends_on', or a pattern matched by another pattern.
        AssertionError: an entry has a property not declared in JSON_SCHEMA.
        Exception: duplicate instances within an entry, or no pattern at all.

    Errors raised by jsonschema's validate(), json.load() and re are
    propagated unchanged.
    """
    with open('crawler-user-agents.json') as f:
        json_data = json.load(f)

    # check format using JSON Schema
    validate(json_data, JSON_SCHEMA)

    _check_duplicate_patterns(json_data)
    _check_unescaped_slashes(json_data)
    _check_instances(json_data)

    # Make sure we have at least one pattern
    if not json_data:
        raise Exception("no pattern")

    _check_pattern_subsets(json_data)

    print('Validation passed')
# Run the validation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()