forked from seomoz/simhash-py
-
Notifications
You must be signed in to change notification settings - Fork 15
/
bench.py
executable file
·79 lines (62 loc) · 2.25 KB
/
bench.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#! /usr/bin/env python
import time
import random
import simhash
import argparse
def _build_parser():
    """Assemble the command-line parser for the benchmark script.

    Every option is described by one ``(name, converter, default, help)``
    row; the long flag and the destination attribute are both derived
    from the name.
    """
    cli = argparse.ArgumentParser(description='Run a quick bench')
    option_rows = (
        ('random', int, None, 'Generate N random hashes for querying'),
        ('blocks', int, 6, 'Number of blocks to divide 64-bit hashes into'),
        ('bits', int, 3, 'How many bits may differ'),
        ('hashes', str, None, 'Path to file with hashes to insert'),
        ('queries', str, None, 'Path to file with queries to run'),
    )
    for name, converter, fallback, description in option_rows:
        cli.add_argument(
            '--' + name, dest=name, type=converter, default=fallback,
            help=description)
    return cli


# Options controlling where the benchmark's hashes and queries come from
parser = _build_parser()
# Parse the command line and build the corpus the benchmarks operate on,
# using the --blocks and --bits settings.
args = parser.parse_args()
corpus = simhash.Corpus(args.blocks, args.bits)


def _load_hashes(path):
    """Read integer hashes from the text file at ``path``, one per line.

    Blank lines (including the empty remainder after a trailing newline)
    are skipped. Raises ``IOError``/``OSError`` if the file cannot be
    opened and ``ValueError`` if a non-blank line is not an integer.
    """
    with open(path) as source:
        # Iterate the file line by line: a file object has no split(), and
        # int() already tolerates the trailing newline on each line.
        return [int(line) for line in source if line.strip()]


# Hashes to run, query
hashes = _load_hashes(args.hashes) if args.hashes else []
queries = _load_hashes(args.queries) if args.queries else []
# Largest value that still fits in a 64-bit hash. random.randint() includes
# both endpoints, so the upper bound must be 2**64 - 1 rather than 2**64.
_MAX_HASH = (1 << 64) - 1

# Fill in whichever inputs were not loaded from a file with random values,
# or stop when there is nothing to benchmark.
if args.random:
    if args.hashes and args.queries:
        # Both inputs were given as files, so --random has nothing to fill.
        print('Random supplied with both --hashes and --queries')
        raise SystemExit(1)

    if not hashes:
        print('Generating %i hashes' % args.random)
        hashes = [random.randint(0, _MAX_HASH) for _ in range(args.random)]

    if not queries:
        print('Generating %i queries' % args.random)
        queries = [random.randint(0, _MAX_HASH) for _ in range(args.random)]
elif not (args.hashes and args.queries):
    # Without --random, both files are required. The parentheses matter:
    # "not a or b" would reject the valid case where both are supplied.
    print('No hashes or queries supplied')
    raise SystemExit(2)
class Timer(object):
    """Context manager that reports how long its ``with`` body took.

    Prints a ``Starting <name>`` line on entry and, on exit, a line giving
    the elapsed wall-clock seconds, worded differently depending on whether
    the body raised. Exceptions are never suppressed.

    After the block finishes, ``start`` holds the elapsed time in seconds.
    """

    def __init__(self, name):
        """Remember ``name``, the label used in the printed messages."""
        self.name = name

    def __enter__(self):
        """Record the start time, announce the run and return the timer."""
        # Store the negated start time so that adding the end time in
        # __exit__ leaves the elapsed duration in the same attribute.
        self.start = -time.time()
        print('Starting %s' % self.name)
        return self

    def __exit__(self, t, v, tb):
        """Print the elapsed time; ``t`` is the exception type, if any."""
        self.start += time.time()
        if t is not None:
            print(' Failed %s in %fs' % (self.name, self.start))
        else:
            print(' Ran %s in %fs' % (self.name, self.start))
# Time each bulk corpus operation in turn: insert the hashes, run both
# kinds of lookup with the queries, then remove the hashes again.
_BENCH_STEPS = (
    ('Bulk Insertion', 'insert_bulk', hashes),
    ('Bulk Find First', 'find_first_bulk', queries),
    ('Bulk Find All', 'find_all_bulk', queries),
    ('Bulk Removal', 'remove_bulk', hashes),
)

for _label, _method, _values in _BENCH_STEPS:
    with Timer(_label):
        # Look the method up inside the timed block so each step resolves
        # and runs only after the previous one has finished.
        getattr(corpus, _method)(_values)