-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCompression.py
94 lines (70 loc) · 2.78 KB
/
Compression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# -*- coding: utf-8 -*-
# attempt to implement some compression algorithms as well as ideas of my own
import sys
def lzw_compress(s):
# Lempel-Ziv-Welch algorithm
raise NotImplementedError
def get_longest_prefix(s, t):
# stolen from https://stackoverflow.com/questions/10355103/finding-the-longest-repeated-substring
p = ""
for x, y in zip(s, t):
if x != y:
return p
p += x
min_str = s if len(s) <= len(t) else t
assert p == min_str
return p
def get_longest_repeated_substring(s):
# stolen from https://stackoverflow.com/questions/10355103/finding-the-longest-repeated-substring
# could improve by choosing substring with most total chars in text (len * n_occurrences)
suffixes = sorted(s[i:] for i in range(len(s)))
result = ""
for s1, s2 in zip(suffixes[:-1], suffixes[1:]):
prefix = get_longest_prefix(s1, s2)
if len(prefix) > len(result) and "=" not in prefix:
result = prefix
return result
def char_ints():
i = ord("0")
while True:
yield i
i += 1
def grammar_compress_one_stage(s, seen_chars):
substr = get_longest_repeated_substring(s)
if len(substr) <= 1:
return s, seen_chars
try:
current_char = next(c for c in seen_chars if c not in s)
except StopIteration:
current_char = next(x for x in char_ints() if chr(x) not in s)
new_symbol = chr(current_char)
seen_chars.add(new_symbol)
s_compressed = s.replace(substr, new_symbol)
s_compressed += "\n{}{}".format(new_symbol, substr) # used to separate these with "=", but since it always begins with one char this is unnecessary
return s_compressed, seen_chars
def grammar_compress(s):
# idea based on formal grammars
# each stage acts on the whole text (probably pretty slow)
# in each stage, find some long string that is repeated, and replace it with a symbol not in the grammar's keys
# in the final file, print the grammar for decoding (if this can be eliminated, it would probably save a lot of space)
seen_chars = set(s)
s += "=" # mark end of ciphertext and beginning of grammar
while True:
s_compressed, seen_chars = grammar_compress_one_stage(s, seen_chars)
if len(s_compressed) >= len(s):
return s
s = s_compressed
def evaluate_compression_function(func, s):
ls = len(s)
lc = len(func(s))
r = 100 * (1 - lc / ls)
print("original len {}, new len {} (decreased by {:.2f}%)".format(ls, lc, r))
if __name__ == "__main__":
fp = "LoremIpsum.txt"
# fp = "Compression.py"
s = open(fp).read()
func = grammar_compress
evaluate_compression_function(func, s)
output_fp = "CompressionOutput.txt"
with open(output_fp, "wb") as f:
f.write(func(s).encode("utf-8"))