-
Notifications
You must be signed in to change notification settings - Fork 4
/
analyze.py
207 lines (168 loc) · 4.55 KB
/
analyze.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import math
import os
import re
import tempfile
import time
# Hexadecimal digits in value order: a digit's index in this string is the
# nibble value that decode_data() assigns to it.
HEXCHARS = "0123456789abcdef"
def system(command, data):
    """Run a shell command on a temporary file containing *data*.

    The data is written to a named temporary file, ``command`` is run
    through the shell with that file's path appended as its last argument,
    and everything the command writes to standard output is returned.

    *data* may be a byte string or a text string; text is encoded as
    Latin-1 so that code points 0-255 map one-to-one onto byte values.
    The temporary file is removed even if writing or running the command
    raises.
    """
    if isinstance(data, type(u"")):
        # The temp file is opened in binary mode, so text must be encoded.
        data = data.encode("latin-1")
    tmp = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            tmp.write(data)
        finally:
            # Close before running the command so the contents are flushed.
            tmp.close()
        with os.popen("%s %s" % (command, tmp.name)) as stream:
            return stream.read()
    finally:
        os.remove(tmp.name)
def file_type(data):
    """Describe the type of *data* as reported by the ``file`` command.

    Returns the command's output with the leading ``<path>: `` prefix
    removed and surrounding whitespace stripped.
    """
    # Run the external command once and reuse its output.
    output = system("file", data)
    # Only the first match (the temp-file path prefix) is removed, so a
    # description that itself contains "word: " is left intact.
    return re.sub(r"\S+: ", "", output, count=1).strip()
def hexdump(data):
    """Return the ``hexdump -C`` listing of the decoded data."""
    listing = system("hexdump -C", data)
    return listing
def histogram(data):
    """Count how often each character code (0-255) occurs in *data*.

    Returns a 256-element list indexed by ``ord()`` of the character.
    """
    counts = [0] * 256
    for code in map(ord, data):
        counts[code] += 1
    return counts
def histogram_analysis(data, dist):
    """Judge whether the byte histogram *dist* of *data* looks uniform.

    Returns a human-readable verdict that includes the largest deviation
    from the expected count, measured in standard deviations.
    """
    # Model every byte count as binomial with success probability 1/256.
    # A count more than 6 standard deviations away from the mean may be
    # statistically significant.
    total = len(data)
    chance = 1 / 256.0
    expected = total * chance
    stddev = math.sqrt(total * chance * (1 - chance))

    # Largest absolute deviation from the expected count (0 if none).
    worst = max([0] + [abs(expected - count) for count in dist])
    deviations = float(worst) / stddev if stddev > 0 else 0

    if deviations > 6:
        return "Possibly non-uniform (%.02f stddevs)" % deviations
    return "Uniform (<= %.02f stddevs)" % deviations
def analyze_time(post):
    """Analyze time in post title.

    The title is expected to be a timestamp in ``YYYYmmddHHMM`` form.
    It is compared with the post's ``created_utc`` value to estimate the
    poster's time zone and how long after the titled minute the post was
    made.

    Returns ``None`` when the title is missing or is not a timestamp in
    that format; otherwise a dict with the keys ``title_time`` (9-tuple),
    ``title_time_str``, ``post_time_str``, ``timezone`` (e.g. ``"UTC+1"``)
    and ``post_delay`` (seconds, as a float).
    """
    # Function-scope import: calendar is not among the module-level imports.
    import calendar

    try:
        parsed = time.strptime(post["data"]["title"], "%Y%m%d%H%M")
    except (KeyError, TypeError, ValueError):
        # Missing title, wrong type, or not a timestamp: nothing to analyze.
        return None
    title_time = (parsed.tm_year, parsed.tm_mon, parsed.tm_mday,
                  parsed.tm_hour, parsed.tm_min, parsed.tm_sec,
                  parsed.tm_wday, 0, 0)
    # Read the title's clock time as if it were UTC. timegm() does this
    # directly and does not depend on the host's local time zone rules.
    # Kept as a float so the divisions below are true divisions everywhere.
    title_time_secs = float(calendar.timegm(title_time))
    post_time_secs = int(float(post["data"]["created_utc"]))
    post_time = time.gmtime(post_time_secs)
    # Offset of time in title from the UTC post time gives the time zone.
    # Round to the nearest hour: int() truncates toward zero, so the half
    # hour is pushed away from zero on whichever side the offset lies.
    offset = title_time_secs - post_time_secs
    if offset < 0:
        tz_hours = int((offset - 1800) / 3600.0)
    else:
        tz_hours = int((offset + 1800) / 3600.0)
    if tz_hours < 0:
        tz = "UTC%i" % tz_hours
    elif tz_hours == 0:
        tz = "UTC"
    else:
        tz = "UTC+%i" % tz_hours
    # Assume the messages are posted by an automated script; if the
    # script starts running at the head of the minute, it will take
    # a short amount of time to start, construct the message and
    # post to Reddit. What is the difference in seconds between the
    # time in the title and the time when the message was posted?
    post_delay = post_time_secs - (title_time_secs - tz_hours * 3600)
    return {
        "title_time": tuple(title_time),
        "title_time_str": time.asctime(title_time),
        "post_time_str": time.asctime(post_time),
        "timezone": tz,
        "post_delay": post_delay,
    }
def decode_data(text):
    """Decode hex-encoded data into binary.

    Characters that are not hex digits are skipped, upper-case digits are
    accepted, and a trailing unpaired digit is dropped.
    """
    # Collect the value of every hex digit found, in order.
    nibbles = []
    for ch in text:
        ch = ch.lower()
        if ch in HEXCHARS:
            nibbles.append(HEXCHARS.index(ch))
    # Combine consecutive digits (high nibble first) into one character each.
    pairs = zip(nibbles[0::2], nibbles[1::2])
    return "".join("%c" % ((high << 4) | low) for high, low in pairs)
def mean(data):
    """Return the average character code of *data* (0 for empty input)."""
    count = len(data)
    if count == 0:
        return 0
    total = sum(map(ord, data))
    return float(total) / float(count)
def variance(data, mean):
    """Return the population variance of the character codes in *data*.

    *mean* is the precomputed average code; empty input yields 0.
    """
    count = len(data)
    if count == 0:
        return 0
    squared = sum(math.pow(ord(ch) - mean, 2) for ch in data)
    return squared / count
def skewness(data, mean, sigma):
    """Return the skewness (third standardized moment) of *data*.

    *mean* and *sigma* are the precomputed average and standard deviation
    of the character codes. Empty input or a zero *sigma* yields 0.
    """
    count = len(data)
    if count == 0 or sigma == 0:
        return 0
    cubed = sum(math.pow(ord(ch) - mean, 3) for ch in data)
    third_moment = cubed / count
    return third_moment / math.pow(sigma, 3)
def excess(data, mean, sigma):
    """Return the excess kurtosis of *data*.

    This is the fourth standardized moment minus 3. *mean* and *sigma*
    are the precomputed average and standard deviation of the character
    codes. Empty input or a zero *sigma* yields 0.
    """
    count = len(data)
    if count == 0 or sigma == 0:
        return 0
    fourth = sum(math.pow(ord(ch) - mean, 4) for ch in data)
    fourth_moment = fourth / count
    return (fourth_moment / math.pow(sigma, 4)) - 3
def probabilities(data, dist):
    """Turn the byte histogram *dist* into relative frequencies.

    Returns a 256-element list; every entry is 0 when *data* is empty.
    """
    total = len(data)
    if not total > 0:
        return [0] * 256
    return [float(dist[code]) / float(total) for code in range(256)]
def entropy(prob):
    """Return the Shannon entropy of *prob*, formatted in bits per byte.

    Zero probabilities contribute nothing to the sum.
    """
    bits = sum(p * math.log(1 / p, 2) for p in prob if p != 0)
    return "%0.2f bits per byte" % bits
def analyze(post):
    """Decode a post's hex payload and store statistics about it.

    Reads ``post["data"]["selftext"]``, decodes it, and writes a dict of
    results (raw data, histogram, uniformity verdict, entropy, moments,
    ``file`` output, hex dump and title-time analysis) to
    ``post["analysis"]``. Returns nothing.
    """
    payload = decode_data(post["data"]["selftext"])
    counts = histogram(payload)

    # Moments of the byte-value distribution; each builds on the previous.
    average = mean(payload)
    spread = variance(payload, average)
    deviation = math.sqrt(spread)

    post["analysis"] = {
        "data": payload,
        "histogram": counts,
        "distribution": histogram_analysis(payload, counts),
        "entropy": entropy(probabilities(payload, counts)),
        "mean": average,
        "variance": spread,
        "sigma": deviation,
        "sk": skewness(payload, average, deviation),
        "ex": excess(payload, average, deviation),
        "mime": file_type(payload),
        "hexdump": hexdump(payload),
        "time": analyze_time(post),
    }