#!/usr/bin/env python
# Usage:
#
# Regenerate the list:
# $ TOR_FB_DATE=`date -u "+%Y-%m-%d-%H-%M-%S"`
# $ TOR_FB_COUNTRY=ZZ
# $ TOR_FB_COMMIT=`git rev-parse --short=16 HEAD`
# $ ./updateFallbackDirs.py \
# > fallback_dirs_"$TOR_FB_DATE"_"$TOR_FB_COUNTRY"_"$TOR_FB_COMMIT".inc \
# 2> fallback_dirs_"$TOR_FB_DATE"_"$TOR_FB_COUNTRY"_"$TOR_FB_COMMIT".log
# $ cp fallback_dirs_*.inc ../tor/src/app/config/fallback_dirs.inc
#
# Check the existing list:
# $ TOR_FB_MODE="check_existing"
# Then use the commands above.
#
# Most script variables can be overridden using TOR_FB_* environment
# variables.
#
# This script should be run from a stable, reliable network connection,
# with no other network activity (and not over tor).
# If this is not possible, please disable:
# PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS
#
# Needs [python-]dateutil, stem, and potentially other python packages.
# Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
# for netblock analysis.
#
# After running this script, read the logs to make sure the fallbacks aren't
# dominated by a single netblock or port.
# Script by weasel, April 2015
# Portions by gsathya & karsten, 2013
# https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
# Modifications by teor, 2015
# Future imports for Python 2.7, mandatory in 3.0
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
## Imports: version-independent
import copy
import datetime
import dateutil.parser
import gzip
import hashlib
import json
import logging
import math
import os
import os.path
import re
import string
import sys
from stem.descriptor import DocumentHandler
from stem.descriptor.remote import get_consensus, get_server_descriptors, MAX_FINGERPRINTS
## Imports: python 2/3 compatibility
import six
from six.moves import urllib
## Logging Configuration
logging.root.name = ''
## Imports: optional
HAVE_IPADDRESS = False
try:
# python 3 builtin, or install package py2-ipaddress
# there are several ipaddress implementations for python 2
# with slightly different semantics with bytes
# to avoid these issues, we make sure our IP addresses are in six.text_type
import ipaddress
HAVE_IPADDRESS = True
except ImportError:
# if this happens, we avoid doing netblock analysis
logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
' A fallback list will be created, but optional netblock' +
' analysis will not be performed.')
## Top-Level Configuration
def getenv_conf(var_name, default_val, type_fn, optional=False):
"""Get var_name from the environment, using default_val if it is unset.
Cast the result using type_fn. If conversion fails, log an error and
exit.
type_fn must not be bool. Instead, use custom_bool, which correctly
handles empty env vars and defaults, and bad values."""
try:
original_type_fn = type_fn
# Use our custom bool function instead
assert type_fn != bool
# Make the type function optional
if optional:
type_fn = opt(type_fn)
# Look up and convert the value
if original_type_fn == custom_bool:
# custom_bool does its own default handling
return type_fn(os.getenv(var_name), default_val, var_name)
else:
return type_fn(os.getenv(var_name, default_val))
except ValueError as e:
# Log a useful message if conversion fails
logging.error('Could not cast env var "{}" using function "{}" and default "{}". ValueError: "{}"'.format(var_name, type_fn, default_val, e))
sys.exit(1)
def opt(type_fn):
"""Higher-order function, which returns a function that converts a value
using type_fn. The returned function handles some conversion failures.
See its description for details."""
def opt_type_fn(var_value, default_val=None, var_name=None):
"""Converts its argument var_value using the type_fn passed to the outer
function, and returns the result.
If conversion fails, and var_value is None or the empty string, returns
None.
If conversion fails for any other values, throws a ValueError
exception, with an error string containing var_name, if present.
Performs special handling for bool conversion using custom_bool()."""
try:
if type_fn == custom_bool:
assert default_val is not None
assert var_name is not None
return custom_bool(var_value, default_val, var_name)
else:
return type_fn(var_value)
# Make type_fn(None) always return None for types that don't cast None
except TypeError:
return None
except ValueError as e:
# Make type_fn('') and type_fn(None) silently return None
if var_value == '' or var_value is None:
return None
else:
# Log a useful message if conversion fails, and throw the error to
# getenv_conf()
logging.error('Could not cast optional env var value "{}" using function "{}". ValueError: "{}"'.format(var_value, type_fn, e))
raise e
return opt_type_fn
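# Illustrative usage of getenv_conf() with an optional type (mirrors the real
# calls below; TOR_FB_EXAMPLE_LIMIT is a hypothetical variable name):
#   EXAMPLE_LIMIT = getenv_conf('TOR_FB_EXAMPLE_LIMIT', None, int, optional=True)
# returns None when the env var is unset or empty, and an int otherwise.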
# Permitted True and False values for custom_bool(). Must be lowercase.
CUSTOM_BOOL_TRUE = ['true', 'yes', '1']
CUSTOM_BOOL_FALSE = ['false', 'no', '0']
def custom_bool(raw_var_value, default_val, var_name=None):
"""Custom bool conversion function.
If raw_var_value is None, returns default_val.
  If raw_var_value is the empty string, returns not default_val.
Otherwise, checks CUSTOM_BOOL_TRUE and CUSTOM_BOOL_FALSE for
raw_var_value, returning True or False respectively.
Any other raw_var_value throws a ValueError.
If var_name is not None, it is included in the ValueError string."""
if raw_var_value is None:
return default_val
elif raw_var_value == '':
return not default_val
  elif raw_var_value.lower() in CUSTOM_BOOL_TRUE:
    return True
  elif raw_var_value.lower() in CUSTOM_BOOL_FALSE:
    return False
else:
error_str = "invalid literal for custom_bool(): '{}', default_val: '{}'".format(raw_var_value, default_val)
if var_name is not None:
error_str += ", var_name: '{}'".format(var_name)
raise ValueError(error_str)
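# Illustrative examples of custom_bool() semantics ('TOR_FB_X' is a
# hypothetical variable name used only here):
#   custom_bool(None, True, 'TOR_FB_X')     -> True   (unset: use the default)
#   custom_bool('', True, 'TOR_FB_X')       -> False  (empty: negate the default)
#   custom_bool('YES', False, 'TOR_FB_X')   -> True
#   custom_bool('maybe', False, 'TOR_FB_X') raises ValueError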
# We use semantic versioning: https://semver.org
# In particular:
# * major changes include removing a mandatory field, or anything else that
# would break an appropriately tolerant parser,
# * minor changes include adding a field,
# * patch changes include changing header comments or other unstructured
# content
# These variables are not configurable, because format changes need a spec
# and code update.
FALLBACK_FORMAT_VERSION = '3.0.0'
SECTION_SEPARATOR_BASE = '====='
SECTION_SEPARATOR_COMMENT = '/* ' + SECTION_SEPARATOR_BASE + ' */'
## Mode Settings
# Use "check_existing" to check existing fallbacks, or anything else to create
# a new list. Overridden by the command-line argument "check_existing".
MODE = getenv_conf('TOR_FB_MODE',
'', str)
# Output all candidate fallbacks, or only output selected fallbacks?
OUTPUT_CANDIDATES = getenv_conf('TOR_FB_OUTPUT_CANDIDATES',
False, custom_bool)
# Perform DirPort checks over IPv4?
# Change this to False if IPv4 doesn't work for you, or if you don't want to
# download a consensus for each fallback
# Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
PERFORM_IPV4_DIRPORT_CHECKS = getenv_conf('TOR_FB_PERFORM_IPV4_DIRPORT_CHECKS',
not OUTPUT_CANDIDATES, custom_bool)
# Perform DirPort checks over IPv6?
# There are no IPv6 DirPorts in the Tor protocol, so we disable this option by
# default. When #18394 is implemented, we'll be able to check IPv6 ORPorts.
PERFORM_IPV6_DIRPORT_CHECKS = getenv_conf('TOR_FB_PERFORM_IPV6_DIRPORT_CHECKS',
False, custom_bool)
# Must relays be running now?
MUST_BE_RUNNING_NOW = getenv_conf('TOR_FB_MUST_BE_RUNNING_NOW',
(PERFORM_IPV4_DIRPORT_CHECKS
or PERFORM_IPV6_DIRPORT_CHECKS), custom_bool)
# Clients have been using microdesc consensuses by default for a while now
DOWNLOAD_MICRODESC_CONSENSUS = (
getenv_conf('TOR_FB_DOWNLOAD_MICRODESC_CONSENSUS',
True, custom_bool))
# If a relay delivers an invalid consensus that will become valid less than
# this many seconds in the future, or that expired less than this many seconds
# ago, accept the relay as a fallback. For the consensus expiry check to be
# accurate, the machine running this script needs an accurate clock.
#
# Relays on 0.3.0 and later return a 404 when they are about to serve a
# consensus that expired more than 24 hours ago. 0.2.9 and earlier relays
# will serve consensuses that are very old.
#
# Relays on 0.3.5.6-rc? and later return a 404 when they are about to serve a
# consensus that will become valid more than 24 hours in the future. Older
# relays don't serve future consensuses.
#
# A 404 makes relays fail the download check. We use a tolerance of 24 hours,
# so that 0.2.9 relays also fail the download check if they serve a consensus
# that is not reasonably live.
#
# REASONABLY_LIVE_TIME should never be more than Tor's REASONABLY_LIVE_TIME
# (24 hours), because clients reject consensuses that are older than that.
# Clients on 0.3.5.5-alpha? and earlier also won't select guards from
# consensuses that have expired, but can bootstrap if they already have guards
# in their state file.
REASONABLY_LIVE_TIME = getenv_conf('TOR_FB_REASONABLY_LIVE_TIME',
24*60*60, int)
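# Worked example with the 24 hour default: a fallback serving a consensus that
# expired 23 hours ago passes the check, but one serving a consensus that
# expired 25 hours ago fails it.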
# Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
OUTPUT_COMMENTS = getenv_conf('TOR_FB_OUTPUT_COMMENTS',
OUTPUT_CANDIDATES, custom_bool)
# Output matching ContactInfo in fallbacks list?
# Useful if you're trying to contact operators
CONTACT_COUNT = getenv_conf('TOR_FB_CONTACT_COUNT',
OUTPUT_CANDIDATES, custom_bool)
# How the list should be sorted:
# fingerprint: is useful for stable diffs of fallback lists
# measured_bandwidth: is useful when pruning the list based on bandwidth
# contact: is useful for contacting operators once the list has been pruned
OUTPUT_SORT_FIELD = getenv_conf('TOR_FB_OUTPUT_SORT_FIELD',
('contact' if OUTPUT_CANDIDATES
else 'fingerprint'), str)
## Onionoo Settings
ONIONOO = getenv_conf('TOR_FB_ONIONOO',
'https://onionoo.torproject.org/', str)
#ONIONOO = 'https://onionoo.thecthulhu.com/'
# How many rows should we request from Onionoo?
# We ask Onionoo to exclude the slowest and most recent relays.
# None means "all relays".
# Set env TOR_FB_ONIONOO_LIMIT="None" to request all relays.
ONIONOO_LIMIT = getenv_conf('TOR_FB_ONIONOO_LIMIT',
None, int, optional=True)
# Don't bother going out to the Internet, just use the files available locally,
# even if they're very old
LOCAL_FILES_ONLY = getenv_conf('TOR_FB_LOCAL_FILES_ONLY',
False, custom_bool)
## Offer List Filter Settings
# The offer list contains entries that are included if one of the unique
# attributes matches (IPv4, id, or IPv6 (optional))
# What happens to relays not in the offer list?
# When True, they are included, when False, they are excluded
INCLUDE_UNLISTED_ENTRIES = getenv_conf('TOR_FB_INCLUDE_UNLISTED_ENTRIES',
OUTPUT_CANDIDATES, custom_bool)
OFFER_LIST_FILE_NAME = getenv_conf('TOR_FB_OFFER_LIST_FILE_NAME',
'fallback_offer_list', str)
FALLBACK_FILE_NAME = (
getenv_conf('TOR_FB_FALLBACK_FILE_NAME',
'../tor/src/app/config/fallback_dirs.inc', str))
# The number of bytes we'll read from the offer list file before giving up
MAX_LIST_FILE_SIZE = getenv_conf('TOR_FB_MAX_LIST_FILE_SIZE',
1024 * 1024, int)
## Eligibility Settings
# Require fallbacks to have the same address and port for a set amount of time
# We used to have this at 1 week, but that caused many fallback failures, which
# meant that we had to rebuild the list more often. We want fallbacks to be
# stable for 2 years, so we set it to a few months.
#
# If a relay changes address or port, it's not useful any more,
# because clients with the old hard-coded address and port can't find it
ADDRESS_AND_PORT_STABLE_DAYS = (
getenv_conf('TOR_FB_ADDRESS_AND_PORT_STABLE_DAYS',
90, int))
# We ignore relays that have been down for more than this period
MAX_DOWNTIME_DAYS = getenv_conf('TOR_FB_MAX_DOWNTIME_DAYS',
0 if MUST_BE_RUNNING_NOW else 7, int)
# FallbackDirs must have a time-weighted-fraction that is greater than or
# equal to:
# Mirrors that are down half the time are still useful half the time
# (But we need 75% of the list to be up on average, or we start getting
# fallback warnings from DocTor.)
CUTOFF_RUNNING = getenv_conf('TOR_FB_CUTOFF_RUNNING',
.50, float)
CUTOFF_V2DIR = getenv_conf('TOR_FB_CUTOFF_V2DIR',
.50, float)
# Guard flags are removed for some time after a relay restarts, so we ignore
# the guard flag.
CUTOFF_GUARD = getenv_conf('TOR_FB_CUTOFF_GUARD',
.00, float)
# FallbackDirs must have a time-weighted-fraction that is less than or equal
# to:
# .00 means no bad exits
PERMITTED_BADEXIT = getenv_conf('TOR_FB_PERMITTED_BADEXIT',
.00, float)
# older entries' weights are adjusted with ALPHA^(age in days)
AGE_ALPHA = getenv_conf('TOR_FB_AGE_ALPHA',
.99, float)
# this factor is used to scale Onionoo entries to [0,1]
# it's not configurable, because it's unlikely to change
ONIONOO_SCALE_ONE = 999.
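# For example, an Onionoo history value of 999 scales to
# 999 / ONIONOO_SCALE_ONE = 1.0, i.e. up for 100% of that interval.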
## Fallback Count Limits
# The target for these parameters is 20% of the guards in the network
# This is around 200 as of October 2015
_FB_POG = 0.2
# None means no limit on the number of fallbacks.
# Set env TOR_FB_FALLBACK_PROPORTION_OF_GUARDS="None" to have no limit.
FALLBACK_PROPORTION_OF_GUARDS = (
getenv_conf('TOR_FB_FALLBACK_PROPORTION_OF_GUARDS',
None if OUTPUT_CANDIDATES else _FB_POG, float, optional=True))
# Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
# None means no limit on the number of fallbacks.
# Set env TOR_FB_MAX_FALLBACK_COUNT="None" to have no limit.
MAX_FALLBACK_COUNT = (
getenv_conf('TOR_FB_MAX_FALLBACK_COUNT',
None if OUTPUT_CANDIDATES else 200, int, optional=True))
# Emit a C #error if the number of fallbacks is less than expected
# Set to 0 to have no minimum.
MIN_FALLBACK_COUNT = (
getenv_conf('TOR_FB_MIN_FALLBACK_COUNT',
0 if OUTPUT_CANDIDATES else MAX_FALLBACK_COUNT*0.5, int))
# The maximum number of fallbacks on the same address, contact, or family
#
# With 150 fallbacks, this means each operator sees 5% of client bootstraps.
# For comparison:
# - We try to limit guard and exit operators to 5% of the network
# - The directory authorities used to see 11% of client bootstraps each
#
# We also don't want too much of the list to go down if a single operator
# has to move all their relays.
MAX_FALLBACKS_PER_IP = getenv_conf('TOR_FB_MAX_FALLBACKS_PER_IP',
1, int)
MAX_FALLBACKS_PER_IPV4 = getenv_conf('TOR_FB_MAX_FALLBACKS_PER_IPV4',
MAX_FALLBACKS_PER_IP, int)
MAX_FALLBACKS_PER_IPV6 = getenv_conf('TOR_FB_MAX_FALLBACKS_PER_IPV6',
MAX_FALLBACKS_PER_IP, int)
MAX_FALLBACKS_PER_FAMILY = getenv_conf('TOR_FB_MAX_FALLBACKS_PER_FAMILY',
7, int)
MAX_FALLBACKS_PER_CONTACT = getenv_conf('TOR_FB_MAX_FALLBACKS_PER_CONTACT',
MAX_FALLBACKS_PER_FAMILY, int)
## Fallback Bandwidth Requirements
# Any fallback with the Exit flag has its bandwidth multiplied by this fraction
# to make sure we aren't further overloading exits
# (Set to 1.0, because we asked that only lightly loaded exits opt-in,
# and the extra load really isn't that much for large relays.)
EXIT_BANDWIDTH_FRACTION = getenv_conf('TOR_FB_EXIT_BANDWIDTH_FRACTION',
1.0, float)
# If a single fallback's bandwidth is too low, it's pointless adding it
# We expect fallbacks to handle an extra 10 kilobytes per second of traffic
# Make sure they can support fifty times the expected extra load
#
# We convert this to a consensus weight before applying the filter,
# because all the bandwidth amounts are specified by the relay
MIN_BANDWIDTH = getenv_conf('TOR_FB_MIN_BANDWIDTH',
50.0 * 10.0 * 1024.0, float)
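# That is, 50.0 * 10.0 * 1024.0 = 512000 bytes per second (500 kilobytes per
# second) of advertised bandwidth.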
# Clients will time out (or users will give up) after 30 seconds trying to
# download a consensus
# So allow fallback directories half that to deliver a consensus
# The exact download times might change based on the network connection
# running this script, but only by a few seconds
# There is also about a second of python overhead
CONSENSUS_DOWNLOAD_SPEED_MAX = (
getenv_conf('TOR_FB_CONSENSUS_DOWNLOAD_SPEED_MAX',
15.0, float))
# If the relay fails a consensus check, retry the download
# This avoids delisting a relay due to transient network conditions
CONSENSUS_DOWNLOAD_RETRY = getenv_conf('TOR_FB_CONSENSUS_DOWNLOAD_RETRY',
True, custom_bool)
## Parsing Functions
def parse_ts(t):
return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
def remove_bad_chars(raw_string, bad_char_list):
# Remove each character in the bad_char_list
cleansed_string = raw_string
for c in bad_char_list:
cleansed_string = cleansed_string.replace(c, '')
return cleansed_string
def cleanse_unprintable(raw_string):
# Remove all unprintable characters
cleansed_string = ''
for c in raw_string:
if c in string.printable:
cleansed_string += c
return cleansed_string
def cleanse_whitespace(raw_string):
# Replace all whitespace characters with a space
cleansed_string = raw_string
for c in string.whitespace:
cleansed_string = cleansed_string.replace(c, ' ')
return cleansed_string
def cleanse_c_multiline_comment(raw_string):
cleansed_string = raw_string
# Embedded newlines should be removed by tor/onionoo, but let's be paranoid
cleansed_string = cleanse_whitespace(cleansed_string)
# ContactInfo and Version can be arbitrary binary data
cleansed_string = cleanse_unprintable(cleansed_string)
# Prevent a malicious / unanticipated string from breaking out
# of a C-style multiline comment
# This removes '/*' and '*/' and '//'
bad_char_list = '*/'
# Prevent a malicious string from using C nulls
bad_char_list += '\0'
# Avoid confusing parsers by making sure there is only one comma per fallback
bad_char_list += ','
# Avoid confusing parsers by making sure there is only one equals per field
bad_char_list += '='
# Be safer by removing bad characters entirely
cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
# Some compilers may further process the content of comments
# There isn't much we can do to cover every possible case
# But comment-based directives are typically only advisory
return cleansed_string
def cleanse_c_string(raw_string):
cleansed_string = raw_string
# Embedded newlines should be removed by tor/onionoo, but let's be paranoid
cleansed_string = cleanse_whitespace(cleansed_string)
# ContactInfo and Version can be arbitrary binary data
cleansed_string = cleanse_unprintable(cleansed_string)
# Prevent a malicious address/fingerprint string from breaking out
# of a C-style string
bad_char_list = '"'
# Prevent a malicious string from using escapes
bad_char_list += '\\'
# Prevent a malicious string from using C nulls
bad_char_list += '\0'
# Avoid confusing parsers by making sure there is only one comma per fallback
bad_char_list += ','
# Avoid confusing parsers by making sure there is only one equals per field
bad_char_list += '='
# Be safer by removing bad characters entirely
cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
# Some compilers may further process the content of strings
# There isn't much we can do to cover every possible case
# But this typically only results in changes to the string data
return cleansed_string
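# Illustrative examples of the cleansing functions above, on hypothetical
# malicious inputs (bad characters are removed outright, not escaped):
#   cleanse_c_string('nick"name\\,=')             -> 'nickname'
#   cleanse_c_multiline_comment('end */ comment') -> 'end  comment'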
## Onionoo Source Functions
# a dictionary of source metadata for each onionoo query we've made
fetch_source = {}
# register source metadata for 'what'
# assumes we only retrieve one document for each 'what'
def register_fetch_source(what, url, relays_published, version):
fetch_source[what] = {}
fetch_source[what]['url'] = url
fetch_source[what]['relays_published'] = relays_published
fetch_source[what]['version'] = version
# list each registered source's 'what'
def fetch_source_list():
return sorted(fetch_source.keys())
# given 'what', provide a multiline C comment describing the source
def describe_fetch_source(what):
desc = '/*'
desc += '\n'
desc += 'Onionoo Source: '
desc += cleanse_c_multiline_comment(what)
desc += ' Date: '
desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
desc += ' Version: '
desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
desc += '\n'
desc += 'URL: '
desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
desc += '\n'
desc += '*/'
return desc
## File Processing Functions
def write_to_file(out_str, file_name, max_len):
  # Write at most the first max_len characters of out_str to file_name
  try:
    with open(file_name, 'w') as f:
      f.write(out_str[0:max_len])
except EnvironmentError as error:
logging.error('Writing file %s failed: %d: %s'%
(file_name,
error.errno,
error.strerror)
)
def read_from_file(file_name, max_len):
try:
if os.path.isfile(file_name):
with open(file_name, 'r') as f:
return f.read(max_len)
except EnvironmentError as error:
logging.info('Loading file %s failed: %d: %s'%
(file_name,
error.errno,
error.strerror)
)
return None
def parse_fallback_file(file_name):
  # Read a fallback list file, and return its entries, one per line,
  # with comments, quotes, and weights stripped
  file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
  if file_data is None:
    return None
  file_data = cleanse_unprintable(file_data)
  file_data = remove_bad_chars(file_data, '\n"\0')
  file_data = re.sub(r'/\*.*?\*/', '', file_data)
  file_data = file_data.replace(',', '\n')
  file_data = file_data.replace(' weight=10', '')
  return file_data
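# For example, a hypothetical fallback entry like
#   "192.0.2.1 orport=9001 id=0123456789ABCDEF0123456789ABCDEF01234567" weight=10,
# comes back as
#   192.0.2.1 orport=9001 id=0123456789ABCDEF0123456789ABCDEF01234567
# with the quotes and weight removed, and the trailing comma turned into a
# line break.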
def load_possibly_compressed_response_json(response):
if response.info().get('Content-Encoding') == 'gzip':
buf = six.BytesIO( response.read() )
f = gzip.GzipFile(fileobj=buf)
return json.load(f)
else:
return json.load(response)
def load_json_from_file(json_file_name):
# An exception here may be resolved by deleting the .last_modified
# and .json files, and re-running the script
try:
with open(json_file_name, 'r') as f:
return json.load(f)
except EnvironmentError as error:
raise Exception('Reading not-modified json file %s failed: %d: %s'%
(json_file_name,
error.errno,
error.strerror)
)
## Onionoo Functions
def datestr_to_datetime(datestr):
# Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
if datestr is not None:
dt = dateutil.parser.parse(datestr)
else:
# Never modified - use start of epoch
dt = datetime.datetime.utcfromtimestamp(0)
# strip any timezone out (in case they're supported in future)
dt = dt.replace(tzinfo=None)
return dt
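# For example, datestr_to_datetime('Fri, 02 Oct 2015 13:34:14 GMT') returns
# the naive datetime.datetime(2015, 10, 2, 13, 34, 14).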
def onionoo_fetch(what, **kwargs):
params = kwargs
params['type'] = 'relay'
if ONIONOO_LIMIT is not None:
params['limit'] = str(ONIONOO_LIMIT)
params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS)
params['last_seen_days'] = '-%d'%(MAX_DOWNTIME_DAYS)
params['flag'] = 'V2Dir'
# Get the relays with the highest consensus weight first,
# then use first_seen to get a stable order.
# The order is important when we're limiting the number of relays returned.
params['order'] = '-consensus_weight,first_seen'
url = ONIONOO + what + '?' + urllib.parse.urlencode(params)
# Unfortunately, the URL is too long for some OS filenames,
# but we still don't want to get files from different URLs mixed up
base_file_name = what + '-' + hashlib.sha1(url.encode('ascii')).hexdigest()
full_url_file_name = base_file_name + '.full_url'
MAX_FULL_URL_LENGTH = 1024
last_modified_file_name = base_file_name + '.last_modified'
MAX_LAST_MODIFIED_LENGTH = 64
json_file_name = base_file_name + '.json'
if LOCAL_FILES_ONLY:
# Read from the local file, don't write to anything
response_json = load_json_from_file(json_file_name)
else:
# store the full URL to a file for debugging
# no need to compare as long as you trust SHA-1
write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)
request = urllib.request.Request(url)
request.add_header('Accept-encoding', 'gzip')
# load the last modified date from the file, if it exists
last_mod_date = read_from_file(last_modified_file_name,
MAX_LAST_MODIFIED_LENGTH)
if last_mod_date is not None:
request.add_header('If-modified-since', last_mod_date)
# Parse last modified date
last_mod = datestr_to_datetime(last_mod_date)
# Not Modified and still recent enough to be useful
# Onionoo / Globe used to use 6 hours, but we can afford a day
required_freshness = datetime.datetime.utcnow()
# strip any timezone out (to match dateutil.parser)
required_freshness = required_freshness.replace(tzinfo=None)
required_freshness -= datetime.timedelta(hours=24)
# Make the Onionoo request
response_code = 0
try:
response = urllib.request.urlopen(request)
response_code = response.getcode()
except urllib.error.HTTPError as error:
response_code = error.code
if response_code == 304: # not modified
pass
else:
raise Exception("Could not get " + url + ": "
+ str(error.code) + ": " + error.reason)
if response_code == 200: # OK
last_mod = datestr_to_datetime(response.info().get('Last-Modified'))
# Check for freshness
if last_mod < required_freshness:
if last_mod_date is not None:
# This check sometimes fails transiently, retry the script if it does
date_message = "Outdated data: last updated " + last_mod_date
else:
date_message = "No data: never downloaded "
raise Exception(date_message + " from " + url)
# Process the data
if response_code == 200: # OK
response_json = load_possibly_compressed_response_json(response)
with open(json_file_name, 'w') as f:
# use the most compact json representation to save space
json.dump(response_json, f, separators=(',',':'))
# store the last modified date in its own file
      if response.info().get('Last-Modified') is not None:
write_to_file(response.info().get('Last-Modified'),
last_modified_file_name,
MAX_LAST_MODIFIED_LENGTH)
elif response_code == 304: # Not Modified
response_json = load_json_from_file(json_file_name)
else: # Unexpected HTTP response code not covered in the HTTPError above
raise Exception("Unexpected HTTP response code to " + url + ": "
+ str(response_code))
register_fetch_source(what,
url,
response_json['relays_published'],
response_json['version'])
return response_json
def fetch(what, **kwargs):
#x = onionoo_fetch(what, **kwargs)
# don't use sort_keys, as the order of or_addresses is significant
#print(json.dumps(x, indent=4, separators=(',', ': ')))
#sys.exit(0)
return onionoo_fetch(what, **kwargs)
## Fallback Candidate Class
class Candidate(object):
CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
- datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))
def __init__(self, details):
for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
'consensus_weight', 'or_addresses', 'dir_address']:
      if f not in details: raise Exception("Document has no %s field."%(f,))
    if 'flags' not in details or details['flags'] is None:
      details['flags'] = []
    if ('advertised_bandwidth' not in details
        or details['advertised_bandwidth'] is None):
      # relays without advertised bandwidth have it calculated from their
      # consensus weight
      details['advertised_bandwidth'] = 0
    if ('effective_family' not in details
        or details['effective_family'] is None):
      details['effective_family'] = []
    details['last_changed_address_or_port'] = parse_ts(
      details['last_changed_address_or_port'])
    # Handle fields that can have arbitrary bytes, but should be UTF-8
    if 'contact' not in details:
      details['contact'] = None
    else:
      details['contact'] = six.ensure_text(details['contact'], errors='replace')
    if 'platform' not in details:
      details['platform'] = None
    else:
      details['platform'] = six.ensure_text(details['platform'], errors='replace')
self._data = details
self._stable_sort_or_addresses()
self._fpr = self._data['fingerprint']
self._running = self._guard = self._v2dir = 0.
self._split_dirport()
self._compute_orport()
if self.orport is None:
raise Exception("Failed to get an orport for %s."%(self._fpr,))
self._compute_ipv6addr()
if not self.has_ipv6():
logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
self._compute_version()
self._extra_info_cache = None
def _stable_sort_or_addresses(self):
# replace self._data['or_addresses'] with a stable ordering,
# sorting the secondary addresses in string order
# leave the received order in self._data['or_addresses_raw']
self._data['or_addresses_raw'] = self._data['or_addresses']
or_address_primary = self._data['or_addresses'][:1]
# subsequent entries in the or_addresses array are in an arbitrary order
# so we stabilise the addresses by sorting them in string order
or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
self._data['or_addresses'] = or_addresses_stable
def get_fingerprint(self):
return self._fpr
# is_valid_ipv[46]_address by gsathya, karsten, 2013
@staticmethod
def is_valid_ipv4_address(address):
if not isinstance(address, six.string_types):
return False
# check if there are four period separated values
if address.count(".") != 3:
return False
# checks that each value in the octet are decimal values between 0-255
for entry in address.split("."):
if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
return False
elif entry[0] == "0" and len(entry) > 1:
return False # leading zeros, for instance in "1.2.3.001"
return True
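  # Illustrative results for is_valid_ipv4_address():
  #   '187.18.0.1' -> True
  #   '1.2.3.001'  -> False (leading zeros)
  #   '256.1.1.1'  -> False (octet out of range)
  #   '1.2.3'      -> False (too few octets)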
@staticmethod
def is_valid_ipv6_address(address):
if not isinstance(address, six.string_types):
return False
# remove brackets
address = address[1:-1]
# addresses are made up of eight colon separated groups of four hex digits
# with leading zeros being optional
# https://en.wikipedia.org/wiki/IPv6#Address_format
colon_count = address.count(":")
if colon_count > 7:
return False # too many groups
    elif colon_count != 7 and "::" not in address:
return False # not enough groups and none are collapsed
elif address.count("::") > 1 or ":::" in address:
return False # multiple groupings of zeros can't be collapsed
found_ipv4_on_previous_entry = False
for entry in address.split(":"):
# If an IPv6 address has an embedded IPv4 address,
# it must be the last entry
if found_ipv4_on_previous_entry:
return False
      if not re.match("^[0-9a-fA-F]{0,4}$", entry):
if not Candidate.is_valid_ipv4_address(entry):
return False
else:
found_ipv4_on_previous_entry = True
return True
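  # Illustrative results for is_valid_ipv6_address(), which expects the
  # bracketed form used in or_addresses:
  #   '[2001:db8::1]'    -> True
  #   '[::ffff:1.2.3.4]' -> True  (embedded IPv4 as the last group)
  #   '[2001:db8:::1]'   -> False (':::' is invalid)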
def _split_dirport(self):
# Split the dir_address into dirip and dirport
self._data['dir_address'] = six.ensure_text(self._data['dir_address'])
(self.dirip, _dirport) = self._data['dir_address'].split(':', 2)
self.dirport = int(_dirport)
def _compute_orport(self):
# Choose the first ORPort that's on the same IPv4 address as the DirPort.
# In rare circumstances, this might not be the primary ORPort address.
# However, _stable_sort_or_addresses() ensures we choose the same one
# every time, even if onionoo changes the order of the secondaries.
self._split_dirport()
self.orport = None
for i in self._data['or_addresses']:
if i != self._data['or_addresses'][0]:
logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
i = six.ensure_text(i)
(ipaddr, port) = i.rsplit(':', 1)
if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
self.orport = int(port)
return
def _compute_ipv6addr(self):
# Choose the first IPv6 address that uses the same port as the ORPort
# Or, choose the first IPv6 address in the list
# _stable_sort_or_addresses() ensures we choose the same IPv6 address
# every time, even if onionoo changes the order of the secondaries.
self.ipv6addr = None
self.ipv6orport = None
# Choose the first IPv6 address that uses the same port as the ORPort
for i in self._data['or_addresses']:
i = six.ensure_text(i)
(ipaddr, port) = i.rsplit(':', 1)
port = int(port)
if (port == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
self.ipv6addr = ipaddr
self.ipv6orport = port
return
# Choose the first IPv6 address in the list
for i in self._data['or_addresses']:
i = six.ensure_text(i)
(ipaddr, port) = i.rsplit(':', 1)
port = int(port)
if Candidate.is_valid_ipv6_address(ipaddr):
self.ipv6addr = ipaddr
self.ipv6orport = port
return
def _compute_version(self):
# parse the version out of the platform string
# The platform looks like: "Tor 0.2.7.6 on Linux"
self._data['version'] = None
if self._data['platform'] is None:
return
# be tolerant of weird whitespacing, use a whitespace split
tokens = self._data['platform'].split()
for token in tokens:
vnums = token.split('.')
# if it's at least a.b.c.d, with potentially an -alpha-dev, -alpha, -rc
if (len(vnums) >= 4 and vnums[0].isdigit() and vnums[1].isdigit() and
vnums[2].isdigit()):
self._data['version'] = token
return
# From #20509
# bug #20499 affects versions from 0.2.9.1-alpha-dev to 0.2.9.4-alpha-dev
# and version 0.3.0.0-alpha-dev
# Exhaustive lists are hard to get wrong
STALE_CONSENSUS_VERSIONS = ['0.2.9.1-alpha-dev',
'0.2.9.2-alpha',
'0.2.9.2-alpha-dev',
'0.2.9.3-alpha',
'0.2.9.3-alpha-dev',
'0.2.9.4-alpha',
'0.2.9.4-alpha-dev',
'0.3.0.0-alpha-dev'
]
def is_valid_version(self):
# call _compute_version before calling this
# is the version of the relay a version we want as a fallback?
# checks both recommended versions and bug #20499 / #20509
#
# if the relay doesn't have a recommended version field, exclude the relay
if 'recommended_version' not in self._data:
log_excluded('%s not a candidate: no recommended_version field',
self._fpr)
return False
if not self._data['recommended_version']:
log_excluded('%s not a candidate: version not recommended', self._fpr)
return False
# if the relay doesn't have version field, exclude the relay
if 'version' not in self._data:
log_excluded('%s not a candidate: no version field', self._fpr)
return False
if self._data['version'] in Candidate.STALE_CONSENSUS_VERSIONS:
logging.warning('%s not a candidate: version delivers stale consensuses',
self._fpr)
return False
return True
@staticmethod
def _extract_generic_history(history, which='unknown'):
# given a tree like this:
# {
# "1_month": {
# "count": 187,
# "factor": 0.001001001001001001,
# "first": "2015-02-27 06:00:00",
# "interval": 14400,
# "last": "2015-03-30 06:00:00",
# "values": [
# 999,
# 999
# ]
# },
# "1_week": {
# "count": 169,
# "factor": 0.001001001001001001,
# "first": "2015-03-23 07:30:00",
# "interval": 3600,
# "last": "2015-03-30 07:30:00",
# "values": [ ...]
# },
# "1_year": {
# "count": 177,
# "factor": 0.001001001001001001,
# "first": "2014-04-11 00:00:00",
# "interval": 172800,
# "last": "2015-03-29 00:00:00",
# "values": [ ...]
# },
# "3_months": {
# "count": 185,
# "factor": 0.001001001001001001,
# "first": "2014-12-28 06:00:00",
# "interval": 43200,
# "last": "2015-03-30 06:00:00",
# "values": [ ...]
# }
# },
# extract exactly one piece of data per time interval,
# using smaller intervals where available.
#
# returns list of (age, length, value) dictionaries.
generic_history = []
periods = list(history.keys())
periods.sort(key = lambda x: history[x]['interval'])
now = datetime.datetime.utcnow()
newest = now
for p in periods:
h = history[p]
interval = datetime.timedelta(seconds = h['interval'])
this_ts = parse_ts(h['last'])
if (len(h['values']) != h['count']):
logging.warning('Inconsistent value count in %s document for %s'
%(p, which))
for v in reversed(h['values']):
if (this_ts <= newest):
agt1 = now - this_ts
agt2 = interval
agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
* 10**6) / 10**6
agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
* 10**6) / 10**6
generic_history.append(
{ 'age': agetmp1,
'length': agetmp2,
'value': v
})
newest = this_ts