#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import faulthandler
faulthandler.enable(file=sys.__stderr__) # will catch segfaults and write to stderr
from lib.venv_checker import check_venv
check_venv() # this check must run even before __main__, as imports might otherwise not resolve
import subprocess
import json
import os
import time
from html import escape
import importlib
import re
from pathlib import Path
import random
import shutil
import yaml
from collections import OrderedDict
from datetime import datetime
import platform
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
from lib import utils
from lib import process_helpers
from lib import hardware_info
from lib import hardware_info_root
from lib import error_helpers
from lib.repo_info import get_repo_info
from lib.debug_helper import DebugHelper
from lib.terminal_colors import TerminalColors
from lib.schema_checker import SchemaChecker
from lib.db import DB
from lib.global_config import GlobalConfig
from lib.notes import Notes
from lib import system_checks
from lib.machine import Machine
from lib import metric_importer
def arrows(text):
return f"\n\n>>>> {text} <<<<\n\n"
class Runner:
def __init__(self,
*, uri, uri_type, name=None, filename='usage_scenario.yml', branch=None,
debug_mode=False, allow_unsafe=False, skip_system_checks=False,
skip_unsafe=False, verbose_provider_boot=False, full_docker_prune=False,
dev_no_sleeps=False, dev_cache_build=False, dev_no_metrics=False,
dev_flow_timetravel=False, dev_no_optimizations=False, docker_prune=False, job_id=None,
user_id=1, measurement_flow_process_duration=None, measurement_total_duration=None, dev_no_phase_stats=False,
skip_volume_inspect=False):
if skip_unsafe is True and allow_unsafe is True:
raise RuntimeError('Cannot specify both --skip-unsafe and --allow-unsafe')
# variables that should not change if you call run multiple times
if name:
self._name = name
else:
self._name = f"Run {datetime.now()}"
self._debugger = DebugHelper(debug_mode)
self._allow_unsafe = allow_unsafe
self._skip_unsafe = skip_unsafe
self._skip_system_checks = skip_system_checks
self._skip_volume_inspect = skip_volume_inspect
self._verbose_provider_boot = verbose_provider_boot
self._full_docker_prune = full_docker_prune
self._docker_prune = docker_prune
self._dev_no_sleeps = dev_no_sleeps
self._dev_cache_build = dev_cache_build
self._dev_no_metrics = dev_no_metrics
self._dev_flow_timetravel = dev_flow_timetravel
self._dev_no_optimizations = dev_no_optimizations
self._dev_no_phase_stats = dev_no_phase_stats
self._uri = uri
self._uri_type = uri_type
self._original_filename = filename
self._branch = branch
self._tmp_folder = Path('/tmp/green-metrics-tool').resolve() # since linux has /tmp and macos /private/tmp
self._usage_scenario = {}
self._architecture = utils.get_architecture()
        self._sci = {'R_d': None, 'R': 0}
        self._sci |= GlobalConfig().config.get('sci', {}) # merge in data from machine config like I, TE etc.; default to {} so a missing key cannot break the merge
self._job_id = job_id
self._arguments = locals()
self._repo_folder = f"{self._tmp_folder}/repo" # default if not changed in checkout_repository
self._run_id = None
self._commit_hash = None
self._commit_timestamp = None
self._user_id = user_id
self._measurement_flow_process_duration = measurement_flow_process_duration
self._measurement_total_duration = measurement_total_duration
self._last_measurement_duration = 0
        del self._arguments['self'] # self is not needed and also cannot be serialized, so we remove it
        # transient variables that are created by the runner itself
        # these are accessed and processed on cleanup and then reset
        # They are prefixed with __ as they should not be changed from outside, because this could break the state of the runner
self.__stdout_logs = OrderedDict()
self.__containers = {}
self.__networks = []
self.__ps_to_kill = []
self.__ps_to_read = []
self.__metric_providers = []
self.__notes_helper = Notes()
self.__phases = OrderedDict()
self.__start_measurement_seconds = None
self.__start_measurement = None
self.__end_measurement = None
self.__services_to_pause_phase = {}
self.__join_default_network = False
self.__docker_params = []
self.__working_folder = self._repo_folder
self.__working_folder_rel = ''
self.__image_sizes = {}
self.__volume_sizes = {}
# we currently do not use this variable
# self.__filename = self._original_filename # this can be changed later if working directory changes
def custom_sleep(self, sleep_time):
if not self._dev_no_sleeps:
print(TerminalColors.HEADER, '\nSleeping for : ', sleep_time, TerminalColors.ENDC)
time.sleep(sleep_time)
def get_optimizations_ignore(self):
return self._usage_scenario.get('optimizations_ignore', [])
# This function takes a path and a file and joins them while making sure that no one is trying to escape the
# path with `..`, symbolic links or similar.
# We always return the same error message including the path and file parameter, never `filename` as
# otherwise we might disclose if certain files exist or not.
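    # A minimal sketch of the intended behaviour (hypothetical paths):
    #   join_paths('/tmp/green-metrics-tool/repo', 'stack/compose.yml')  -> returns the resolved file path
    #   join_paths('/tmp/green-metrics-tool/repo', '../../etc/passwd')   -> raises ValueError, as the
    #   resolved path escapes the repo folder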
def join_paths(self, path, path2, force_path_as_root=False):
filename = os.path.realpath(os.path.join(path, path2))
# If the original path is a symlink we need to resolve it.
path = os.path.realpath(path)
# This is a special case in which the file is '.'
if filename == path.rstrip('/'):
return filename
if not filename.startswith(self._repo_folder):
raise ValueError(f"{path2} must not be in folder above root repo folder {self._repo_folder}")
if force_path_as_root and not filename.startswith(path):
raise RuntimeError(f"{path2} must not be in folder above {path}")
        # Another implementation of the same check via pathlib. It is redundant, but we want to be extra secure 👾
if Path(self._repo_folder).resolve(strict=True) not in Path(path, path2).resolve(strict=True).parents:
raise ValueError(f"{path2} must not be in folder above root repo folder {self._repo_folder}")
if force_path_as_root and Path(path).resolve(strict=True) not in Path(path, path2).resolve(strict=True).parents:
raise ValueError(f"{path2} must not be in folder above {path}")
if os.path.exists(filename):
return filename
raise FileNotFoundError(f"{path2} in {path} not found")
def initialize_folder(self, path):
shutil.rmtree(path, ignore_errors=True)
Path(path).mkdir(parents=True, exist_ok=True)
def save_notes_runner(self):
if not self._run_id:
return # Nothing to do, but also no hard error needed
print(TerminalColors.HEADER, '\nSaving notes: ', TerminalColors.ENDC, self.__notes_helper.get_notes())
self.__notes_helper.save_to_db(self._run_id)
def check_system(self, mode='start'):
if self._skip_system_checks:
print("System check skipped")
return
        if mode == 'start':
            system_checks.check_start()
        else:
            raise RuntimeError(f"Unknown mode for system check: {mode}")
def checkout_repository(self):
print(TerminalColors.HEADER, '\nChecking out repository', TerminalColors.ENDC)
if self._uri_type == 'URL':
            # If a URL is provided we always remove the target folder first, because -v directory binding always creates it.
            # We do not check the removal, as it might fail when the directory is missing due to a manual delete.
if self._branch:
print(f"Branch specified: {self._branch}")
# git clone -b <branchname> --single-branch <remote-repo-url>
subprocess.run(
[
'git',
'clone',
'--depth', '1',
'-b', self._branch,
'--single-branch',
'--recurse-submodules',
'--shallow-submodules',
self._uri,
self._repo_folder
],
check=True,
capture_output=True,
encoding='UTF-8',
)
else:
subprocess.run(
[
'git',
'clone',
'--depth', '1',
'--single-branch',
'--recurse-submodules',
'--shallow-submodules',
self._uri,
self._repo_folder
],
check=True,
capture_output=True,
encoding='UTF-8'
) # always name target-dir repo according to spec
else:
if self._branch:
# we never want to checkout a local directory to a different branch as this might also be the GMT directory itself and might confuse the tool
                raise RuntimeError('Specified --branch but using local URI. Did you mean to specify a GitHub URL?')
# If the provided uri is a symlink we need to resolve it.
path = os.path.realpath(self._uri)
self.__working_folder = self._repo_folder = path
self._branch = subprocess.check_output(['git', 'branch', '--show-current'], cwd=self._repo_folder, encoding='UTF-8').strip()
git_repo_root = subprocess.check_output(['git', 'rev-parse', '--show-toplevel'], cwd=self._repo_folder, encoding='UTF-8').strip()
if git_repo_root != self._repo_folder:
raise RuntimeError('Supplied folder through --uri is not the root of the git repository. Please only supply the root folder and then the target directory through --filename')
# we can safely do this, even with problematic folders, as the folder can only be a local unsafe one when
# running in CLI mode
self._commit_hash, self._commit_timestamp = get_repo_info(self._repo_folder)
    # This method loads the yml file and takes care that the includes work and are secure.
    # It uses the tagging infrastructure provided by https://pyyaml.org/wiki/PyYAMLDocumentation
    # Inspiration from https://github.com/tanbro/pyyaml-include, which we can't use as it doesn't
    # do security checking and has no option to select a sub-key when including
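    # A sketch of the two supported forms in a usage_scenario.yml (file name is hypothetical):
    #   services: !include compose.yml                # ScalarNode: include the whole file
    #   services: !include [compose.yml, services]    # SequenceNode: include only the 'services' key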
def load_yml_file(self):
#pylint: disable=too-many-ancestors
runner_join_paths = self.join_paths
class Loader(yaml.SafeLoader):
def __init__(self, stream):
# We need to find our own root as the Loader is instantiated in PyYaml
self._root = os.path.split(stream.name)[0]
super().__init__(stream)
def include(self, node):
# We allow two types of includes
# !include <filename> => ScalarNode
# and
# !include <filename> <selector> => SequenceNode
if isinstance(node, yaml.nodes.ScalarNode):
nodes = [self.construct_scalar(node)]
elif isinstance(node, yaml.nodes.SequenceNode):
nodes = self.construct_sequence(node)
else:
raise ValueError("We don't support Mapping Nodes to date")
try:
filename = runner_join_paths(self._root, nodes[0], force_path_as_root=True)
except RuntimeError as exc:
raise ValueError(f"Included compose file \"{nodes[0]}\" may only be in the same directory as the usage_scenario file as otherwise relative context_paths and volume_paths cannot be mapped anymore") from exc
with open(filename, 'r', encoding='utf-8') as f:
                    # We want to enable a deep search for keys
                    def recursive_lookup(k, d):
                        if k in d:
                            return d[k]
                        for v in d.values():
                            if isinstance(v, dict):
                                if (result := recursive_lookup(k, v)) is not None: # keep searching sibling dicts instead of returning the first miss
                                    return result
                        return None
# We can use load here as the Loader extends SafeLoader
if len(nodes) == 1:
# There is no selector specified
return yaml.load(f, Loader)
return recursive_lookup(nodes[1], yaml.load(f, Loader))
Loader.add_constructor('!include', Loader.include)
usage_scenario_file = self.join_paths(self._repo_folder, self._original_filename)
# We set the working folder now to the actual location of the usage_scenario
if '/' in self._original_filename:
self.__working_folder_rel = self._original_filename.rsplit('/', 1)[0]
self.__working_folder = usage_scenario_file.rsplit('/', 1)[0]
#self.__filename = usage_scenario_file.rsplit('/', 1)[1] # we currently do not use this variable
print("Working folder changed to ", self.__working_folder)
with open(usage_scenario_file, 'r', encoding='utf-8') as fp:
# We can use load here as the Loader extends SafeLoader
yml_obj = yaml.load(fp, Loader)
# Now that we have parsed the yml file we need to check for the special case in which we have a
# compose-file key. In this case we merge the data we find under this key but overwrite it with
# the data from the including file.
# We need to write our own merge method as dict.update doesn't do a "deep" merge
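        # For illustration: merge_dicts({'a': {'b': 1, 'c': 2}}, {'a': {'c': 3}})
        # returns {'a': {'b': 1, 'c': 3}} - nested keys are merged and dict2 wins on conflicts.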
def merge_dicts(dict1, dict2):
if isinstance(dict1, dict):
for k, v in dict2.items():
if k in dict1 and isinstance(v, dict) and isinstance(dict1[k], dict):
merge_dicts(dict1[k], v)
else:
dict1[k] = v
return dict1
return dict1
new_dict = {}
if 'compose-file' in yml_obj.keys():
for k,v in yml_obj['compose-file'].items():
if k in yml_obj:
new_dict[k] = merge_dicts(v,yml_obj[k])
else: # just copy over if no key exists in usage_scenario
yml_obj[k] = v
del yml_obj['compose-file']
yml_obj.update(new_dict)
        # If a service is defined as None we remove it. This lets a compose file define
        # all the various services while the usage_scenario disables some of them. This is quite useful
        # when creating benchmarking scripts where you want all options in the compose file but not in each benchmark.
        # The cleaner way would be to handle an empty service key throughout the code, but that would get quite messy,
        # so we chose to remove such services right at the start.
for key in [sname for sname, content in yml_obj.get('services', {}).items() if content is None]:
del yml_obj['services'][key]
self._usage_scenario = yml_obj
def initial_parse(self):
self.load_yml_file()
schema_checker = SchemaChecker(validate_compose_flag=True)
schema_checker.check_usage_scenario(self._usage_scenario)
print(TerminalColors.HEADER, '\nHaving Usage Scenario ', self._usage_scenario['name'], TerminalColors.ENDC)
print('From: ', self._usage_scenario['author'])
print('Description: ', self._usage_scenario['description'], '\n')
if self._allow_unsafe:
print(TerminalColors.WARNING, arrows('Warning: Runner is running in unsafe mode'), TerminalColors.ENDC)
if self._usage_scenario.get('architecture') is not None and self._architecture != self._usage_scenario['architecture'].lower():
raise RuntimeError(f"Specified architecture does not match system architecture: system ({self._architecture}) != specified ({self._usage_scenario.get('architecture')})")
self._sci['R_d'] = self._usage_scenario.get('sci', {}).get('R_d', None)
def prepare_docker(self):
# Disable Docker CLI hints (e.g. "What's Next? ...")
os.environ['DOCKER_CLI_HINTS'] = 'false'
def check_running_containers(self):
result = subprocess.run(['docker', 'ps' ,'--format', '{{.Names}}'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True, encoding='UTF-8')
for line in result.stdout.splitlines():
            for running_container in line.split(','): # if a container has multiple names, they are comma-separated, so we check each name individually
for service_name in self._usage_scenario.get('services', {}):
if 'container_name' in self._usage_scenario['services'][service_name]:
container_name = self._usage_scenario['services'][service_name]['container_name']
else:
container_name = service_name
if running_container == container_name:
raise PermissionError(f"Container '{container_name}' is already running on system. Please close it before running the tool.")
def populate_image_names(self):
for service_name, service in self._usage_scenario.get('services', {}).items():
            if not service.get('image', None): # image is a non-mandatory field. But we need it, so we generate a temporary name
if self._dev_cache_build:
service['image'] = f"{service_name}"
else:
service['image'] = f"{service_name}_{random.randint(500000,10000000)}"
def remove_docker_images(self):
if self._dev_cache_build:
return
print(TerminalColors.HEADER, '\nRemoving all temporary GMT images', TerminalColors.ENDC)
subprocess.run(
'docker images --format "{{.Repository}}:{{.Tag}}" | grep "gmt_run_tmp" | xargs docker rmi -f',
shell=True,
stderr=subprocess.DEVNULL, # to suppress showing of stderr
check=False,
)
if self._full_docker_prune:
print(TerminalColors.HEADER, '\nStopping and removing all containers, build caches, volumes and images on the system', TerminalColors.ENDC)
subprocess.run('docker ps -aq | xargs docker stop', shell=True, check=False)
subprocess.run('docker images --format "{{.ID}}" | xargs docker rmi -f', shell=True, check=False)
subprocess.run(['docker', 'system', 'prune' ,'--force', '--volumes'], check=True)
elif self._docker_prune:
            print(TerminalColors.HEADER, '\nRemoving all unassociated build caches, networks, volumes and stopped containers on the system', TerminalColors.ENDC)
subprocess.run(['docker', 'system', 'prune' ,'--force', '--volumes'], check=True)
else:
            print(TerminalColors.WARNING, arrows('Warning: GMT is not instructed to prune docker images and build caches. \nWe recommend setting --docker-prune to remove build caches and anonymous volumes, because otherwise your disk will fill up very quickly. If you also want to measure network I/O delay for pulling images and have a dedicated measurement machine, please set --full-docker-prune'), TerminalColors.ENDC)
    '''
        A machine will always register in the database on run.
        This means that it will write its machine_id and machine_description to the machines table
        and then link itself in the runs table accordingly.
    '''
def register_machine_id(self):
config = GlobalConfig().config
if config['machine'].get('id') is None \
or not isinstance(config['machine']['id'], int) \
or config['machine'].get('description') is None \
or config['machine']['description'] == '':
raise RuntimeError('You must set machine id and machine description')
machine = Machine(machine_id=config['machine'].get('id'), description=config['machine'].get('description'))
machine.register()
def initialize_run(self):
config = GlobalConfig().config
gmt_hash, _ = get_repo_info(CURRENT_DIR)
        # There are two ways we get hardware info: first, things that don't require root, which we get through
        # a method call; and then things that need root privileges, which we must call as a subprocess with sudo.
        # The install.sh script should have added that script to the sudoers file.
machine_specs = hardware_info.get_default_values()
if len(hardware_info_root.get_root_list()) > 0:
ps = subprocess.run(['sudo', '/usr/bin/python3', '-m', 'lib.hardware_info_root'], stdout=subprocess.PIPE, cwd=CURRENT_DIR, check=True, encoding='UTF-8')
machine_specs_root = json.loads(ps.stdout)
machine_specs.update(machine_specs_root)
measurement_config = {}
measurement_config['settings'] = {k: v for k, v in config['measurement'].items() if k != 'metric-providers'}
measurement_config['providers'] = utils.get_metric_providers(config)
measurement_config['sci'] = self._sci
        # We issue a fetch_one() instead of a query() here, because we want the RUN_ID returned
self._run_id = DB().fetch_one("""
INSERT INTO runs (
job_id, name, uri, branch, filename,
commit_hash, commit_timestamp, runner_arguments,
machine_specs, measurement_config,
usage_scenario, gmt_hash,
machine_id, user_id, created_at
)
VALUES (
%s, %s, %s, %s, %s,
%s, %s, %s,
%s, %s,
%s, %s,
%s, %s, NOW()
)
RETURNING id
""", params=(
self._job_id, self._name, self._uri, self._branch, self._original_filename,
self._commit_hash, self._commit_timestamp, json.dumps(self._arguments),
escape(json.dumps(machine_specs), quote=False), json.dumps(measurement_config),
escape(json.dumps(self._usage_scenario), quote=False), gmt_hash,
GlobalConfig().config['machine']['id'], self._user_id,
))[0]
return self._run_id
def import_metric_providers(self):
if self._dev_no_metrics:
print(TerminalColors.HEADER, '\nSkipping import of metric providers', TerminalColors.ENDC)
return
config = GlobalConfig().config
print(TerminalColors.HEADER, '\nImporting metric providers', TerminalColors.ENDC)
metric_providers = utils.get_metric_providers(config)
if not metric_providers:
print(TerminalColors.WARNING, arrows('No metric providers were configured in config.yml. Was this intentional?'), TerminalColors.ENDC)
return
subprocess.run(["docker", "info"], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, encoding='UTF-8', check=True)
for metric_provider in metric_providers: # will iterate over keys
module_path, class_name = metric_provider.rsplit('.', 1)
module_path = f"metric_providers.{module_path}"
conf = metric_providers[metric_provider] or {}
print(f"Importing {class_name} from {module_path}")
module = importlib.import_module(module_path)
if self._skip_system_checks:
metric_provider_obj = getattr(module, class_name)(**conf, skip_check=True)
print(f"Configuration is {conf}; skip_check=true")
else:
metric_provider_obj = getattr(module, class_name)(**conf)
print(f"Configuration is {conf}")
self.__metric_providers.append(metric_provider_obj)
if hasattr(metric_provider_obj, 'get_docker_params'):
services_list = ",".join(list(self._usage_scenario.get('services', {}).keys()))
self.__docker_params += metric_provider_obj.get_docker_params(no_proxy_list=services_list)
        self.__metric_providers.sort(key=lambda item: 'rapl' not in item.__class__.__name__.lower()) # sort RAPL-based providers to the front of the list
def download_dependencies(self):
if self._dev_cache_build:
print(TerminalColors.HEADER, '\nSkipping downloading dependencies', TerminalColors.ENDC)
return
print(TerminalColors.HEADER, '\nDownloading dependencies', TerminalColors.ENDC)
subprocess.run(['docker', 'pull', 'gcr.io/kaniko-project/executor:latest'], check=True)
def get_build_info(self, service):
if isinstance(service['build'], str):
# If build is a string we can assume the short form
context = service['build']
dockerfile = 'Dockerfile'
else:
context = service['build'].get('context', '.')
dockerfile = service['build'].get('dockerfile', 'Dockerfile')
return context, dockerfile
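    # For illustration (hypothetical input): clean_image_name('My-App:1.0') returns 'myapp10_gmt_run_tmp',
    # since all characters outside A-Za-z0-9_ are stripped and the name is lowercased before tagging.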
def clean_image_name(self, name):
# clean up image name for problematic characters
name = re.sub(r'[^A-Za-z0-9_]', '', name)
# only lowercase letters are allowed for tags
name = name.lower()
name = f"{name}_gmt_run_tmp"
return name
def build_docker_images(self):
print(TerminalColors.HEADER, '\nBuilding Docker images', TerminalColors.ENDC)
# Create directory /tmp/green-metrics-tool/docker_images
temp_dir = f"{self._tmp_folder}/docker_images"
self.initialize_folder(temp_dir)
# technically the usage_scenario needs no services and can also operate on an empty list
# This use case is when you have running containers on your host and want to benchmark some code running in them
for _, service in self._usage_scenario.get('services', {}).items():
# minimal protection from possible shell escapes.
# since we use subprocess without shell we should be safe though
if re.findall(r'(\.\.|\$|\'|"|`|!)', service['image']):
raise ValueError(f"In scenario file the builds contains an invalid image name: {service['image']}")
tmp_img_name = self.clean_image_name(service['image'])
# If we are in developer repeat runs check if the docker image has already been built
try:
subprocess.run(['docker', 'inspect', '--type=image', tmp_img_name],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
encoding='UTF-8',
check=True)
                # The image exists, so skip the build
                print(f"Image {service['image']} exists in build cache. Skipping build ...")
continue
except subprocess.CalledProcessError:
pass
if 'build' in service:
context, dockerfile = self.get_build_info(service)
print(f"Building {service['image']}")
self.__notes_helper.add_note({'note': f"Building {service['image']}", 'detail_name': '[NOTES]', 'timestamp': int(time.time_ns() / 1_000)})
                # Make sure the context docker file exists and is not trying to escape some root. We don't need the return values
context_path = self.join_paths(self.__working_folder, context)
self.join_paths(context_path, dockerfile)
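                # Note: the build is delegated to Kaniko, a daemonless image builder that runs inside
                # a container, so the untrusted Dockerfile never executes against the host Docker daemon directly.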
docker_build_command = ['docker', 'run', '--rm',
'-v', '/workspace',
'-v', f"{self._repo_folder}:/tmp/repo:ro", # this is the folder where the usage_scenario is!
'-v', f"{temp_dir}:/output",
'gcr.io/kaniko-project/executor:latest',
f"--dockerfile=/tmp/repo/{self.__working_folder_rel}/{context}/{dockerfile}",
'--context', f'dir:///tmp/repo/{self.__working_folder_rel}/{context}',
f"--destination={tmp_img_name}",
f"--tar-path=/output/{tmp_img_name}.tar",
'--cleanup=true',
'--no-push']
if self.__docker_params:
docker_build_command[2:2] = self.__docker_params
print(' '.join(docker_build_command))
if self._measurement_total_duration:
ps = subprocess.run(docker_build_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='UTF-8', timeout=self._measurement_total_duration, check=False)
else:
ps = subprocess.run(docker_build_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='UTF-8', check=False)
if ps.returncode != 0:
print(f"Error: {ps.stderr} \n {ps.stdout}")
raise OSError(f"Docker build failed\nStderr: {ps.stderr}\nStdout: {ps.stdout}")
# import the docker image locally
image_import_command = ['docker', 'load', '-q', '-i', f"{temp_dir}/{tmp_img_name}.tar"]
print(' '.join(image_import_command))
ps = subprocess.run(image_import_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='UTF-8', check=False)
if ps.returncode != 0 or ps.stderr != "":
print(f"Error: {ps.stderr} \n {ps.stdout}")
raise OSError("Docker image import failed")
else:
print(f"Pulling {service['image']}")
self.__notes_helper.add_note({'note':f"Pulling {service['image']}" , 'detail_name': '[NOTES]', 'timestamp': int(time.time_ns() / 1_000)})
ps = subprocess.run(['docker', 'pull', service['image']], stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='UTF-8', check=False)
if ps.returncode != 0:
print(f"Error: {ps.stderr} \n {ps.stdout}")
if __name__ == '__main__':
print(TerminalColors.OKCYAN, '\nThe docker image could not be pulled. Since you are working locally we can try looking in your local images. Do you want that? (y/N).', TerminalColors.ENDC)
if sys.stdin.readline().strip().lower() == 'y':
try:
subprocess.run(['docker', 'inspect', '--type=image', service['image']],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
encoding='UTF-8',
check=True)
print('Docker image found locally. Tagging now for use in cached runs ...')
                            except subprocess.CalledProcessError as exc:
                                raise OSError(f"Docker pull failed and image does not exist locally. Is your image name correct and are you connected to the internet: {service['image']}") from exc
else:
raise OSError(f"Docker pull failed. Is your image name correct and are you connected to the internet: {service['image']}")
else:
raise OSError(f"Docker pull failed. Is your image name correct and are you connected to the internet: {service['image']}")
                # tagging must be done in both the pull and the local case, so we can find the correct container later
subprocess.run(['docker', 'tag', service['image'], tmp_img_name], check=True)
        # Delete the temporary docker_images directory
shutil.rmtree(temp_dir)
def save_image_and_volume_sizes(self):
for _, service in self._usage_scenario.get('services', {}).items():
tmp_img_name = self.clean_image_name(service['image'])
            # This will report bogus values on macOS sadly that do not align with "docker images" size info ...
            output = subprocess.check_output(
                ['docker', 'image', 'inspect', tmp_img_name, '--format={{.Size}}'],
                encoding='UTF-8',
            )
self.__image_sizes[service['image']] = int(output.strip())
# du -s -b does not work on macOS and also the docker image is in a VM and not accessible with du for us
if not self._skip_volume_inspect and self._allow_unsafe and platform.system() != 'Darwin':
for volume in self._usage_scenario.get('volumes', {}):
try:
output = subprocess.check_output(
['docker', 'volume', 'inspect', volume, '--format={{.Mountpoint}}'],
encoding='UTF-8',
)
output = subprocess.check_output(
['du', '-s', '-b', output.strip()],
encoding='UTF-8',
)
self.__volume_sizes[volume] = int(output.strip().split('\t', maxsplit=1)[0])
except Exception as exc:
raise RuntimeError('Docker volumes could not be inspected. This can happen if you are storing images in a root only accessible location. Consider switching to docker rootless, running with --skip-volume-inspect or running GMT with sudo.') from exc
DB().query("""
UPDATE runs
SET machine_specs = machine_specs || %s
WHERE id = %s
""", params=(json.dumps({'Container Image Sizes': self.__image_sizes, 'Container Volume Sizes': self.__volume_sizes}), self._run_id))
def setup_networks(self):
        # some rare scenarios have no network at all, machine learning workloads for example
if 'networks' in self._usage_scenario:
print(TerminalColors.HEADER, '\nSetting up networks', TerminalColors.ENDC)
for network in self._usage_scenario['networks']:
print('Creating network: ', network)
                # remove the network first if present to avoid an error; check=False because the network may simply not exist yet
subprocess.run(['docker', 'network', 'rm', network], stderr=subprocess.DEVNULL, check=False)
if self._usage_scenario['networks'][network] and self._usage_scenario['networks'][network].get('internal', False):
subprocess.check_output(['docker', 'network', 'create', '--internal', network])
else:
subprocess.check_output(['docker', 'network', 'create', network])
self.__networks.append(network)
else:
print(TerminalColors.HEADER, '\nNo network found. Creating default network', TerminalColors.ENDC)
network = f"GMT_default_tmp_network_{random.randint(500000,10000000)}"
print('Creating network: ', network)
            # remove the network first if present to avoid an error; check=False because the network may simply not exist yet
subprocess.run(['docker', 'network', 'rm', network], stderr=subprocess.DEVNULL, check=False)
subprocess.run(['docker', 'network', 'create', network], check=True)
self.__networks.append(network)
self.__join_default_network = True
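    # A small worked example of the ordering below: with
    #   services = {'web': {'depends_on': ['db']}, 'db': {}}
    # order_services() returns an OrderedDict with 'db' first, since 'web' depends on it.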
def order_services(self, services):
names_ordered = []
def order_service_names(service_name, visited=None):
if visited is None:
visited = set()
if service_name in visited:
raise RuntimeError(f"Cycle found in depends_on definition with service '{service_name}'!")
visited.add(service_name)
if service_name not in services:
raise RuntimeError(f"Dependent service '{service_name}' defined in 'depends_on' does not exist in usage_scenario!")
service = services[service_name]
if 'depends_on' in service:
for dep in service['depends_on']:
if dep not in names_ordered:
order_service_names(dep, visited)
if service_name not in names_ordered:
names_ordered.append(service_name)
# Iterate over all services and sort them with the recursive function 'order_service_names'
for service_name in services.keys():
order_service_names(service_name)
print("Startup order: ", names_ordered)
return OrderedDict((key, services[key]) for key in names_ordered)
def setup_services(self):
config = GlobalConfig().config
print(TerminalColors.HEADER, '\nSetting up services', TerminalColors.ENDC)
# technically the usage_scenario needs no services and can also operate on an empty list
# This use case is when you have running containers on your host and want to benchmark some code running in them
services = self._usage_scenario.get('services', {})
# Check if there are service dependencies defined with 'depends_on'.
# If so, change the order of the services accordingly.
services_ordered = self.order_services(services)
for service_name, service in services_ordered.items():
if 'container_name' in service:
container_name = service['container_name']
else:
container_name = service_name
print(TerminalColors.HEADER, '\nSetting up container for service:', service_name, TerminalColors.ENDC)
print('Container name:', container_name)
print('Resetting container')
# By using the -f we return with 0 if no container is found
# we always reset container without checking if something is running, as we expect that a user understands
# this mechanic when using docker based tools. A container with the same name may not run twice
subprocess.run(['docker', 'rm', '-f', container_name], stderr=subprocess.DEVNULL, check=True)
print('Creating container')
            # We are attaching the -it option here to keep STDIN open and a terminal attached.
            # This helps to keep an executable-only container open, which would otherwise exit
            # This MAY break in the future, as some docker CLI implementations do not allow this and require
            # the command args to be passed on run only
            # docker_run_string must stay a list, because this forces items to be quoted and escaped and prevents
            # injection of unwanted params
docker_run_string = ['docker', 'run', '-it', '-d', '--name', container_name]
docker_run_string.append('-v')
if 'folder-destination' in service:
docker_run_string.append(f"{self._repo_folder}:{service['folder-destination']}:ro")
else:
docker_run_string.append(f"{self._repo_folder}:/tmp/repo:ro")
if self.__docker_params:
docker_run_string[2:2] = self.__docker_params
if 'volumes' in service:
if self._allow_unsafe:
                    # On old docker clients we experienced a weird error that we deem legacy:
                    # if a volume is supplied in the compose.yml file in the form ./file.txt:/tmp/file.txt
                    # and the file does NOT exist, docker will create a folder of that name in the current working dir.
                    # This no longer happens on current clients and is hard to circumvent, so we keep it unfixed for now.
if not isinstance(service['volumes'], list):
raise RuntimeError(f"Service '{service_name}' volumes must be a list but is: {type(service['volumes'])}")
for volume in service['volumes']:
docker_run_string.append('-v')
if volume.startswith('./'): # we have a bind-mount with relative path
vol = volume.split(':',1) # there might be an :ro etc at the end, so only split once
path = os.path.realpath(os.path.join(self.__working_folder, vol[0]))
if not os.path.exists(path):
raise RuntimeError(f"Service '{service_name}' volume path does not exist: {path}")
docker_run_string.append(f"{path}:{vol[1]}")
else:
docker_run_string.append(f"{volume}")
else: # safe volume bindings are active by default
if not isinstance(service['volumes'], list):
raise RuntimeError(f"Service '{service_name}' volumes must be a list but is: {type(service['volumes'])}")
for volume in service['volumes']:
vol = volume.split(':')
                    # We always assume the format to be ./dir:dir:[flag], because if we allowed non-bind mounts people
                    # could create volumes that would linger on our system.
try:
path = self.join_paths(self.__working_folder, vol[0])
except FileNotFoundError as exc:
raise RuntimeError(f"The volume {vol[0]} could not be loaded or found at the specified path.") from exc
if len(vol) == 3:
if vol[2] != 'ro':
raise RuntimeError(f"Service '{service_name}': We only allow ro as parameter in volume mounts in unsafe mode")
docker_run_string.append('--mount')
docker_run_string.append(f"type=bind,source={path},target={vol[1]},readonly")
if 'ports' in service:
if self._allow_unsafe:
if not isinstance(service['ports'], list):
raise RuntimeError(f"ports must be a list but is: {type(service['ports'])}")
for ports in service['ports']:
print('Setting ports: ', service['ports'])
docker_run_string.append('-p')
docker_run_string.append(str(ports)) # Ports can also be an int according to schema checker, but needs to be a string when we use subprocess
elif self._skip_unsafe:
print(TerminalColors.WARNING, arrows('Found ports entry but not running in unsafe mode. Skipping'), TerminalColors.ENDC)
else:
raise RuntimeError('Found "ports" but neither --skip-unsafe nor --allow-unsafe is set')
if 'environment' in service:
env_var_check_errors = []
for docker_env_var in service['environment']:
# In a compose file env vars can be defined with a "=" and as a dict.
# We make sure that:
# environment:
# - DEBUG
# or
# environment:
# - image: "postgres: ${POSTGRES_VERSION}"
# will fail as this could expose env vars from the host system.
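                    # Accepted forms (hypothetical values) are an explicit assignment in the list form,
                    #   environment:
                    #     - DEBUG=1
                    # or the dict form:
                    #   environment:
                    #     DEBUG: "1"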
                    if isinstance(docker_env_var, str) and '=' in docker_env_var:
                        env_key, env_value = docker_env_var.split('=', maxsplit=1) # values may themselves contain '='
elif isinstance(service['environment'], dict):
env_key, env_value = str(docker_env_var), str(service['environment'][docker_env_var])
else:
raise RuntimeError('Environment variable needs to be a string with = or dict and non-empty. We do not allow the feature of forwarding variables from the host OS!')
# Check the key of the environment var
if not self._allow_unsafe and re.search(r'^[A-Z_]+[A-Z0-9_]*$', env_key) is None:
if self._skip_unsafe:
warn_message= arrows(f"Found environment var key with wrong format. Only ^[A-Z_]+[A-Z0-9_]*$ allowed: {env_key} - Skipping")
print(TerminalColors.WARNING, warn_message, TerminalColors.ENDC)
continue
env_var_check_errors.append(f"- key '{env_key}' has wrong format. Only ^[A-Z_]+[A-Z0-9_]*$ is allowed - Maybe consider using --allow-unsafe or --skip-unsafe")
# Check the value of the environment var
# We only forbid long values (>1024), every character is allowed.
# The value is directly passed to the container and is not evaluated on the host system, so there is no security related reason to forbid special characters.
if not self._allow_unsafe and len(env_value) > 1024:
if self._skip_unsafe:
print(TerminalColors.WARNING, arrows(f"Found environment var value with size {len(env_value)} (max allowed length is 1024) - Skipping env var '{env_key}'"), TerminalColors.ENDC)
continue
env_var_check_errors.append(f"- value of environment var '{env_key}' is too long {len(env_value)} (max allowed length is 1024) - Maybe consider using --allow-unsafe or --skip-unsafe")
docker_run_string.append('-e')
docker_run_string.append(f"{env_key}={env_value}")
if env_var_check_errors:
                raise RuntimeError('Docker container environment setup has problems:\n\n' + '\n'.join(env_var_check_errors))
if 'networks' in service:
for network in service['networks']:
docker_run_string.append('--net')
docker_run_string.append(network)
elif self.__join_default_network:
# only join default network if no other networks provided
# if this is true only one entry is in self.__networks
docker_run_string.append('--net')
docker_run_string.append(self.__networks[0])
if 'pause-after-phase' in service:
self.__services_to_pause_phase[service['pause-after-phase']] = self.__services_to_pause_phase.get(service['pause-after-phase'], []) + [container_name]
if 'deploy' in service:
if memory := service['deploy'].get('resources', {}).get('limits', {}).get('memory', None):
docker_run_string.append('--memory') # value in bytes
docker_run_string.append(str(memory))
if cpus := service['deploy'].get('resources', {}).get('limits', {}).get('cpus', None):
docker_run_string.append('--cpus') # value in cores
docker_run_string.append(str(cpus))
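            # Sketch of the mapping done below (hypothetical values): a compose healthcheck like
            #   healthcheck:
            #     test: ["CMD", "curl", "-f", "http://localhost/"]
            #     interval: 30s
            # becomes: --health-cmd 'curl -f http://localhost/' --health-interval 30s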
if 'healthcheck' in service: # must come last
if 'disable' in service['healthcheck'] and service['healthcheck']['disable'] is True:
docker_run_string.append('--no-healthcheck')
else:
if 'test' in service['healthcheck']:
docker_run_string.append('--health-cmd')
health_string = service['healthcheck']['test']
if isinstance(service['healthcheck']['test'], list):
health_string_copy = service['healthcheck']['test'].copy()
health_string_command = health_string_copy.pop(0)
if health_string_command not in ['CMD', 'CMD-SHELL']:
raise RuntimeError(f"Healthcheck starts with {health_string_command}. Please use 'CMD' or 'CMD-SHELL' when supplying as list. For disabling do not use 'NONE' but the disable argument.")
health_string = ' '.join(health_string_copy)
docker_run_string.append(health_string)
if 'interval' in service['healthcheck']:
docker_run_string.append('--health-interval')
docker_run_string.append(service['healthcheck']['interval'])
if 'timeout' in service['healthcheck']:
docker_run_string.append('--health-timeout')
docker_run_string.append(service['healthcheck']['timeout'])
if 'retries' in service['healthcheck']:
docker_run_string.append('--health-retries')
docker_run_string.append(str(service['healthcheck']['retries'])) # we need a str to pass to subprocess
if 'start_period' in service['healthcheck']:
docker_run_string.append('--health-start-period')
docker_run_string.append(service['healthcheck']['start_period'])
if 'start_interval' in service['healthcheck']:
docker_run_string.append('--health-start-interval')
docker_run_string.append(service['healthcheck']['start_interval'])
docker_run_string.append(self.clean_image_name(service['image']))
# Before finally starting the container for the current service, check if the dependent services are ready.
# If not, wait for some time. If a dependent service is not ready after a certain time, throw an error.
# If a healthcheck is defined, the container of the dependent service must become "healthy".
# If no healthcheck is defined, the container state "running" is sufficient.
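            # For example, a dependency that must be healthy before this service starts would be declared as:
            #   depends_on:
            #     db:
            #       condition: service_healthy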
if 'depends_on' in service:
for dependent_service in service['depends_on']:
dependent_container_name = dependent_service
if 'container_name' in services[dependent_service]:
dependent_container_name = services[dependent_service]["container_name"]
time_waited = 0
state = ''
health = 'healthy' # default because some containers have no health
max_waiting_time = config['measurement']['boot']['wait_time_dependencies']
while time_waited < max_waiting_time:
status_output = subprocess.check_output(
["docker", "container", "inspect", "-f", "{{.State.Status}}", dependent_container_name],
stderr=subprocess.STDOUT,
encoding='UTF-8',
)
state = status_output.strip()
if time_waited == 0 or state != "running":
print(f"Container state of dependent service '{dependent_service}': {state}")
if isinstance(service['depends_on'], dict) \
and 'condition' in service['depends_on'][dependent_service]:
condition = service['depends_on'][dependent_service]['condition']
if condition == 'service_healthy':
ps = subprocess.run(
["docker", "container", "inspect", "-f", "{{.State.Health.Status}}", dependent_container_name],
check=False,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, # put both in one stream
encoding='UTF-8'
)
health = ps.stdout.strip()
print(f"Container health of dependent service '{dependent_service}': {health}")
if ps.returncode != 0 or health == '<nil>':
raise RuntimeError(f"Health check for service '{dependent_service}' was requested by '{service_name}', but service has no healthcheck implemented! (Output was: {health})")
if health == 'unhealthy':
raise RuntimeError(f'Health check of container "{dependent_container_name}" failed terminally with status "unhealthy" after {time_waited}s')
elif condition == 'service_started':
pass
else:
raise RuntimeError(f"Unsupported condition in healthcheck for service '{service_name}': {condition}")
if state == 'running' and health == 'healthy':
break
time.sleep(1)
time_waited += 1
if state != 'running':