# prepare_benchmark

#!/usr/bin/env python3


import argparse
import logging
import os
import sys

import importlib.util

from checks import check_program_exists
from dateutil import parser

from logging.config import fileConfig
from s64da_benchmark_toolkit.streams import Streams, Benchmark

# Configure logging from 'logging.ini'. The path is relative, so it is
# resolved against the current working directory at start-up.
fileConfig('logging.ini')
# No name is passed, so this is the root logger.
logger = logging.getLogger()

if __name__ == '__main__':
    # Command-line entry point: validate the environment, parse the
    # arguments, then load the selected benchmark's prepare.py and run its
    # PrepareBenchmark class.
    if sys.version_info < (3, 10):
        logger.error('Your python version does not match the requirements.')
        # Exit with a non-zero status so calling scripts can detect the failure.
        sys.exit(1)

    check_program_exists('psql')

    # Named args_to_parse because `parser` is already taken by dateutil.
    # argument_default=SUPPRESS only affects options without an explicit
    # default: those are absent from the namespace unless given.
    args_to_parse = argparse.ArgumentParser(
        argument_default=argparse.SUPPRESS,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    args_to_parse.add_argument('--dsn', required=True, help=(
        'The PostgreSQL DSN to connect to. Supply with DB, e.g. '
        'postgresql://postgres@localhost/dbname'
    ))

    # Every entry of the benchmarks directory is offered as a benchmark.
    benchmarks_dir = 'benchmarks'
    try:
        benchmark_dirs = os.listdir(benchmarks_dir)
    except FileNotFoundError:
        # Keep going so that --help still works; argparse will reject any
        # --benchmark value because there are no valid choices.
        logger.warning(
            'Benchmarks directory "%s" not found; no benchmarks available.',
            benchmarks_dir)
        benchmark_dirs = []

    benchmarks_by_name = {
        directory: Benchmark(
            name=directory,
            base_dir=os.path.join(benchmarks_dir, directory))
        for directory in benchmark_dirs
    }

    args_to_parse.add_argument(
        '--benchmark', required=True, choices=sorted(benchmarks_by_name), help=(
        'Which benchmark to run.'
    ))

    # Whether --scale-factor is mandatory depends on the chosen benchmark,
    # which is only known after parsing; it is validated below.
    args_to_parse.add_argument('--scale-factor', type=int, default=None, help=(
        'Scale factor for specific benchmarks (e.g. TPC-DS/H, SSB).'
    ))

    args_to_parse.add_argument('--chunks', type=int, default=10, help=(
        'How many chunks to generate if a data generator is involved.'
    ))

    args_to_parse.add_argument('--schema', required=True, help=(
        'Which schema of the benchmark to use.'
    ))

    args_to_parse.add_argument('--max-jobs', type=int, default=8, help=(
        'Limit the overall parallelism to this amount of jobs.'
    ))

    args_to_parse.add_argument('--check-diskspace-of-directory', default=None, help=(
        'If flag is present, a disk space check on the passed storage directory will be '
        'performed prior to ingestion.'
    ))

    args_to_parse.add_argument('--data-dir', default=None, help=(
        'The directory holding the data files to ingest from.'
    ))

    args_to_parse.add_argument('--num-partitions', default=None, help=(
        'The number of partitions for partitioned schemas.'
    ))

    args_to_parse.add_argument(
        '--start-date', default='1992-01-01', type=parser.isoparse, help=(
        'The start date for TPC-C.'
    ))

    args_to_parse.add_argument('--umbra', default=False, action='store_true',
        required=False, help=('Run in Umbra compatibility mode')
    )

    args = args_to_parse.parse_args()

    # Decide from the parsed benchmark name rather than from the raw command
    # line, so that a script path, DSN or schema that merely contains one of
    # these substrings does not make --scale-factor mandatory.
    scale_factor_required = any(
        name in args.benchmark for name in ('tpcds', 'tpch', 'ssb', 'htap'))
    if scale_factor_required and args.scale_factor is None:
        # error() prints the usage message and exits with status 2, like
        # argparse does for a missing required argument.
        args_to_parse.error('the following arguments are required: --scale-factor')

    # `choices` guarantees that the name is one of the discovered benchmarks.
    benchmark_to_run = benchmarks_by_name[args.benchmark]

    if args.benchmark == 'ssb':
        # Parameter --check-diskspace-of-directory not supported for ssb yet
        args.check_diskspace_of_directory = ''

    if args.benchmark == 'tpcds':
        check_program_exists('recode')

    # Load <benchmark dir>/prepare.py under a fixed module name and run the
    # PrepareBenchmark class it defines.
    spec = importlib.util.spec_from_file_location(
        's64da_benchmark_toolkit.prepare',
        os.path.join(benchmark_to_run.base_dir, 'prepare.py'))
    prepare = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(prepare)
    prepare.PrepareBenchmark(args, benchmark_to_run).run()