From 55fe177da8b4aaf23d9d27c7fbbc579404232855 Mon Sep 17 00:00:00 2001 From: Kefu Chai Date: Fri, 2 Aug 2024 11:35:43 +0800 Subject: [PATCH] stall-analyser: add output support for flamegraph a typical session is like: ```console $ ./stall-analyser.py --format trace -e scylla/libexec/scylla \ reactor_stalled_231d31df_2022_2_twcs_3h.log > out.folded $ flamegraph.pl out.folded > /tmp/folded.svg $ xdg-open /tmp/folded.svg ``` Signed-off-by: Kefu Chai --- scripts/stall-analyser.py | 76 ++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/scripts/stall-analyser.py b/scripts/stall-analyser.py index b003c87200b..70d31e83c2c 100755 --- a/scripts/stall-analyser.py +++ b/scripts/stall-analyser.py @@ -5,8 +5,9 @@ import re import addr2line -from itertools import dropwhile -from typing import Self +from collections import defaultdict +from itertools import chain, dropwhile +from typing import Iterator, Self def get_command_line_parser(): @@ -46,6 +47,8 @@ def get_command_line_parser(): help='Process only stalls lasting the given time, in milliseconds, or longer') parser.add_argument('-b', '--branch-threshold', type=float, default=0.03, help='Drop branches responsible for less than this threshold relative to the previous level, not global. (default 3%%)') + parser.add_argument('--format', choices=['graph', 'trace'], default='graph', + help='The output format, default is %(default)s. `trace` is suitable as input for flamegraph.pl') parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin, @@ -280,6 +283,58 @@ def _recursive_print_graph(n: Node, total: int = 0, count: int = 0, level: int = _recursive_print_graph(r) +class StackCollapse: + # collapse stall backtraces into single lines + def __init__(self, resolver: addr2line.BacktraceResolver) -> None: + self.resolver = resolver + # track every stack and the its total sample counts + self.collapsed = defaultdict(int) + # to match lines like + # (inlined by) position_in_partition::tri_compare::operator() at ././position_in_partition.hh:485 + self.pattern = re.compile(r'''(.+)\s # function signature + at\s # + [^:]+:(?:\?|\d+) # :''', re.X) + + def process_trace(self, frames: list[str], count: int) -> None: + # each stall report is mapped to a line of perf samples + # so the output looks like: + # row_cache::update;row_cache::do_update;row_cache::upgrade_entry 42 + # from outer-most caller to the inner-most callee, and 42 is the time + # in ms, but we use it for the count of samples. + self.collapsed[';'.join(frames)] += count + + def _annotate_func(self, line: str) -> str: + # sample input: + # (inlined by) position_in_partition::tri_compare::operator() at ././position_in_partition.hh:485 + # sample output: + # position_in_partition::tri_compare::operator()_[i] + if line.startswith("??"): + return "" + + inlined_prefix = ' (inlined by) ' + inlined = line.startswith(inlined_prefix) + if inlined: + line = line[len(inlined_prefix):] + + matched = self.pattern.match(line) + assert matched, f"bad line: {line}" + func = matched.groups()[0] + # annotations + if inlined: + func += "_[i]" + return func + + def _resolve(self, addr: str) -> Iterator[str]: + lines = self.resolver.resolve_address(addr).splitlines() + return (self._annotate_func(line) for line in lines) + + def print_graph(self, *_) -> None: + for stack, count in self.collapsed.items(): + frames = filter(lambda frame: frame, + chain.from_iterable(self._resolve(addr) for addr in stack.split(';'))) + print(';'.join(reversed(list(frames))), count) + + def print_stats(tally: dict, tmin: int) -> None: data = [] total_time = 0 @@ -346,7 +401,11 @@ def main(): if args.executable: resolver = addr2line.BacktraceResolver(executable=args.executable, concise=not args.full_function_names) - graph = Graph(resolver) + if args.format == 'graph': + render = Graph(resolver) + else: + render = StackCollapse(resolver) + for s in args.file: if comment.search(s): continue @@ -375,15 +434,16 @@ def main(): if address_threshold: trace = list(dropwhile(lambda addr: int(addr, 0) >= address_threshold, trace)) if t >= args.tmin: - graph.process_trace(trace, t) + render.process_trace(trace, t) try: - if not graph: + if not render: print("No input data found. Please run `stall-analyser.py --help` for usage instruction") sys.exit() - print_command_line_options(args) - print_stats(tally, args.tmin) - graph.print_graph(args.direction, args.width, args.branch_threshold) + if args.format == 'graph': + print_command_line_options(args) + print_stats(tally, tmin) + render.print_graph(args.direction, args.width, args.branch_threshold) except BrokenPipeError: pass