Added perfbd.py and block device performance sampling in bench-runner

Based loosely on Linux's perf tool, perfbd.py uses trace output with
backtraces to aggregate and show the block device usage of all functions
in a program, propagating block device operation cost up the backtrace
for each operation.
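
The propagation itself is simple at heart; a minimal sketch of the idea
(hypothetical names, not perfbd.py's actual structure), where each
operation's cost is charged to every frame in its backtrace:

import collections as co

# charge each op's cost to every caller in its backtrace, so parent
# functions accumulate the cost of the block device operations they
# transitively trigger (a real tool would also dedupe recursive frames)
def propagate(samples):
    # samples: [(backtrace, cost)], backtrace outermost-first
    costs = co.defaultdict(int)
    for backtrace, cost in samples:
        for frame in backtrace:
            costs[frame] += cost
    return costs

print(propagate([
    (['main', 'lfs_file_write', 'lfs_bd_prog'], 512),
    (['main', 'lfs_file_read', 'lfs_bd_read'], 256),
]))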

This, combined with --trace-period and --trace-freq for
sampling/filtering trace events, allows the bench-runner to record the
general cost of block device operations with very little overhead.
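
The underlying sampling idea is just to record a subset of events; a
rough sketch (not the bench-runner's actual implementation) of
period-based filtering:

# only record every `period`th trace event, trading measurement
# resolution for much lower tracing overhead
def sample_by_period(events, period):
    for i, event in enumerate(events):
        if i % period == 0:
            yield event

# with a period of 100, roughly 1 in 100 events survive
assert len(list(sample_by_period(range(1000), 100))) == 10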

Adopted this as the default side effect of make bench, replacing the
cycle-based performance measurements, which are less important for
littlefs.
Christopher Haster
2022-10-13 11:09:26 -05:00
parent 29cbafeb67
commit 3a33c3795b
20 changed files with 2026 additions and 610 deletions


@@ -4,7 +4,7 @@
#
# Example:
# ./scripts/perf.py -R -obench.perf ./runners/bench_runner
# ./scripts/perf.py bench.perf -Flfs.c -Flfs_util.c -Scycles
# ./scripts/perf.py bench.perf -j -Flfs.c -Flfs_util.c -Scycles
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
@@ -16,7 +16,6 @@ import csv
import errno
import fcntl
import functools as ft
import glob
import itertools as it
import math as m
import multiprocessing as mp
@@ -31,7 +30,6 @@ import zipfile
# TODO support non-zip perf results?
PERF_PATHS = ['*.perf']
PERF_TOOL = ['perf']
PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
PERF_FREQ = 100
@@ -147,14 +145,14 @@ class PerfResult(co.namedtuple('PerfResult', [
self.children + other.children)
def openio(path, mode='r'):
def openio(path, mode='r', buffering=-1):
if path == '-':
if mode == 'r':
return os.fdopen(os.dup(sys.stdin.fileno()), 'r')
return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
else:
return os.fdopen(os.dup(sys.stdout.fileno()), 'w')
return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
else:
return open(path, mode)
return open(path, mode, buffering)
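The new buffering argument mostly matters for the '-' case; a small
usage sketch (assuming the openio above), where line buffering keeps
progress output from a long-running job flushed promptly:

# buffering=1 requests line buffering in text mode, so each
# newline-terminated write reaches the terminal immediately
with openio('-', 'w', 1) as f:
    f.write('benchmarking... 50%\n')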
# run perf as a subprocess, storing measurements into a zip file
def record(command, *,
@@ -164,14 +162,6 @@ def record(command, *,
perf_events=PERF_EVENTS,
perf_tool=PERF_TOOL,
**args):
if not command:
print('error: no command specified?')
sys.exit(-1)
if not output:
print('error: no output file specified?')
sys.exit(-1)
# create a temporary file for perf to write to, as far as I can tell
# this is strictly needed because perf's pipe-mode only works with stdout
with tempfile.NamedTemporaryFile('rb') as f:
@@ -214,8 +204,187 @@ def record(command, *,
return err
# try to only process each dso once
#
# note this only caches with the non-keyword arguments
def multiprocessing_cache(f):
local_cache = {}
manager = mp.Manager()
global_cache = manager.dict()
lock = mp.Lock()
def multiprocessing_cache(*args, **kwargs):
# check local cache?
if args in local_cache:
return local_cache[args]
# check global cache?
with lock:
if args in global_cache:
v = global_cache[args]
local_cache[args] = v
return v
# fall back to calling the function
v = f(*args, **kwargs)
global_cache[args] = v
local_cache[args] = v
return v
return multiprocessing_cache
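A usage sketch of the decorator above: the Manager-backed dict shares
results across pool workers while the plain dict skips the IPC round
trip on repeat hits. This relies on fork-style multiprocessing, as used
elsewhere in this script (slow_square is a stand-in for the real
dso-parsing work):

import multiprocessing as mp

@multiprocessing_cache
def slow_square(x):
    print('computing %d^2' % x)  # recomputed only on cache misses
    return x*x

# the decorated closure itself isn't picklable, so hand workers a
# plain module-level function that calls it, much like perf.py
# reaches the cached function through collect_job
def worker(x):
    return slow_square(x)

if __name__ == '__main__':
    with mp.Pool(2) as p:
        # repeated args hit the shared cache instead of recomputing,
        # though racing workers may still compute a value twice since
        # the lock isn't held during the call
        print(p.map(worker, [2, 3, 2, 3, 2]))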
@multiprocessing_cache
def collect_syms_and_lines(obj_path, *,
objdump_tool=None,
**args):
symbol_pattern = re.compile(
'^(?P<addr>[0-9a-fA-F]+)'
'\s+.*'
'\s+(?P<size>[0-9a-fA-F]+)'
'\s+(?P<name>[^\s]+)\s*$')
line_pattern = re.compile(
'^\s+(?:'
# matches dir/file table
'(?P<no>[0-9]+)'
'(?:\s+(?P<dir>[0-9]+))?'
'\s+.*'
'\s+(?P<path>[^\s]+)'
# matches line opcodes
'|' '\[[^\]]*\]\s+'
'(?:'
'(?P<op_special>Special)'
'|' '(?P<op_copy>Copy)'
'|' '(?P<op_end>End of Sequence)'
'|' 'File .*?to (?:entry )?(?P<op_file>\d+)'
'|' 'Line .*?to (?P<op_line>[0-9]+)'
'|' '(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
'|' '.' ')*'
')$', re.IGNORECASE)
# figure out symbol addresses and file+line ranges
syms = {}
sym_at = []
cmd = objdump_tool + ['-t', obj_path]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=sp.PIPE if not args.get('verbose') else None,
universal_newlines=True,
errors='replace',
close_fds=False)
for line in proc.stdout:
m = symbol_pattern.match(line)
if m:
name = m.group('name')
addr = int(m.group('addr'), 16)
size = int(m.group('size'), 16)
# ignore zero-sized symbols
if not size:
continue
# note multiple symbols can share a name
if name not in syms:
syms[name] = set()
syms[name].add((addr, size))
sym_at.append((addr, name, size))
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stdout.write(line)
# assume no debug-info on failure
pass
# sort and keep largest/first when duplicates
sym_at.sort(key=lambda x: (x[0], -x[2], x[1]))
sym_at_ = []
for addr, name, size in sym_at:
if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
sym_at_.append((addr, name, size))
sym_at = sym_at_
# state machine for dwarf line numbers, note that objdump's
# decodedline seems to have issues with multiple dir/file
# tables, which is why we need this
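# a rough example of the state machine (hypothetical opcode text): a
# "Special opcode ... to 0x1060 ... Line ... to 7" both updates the
# op_addr/op_line registers and emits a (file, line, addr) row, while
# "End of Sequence" emits a final row and then resets the registers
# to their defaults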
lines = []
line_at = []
dirs = {}
files = {}
op_file = 1
op_line = 1
op_addr = 0
cmd = objdump_tool + ['--dwarf=rawline', obj_path]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=sp.PIPE if not args.get('verbose') else None,
universal_newlines=True,
errors='replace',
close_fds=False)
for line in proc.stdout:
m = line_pattern.match(line)
if m:
if m.group('no') and not m.group('dir'):
# found a directory entry
dirs[int(m.group('no'))] = m.group('path')
elif m.group('no'):
# found a file entry
dir = int(m.group('dir'))
if dir in dirs:
files[int(m.group('no'))] = os.path.join(
dirs[dir],
m.group('path'))
else:
files[int(m.group('no'))] = m.group('path')
else:
# found a state machine update
if m.group('op_file'):
op_file = int(m.group('op_file'), 0)
if m.group('op_line'):
op_line = int(m.group('op_line'), 0)
if m.group('op_addr'):
op_addr = int(m.group('op_addr'), 0)
if (m.group('op_special')
or m.group('op_copy')
or m.group('op_end')):
file = os.path.abspath(files.get(op_file, '?'))
lines.append((file, op_line, op_addr))
line_at.append((op_addr, file, op_line))
if m.group('op_end'):
op_file = 1
op_line = 1
op_addr = 0
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stdout.write(line)
# assume no debug-info on failure
pass
# sort and keep first when duplicates
lines.sort()
lines_ = []
for file, line, addr in lines:
if len(lines_) == 0 or lines_[-1][0] != file or lines_[-1][1] != line:
lines_.append((file, line, addr))
lines = lines_
# sort and keep first when duplicates
line_at.sort()
line_at_ = []
for addr, file, line in line_at:
if len(line_at_) == 0 or line_at_[-1][0] != addr:
line_at_.append((addr, file, line))
line_at = line_at_
return syms, sym_at, lines, line_at
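The returned line_at list is consumed later with bisect; a minimal
sketch of the addr -> file+line lookup on synthetic data (bisect's key
argument, as used in this script, needs Python >= 3.10):

import bisect

# line_at is sorted by address; the nearest entry at or below the
# lookup address wins
line_at = [(0x1000, 'lfs.c', 10), (0x1040, 'lfs.c', 42)]

def file_line(addr):
    i = bisect.bisect(line_at, addr, key=lambda x: x[0])
    return line_at[i-1][1:] if i > 0 else ('?', 0)

assert file_line(0x1044) == ('lfs.c', 42)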
def collect_decompressed(path, *,
perf_tool=PERF_TOOL,
sources=None,
everything=False,
propagate=0,
depth=1,
@@ -228,7 +397,7 @@ def collect_decompressed(path, *,
'\s+(?P<event>[^:]+):')
frame_pattern = re.compile(
'\s+(?P<addr>\w+)'
'\s+(?P<sym>[^\s]+)'
'\s+(?P<sym>[^\s\+]+)(?:\+(?P<off>\w+))?'
'\s+\((?P<dso>[^\)]+)\)')
events = {
'cycles': 'cycles',
@@ -254,6 +423,9 @@ def collect_decompressed(path, *,
last_event = ''
last_period = 0
last_stack = []
deltas = co.defaultdict(lambda: {})
syms_ = co.defaultdict(lambda: {})
at_cache = {}
results = {}
def commit():
@@ -276,36 +448,117 @@ def collect_decompressed(path, *,
for line in proc.stdout:
# we need to process a lot of data, so wait to use regex as late
# as possible
if not line:
continue
if not line.startswith('\t'):
m = sample_pattern.match(line)
if m:
if last_stack:
commit()
last_event = m.group('event')
last_filtered = last_event in events
last_period = int(m.group('period'), 0)
last_stack = []
if last_filtered:
commit()
last_filtered = False
if line:
m = sample_pattern.match(line)
if m and m.group('event') in events:
last_filtered = True
last_event = m.group('event')
last_period = int(m.group('period'), 0)
last_stack = []
elif last_filtered:
m = frame_pattern.match(line)
if m:
# filter out internal/kernel functions
if not everything and (
m.group('sym').startswith('__')
or m.group('dso').startswith('/usr/lib')
or not m.group('sym')[:1].isalpha()):
or m.group('sym').startswith('0')
or m.group('sym').startswith('-')
or m.group('sym').startswith('[')
or m.group('dso').startswith('/usr/lib')):
continue
last_stack.append((
m.group('dso'),
m.group('sym'),
int(m.group('addr'), 16)))
dso = m.group('dso')
sym = m.group('sym')
off = int(m.group('off'), 0) if m.group('off') else 0
addr_ = int(m.group('addr'), 16)
# get the syms/lines for the dso, this is cached
syms, sym_at, lines, line_at = collect_syms_and_lines(
dso,
**args)
# ASLR is tricky, we have symbols+offsets, but static symbols
# mean we may have multiple candidates for each symbol.
#
# To try to solve this, we use previously seen symbols to build
# confidence for the correct ASLR delta. This means we may
# guess incorrectly for early symbols, but this will only affect
# a few samples.
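# a worked example (hypothetical numbers): if lfs_bd_read sits at
# 0x1040 in the ELF but a frame reports addr 0x555555401044 with
# offset +0x4, the candidate delta is 0x1040 - 0x555555401040; once
# a few symbols agree on one delta, its accumulated error stays
# smallest and it wins the min() below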
if sym in syms:
sym_addr_ = addr_ - off
# track possible deltas?
for sym_addr, size in syms[sym]:
delta = sym_addr - sym_addr_
if delta not in deltas[dso]:
deltas[dso][delta] = sum(
abs(a_+delta - a)
for s, (a_, _) in syms_[dso].items()
for a, _ in syms[s])
for delta in deltas[dso].keys():
deltas[dso][delta] += abs(sym_addr_+delta - sym_addr)
syms_[dso][sym] = sym_addr_, size
# guess the best delta
delta, _ = min(deltas[dso].items(),
key=lambda x: (x[1], x[0]))
addr = addr_ + delta
# cached?
if (dso,addr) in at_cache:
cached = at_cache[(dso,addr)]
if cached is None:
# cache says to skip
continue
file, line = cached
else:
# find file+line
i = bisect.bisect(line_at, addr, key=lambda x: x[0])
if i > 0:
_, file, line = line_at[i-1]
else:
file, line = re.sub('(\.o)?$', '.c', dso, 1), 0
# ignore filtered sources
if sources is not None:
if not any(
os.path.abspath(file) == os.path.abspath(s)
for s in sources):
at_cache[(dso,addr)] = None
continue
else:
# default to only cwd
if not everything and not os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
at_cache[(dso,addr)] = None
continue
# simplify path
if os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
file = os.path.relpath(file)
else:
file = os.path.abspath(file)
at_cache[(dso,addr)] = file, line
else:
file, line = re.sub('(\.o)?$', '.c', dso, 1), 0
last_stack.append((file, sym, line))
# stop propagating?
if propagate and len(last_stack) >= propagate:
commit()
last_filtered = False
if last_stack:
if last_filtered:
commit()
proc.wait()
@@ -341,35 +594,15 @@ def starapply(args):
f, args, kwargs = args
return f(*args, **kwargs)
def collect(paths, *,
def collect(perf_paths, *,
jobs=None,
objdump_tool=None,
sources=None,
everything=False,
**args):
symbol_pattern = re.compile(
'^(?P<addr>[0-9a-fA-F]+)\s.*\s(?P<name>[^\s]+)\s*$')
line_pattern = re.compile(
'^\s+(?:'
# matches dir/file table
'(?P<no>[0-9]+)\s+'
'(?:(?P<dir>[0-9]+)\s+)?'
'.*\s+'
'(?P<path>[^\s]+)'
# matches line opcodes
'|' '\[[^\]]*\]\s+'
'(?:'
'(?P<op_special>Special)'
'|' '(?P<op_copy>Copy)'
'|' '(?P<op_end>End of Sequence)'
'|' 'File .*?to (?:entry )?(?P<op_file>\d+)'
'|' 'Line .*?to (?P<op_line>[0-9]+)'
'|' '(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
'|' '.' ')*'
')$', re.IGNORECASE)
# automatic job detection?
if jobs == 0:
jobs = len(os.sched_getaffinity(0))
records = []
for path in paths:
for path in perf_paths:
# each .perf file is actually a zip file containing perf files from
# multiple runs
with zipfile.ZipFile(path) as z:
@@ -377,225 +610,17 @@ def collect(paths, *,
# we're dealing with a lot of data but also surprisingly
# parallelizable
dsos = {}
results = []
with mp.Pool(jobs or len(os.sched_getaffinity(0))) as p:
for results_ in p.imap_unordered(
starapply,
((collect_job, (path, i), dict(
everything=everything,
**args))
for path, i in records)):
# organize by dso
results__ = {}
for r in results_:
if r.file not in results__:
results__[r.file] = []
results__[r.file].append(r)
results_ = results__
for dso, results_ in results_.items():
if dso not in dsos:
# find file+line ranges for dsos
#
# do this here so we only process each dso once
syms = {}
sym_at = []
cmd = objdump_tool + ['-t', dso]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=sp.PIPE if not args.get('verbose') else None,
universal_newlines=True,
errors='replace',
close_fds=False)
for line in proc.stdout:
m = symbol_pattern.match(line)
if m:
name = m.group('name')
addr = int(m.group('addr'), 16)
# note multiple symbols can share a name
if name not in syms:
syms[name] = set()
syms[name].add(addr)
sym_at.append((addr, name))
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stdout.write(line)
# assume no debug-info on failure
pass
# sort and keep first when duplicates
sym_at.sort()
sym_at_ = []
for addr, name in sym_at:
if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
sym_at_.append((addr, name))
sym_at = sym_at_
# state machine for dwarf line numbers, note that objdump's
# decodedline seems to have issues with multiple dir/file
# tables, which is why we need this
line_at = []
dirs = {}
files = {}
op_file = 1
op_line = 1
op_addr = 0
cmd = objdump_tool + ['--dwarf=rawline', dso]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=sp.PIPE if not args.get('verbose') else None,
universal_newlines=True,
errors='replace',
close_fds=False)
for line in proc.stdout:
m = line_pattern.match(line)
if m:
if m.group('no') and not m.group('dir'):
# found a directory entry
dirs[int(m.group('no'))] = m.group('path')
elif m.group('no'):
# found a file entry
dir = int(m.group('dir'))
if dir in dirs:
files[int(m.group('no'))] = os.path.join(
dirs[dir],
m.group('path'))
else:
files[int(m.group('no'))] = m.group('path')
else:
# found a state machine update
if m.group('op_file'):
op_file = int(m.group('op_file'), 0)
if m.group('op_line'):
op_line = int(m.group('op_line'), 0)
if m.group('op_addr'):
op_addr = int(m.group('op_addr'), 0)
if (m.group('op_special')
or m.group('op_copy')
or m.group('op_end')):
line_at.append((
op_addr,
files.get(op_file, '?'),
op_line))
if m.group('op_end'):
op_file = 1
op_line = 1
op_addr = 0
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stdout.write(line)
# assume no debug-info on failure
pass
# sort and keep first when duplicates
#
# I think dwarf requires this to be sorted but just in case
line_at.sort()
line_at_ = []
for addr, file, line in line_at:
if len(line_at_) == 0 or line_at_[-1][0] != addr:
line_at_.append((addr, file, line))
line_at = line_at_
# discard lines outside of the range of the containing
# function, these are introduced by dwarf for inlined
# functions but don't map to elf-level symbols
sym_at_ = []
for addr, sym in sym_at:
i = bisect.bisect(line_at, addr, key=lambda x: x[0])
if i > 0:
_, file, line = line_at[i-1]
sym_at_.append((file, line, sym))
sym_at_.sort()
line_at_ = []
for addr, file, line in line_at:
# only keep if sym-at-addr and sym-at-line match
i = bisect.bisect(
sym_at, addr, key=lambda x: x[0])
j = bisect.bisect(
sym_at_, (file, line), key=lambda x: (x[0], x[1]))
if i > 0 and j > 0 and (
sym_at[i-1][1] == sym_at_[j-1][2]):
line_at_.append((addr, file, line))
line_at = line_at_
dsos[dso] = (syms, sym_at, line_at)
syms, _, line_at = dsos[dso]
# first try to reverse ASLR
def deltas(r, d):
if '+' in r.function:
sym, off = r.function.split('+', 1)
off = int(off, 0)
else:
sym, off = r.function, 0
addr = r.line - off + d
for addr_ in syms.get(sym, []):
yield addr_ - addr
delta = min(
it.chain.from_iterable(
deltas(r, 0) for r in results_),
key=lambda d: sum(it.chain.from_iterable(
deltas(r, d) for r in results_)),
default=0)
# then try to map addrs -> file+line
#
# note we need to do this recursively
def remap(results):
results_ = []
for r in results:
addr = r.line + delta
i = bisect.bisect(line_at, addr, key=lambda x: x[0])
if i > 0:
_, file, line = line_at[i-1]
else:
file, line = re.sub('(\.o)?$', '.c', r.file, 1), 0
# ignore filtered sources
if sources is not None:
if not any(
os.path.abspath(file) == os.path.abspath(s)
for s in sources):
continue
else:
# default to only cwd
if not everything and not os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
continue
# simplify path
if os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
file = os.path.relpath(file)
else:
file = os.path.abspath(file)
function, *_ = r.function.split('+', 1)
results_.append(r._replace(
file=file, function=function, line=line,
children=remap(r.children)))
return results_
results.extend(remap(results_))
if jobs is not None:
results = []
with mp.Pool(jobs) as p:
for results_ in p.imap_unordered(
starapply,
((collect_job, (path, i), args) for path, i in records)):
results.extend(results_)
else:
results = []
for path, i in records:
results.extend(collect_job(path, i, **args))
return results
@@ -640,7 +665,7 @@ def fold(Result, results, *,
Result, r.children,
by=by,
defines=defines)))
folded = folded_
folded = folded_
return folded
@@ -983,7 +1008,6 @@ def report(perf_paths, *,
fields=None,
defines=None,
sort=None,
self=False,
branches=False,
caches=False,
**args):
@@ -1001,20 +1025,7 @@ def report(perf_paths, *,
# find sizes
if not args.get('use', None):
# find .o files
paths = []
for path in perf_paths:
if os.path.isdir(path):
path = path + '/*.perf'
for path in glob.glob(path):
paths.append(path)
if not paths:
print("error: no .perf files found in %r?" % perf_paths)
sys.exit(-1)
results = collect(paths, **args)
results = collect(perf_paths, **args)
else:
results = []
with openio(args['use']) as f:
@@ -1124,8 +1135,7 @@ if __name__ == "__main__":
parser.add_argument(
'perf_paths',
nargs=nargs,
help="Description of where to find *.perf files. May be a directory "
"or a list of paths. Defaults to %r." % PERF_PATHS)
help="Input *.perf files.")
parser.add_argument(
'-v', '--verbose',
action='store_true',
@@ -1224,7 +1234,7 @@ if __name__ == "__main__":
nargs='?',
type=lambda x: tuple(float(x) for x in x.split(',')),
const=THRESHOLD,
help="Show lines wth samples above this threshold as a percent of "
help="Show lines with samples above this threshold as a percent of "
"all lines. Defaults to %s." % ','.join(str(t) for t in THRESHOLD))
parser.add_argument(
'-c', '--context',
@@ -1295,7 +1305,13 @@ if __name__ == "__main__":
# perf_paths/command overlap, so need to do some munging here
args.command = args.perf_paths
args.perf_paths = args.perf_paths or PERF_PATHS
if args.record:
if not args.command:
print('error: no command specified?')
sys.exit(-1)
if not args.output:
print('error: no output file specified?')
sys.exit(-1)
sys.exit(main(**{k: v
for k, v in vars(args).items()