forked from Imagelibrary/littlefs
Reworked bench.py/bench_runner and how bench measurements are recorded
This is based on how bench.py/bench_runners have actually been used in
practice. The main changes have been to make the output of bench.py more
readily consumable by plot.py/plotmpl.py without needing a bunch of
hacky intermediary scripts.
Now instead of a single per-bench BENCH_START/BENCH_STOP, benches can
have multiple named BENCH_START/BENCH_STOP invocations to measure
multiple things in one run:
BENCH_START("fetch", i, STEP);
lfsr_rbyd_fetch(&lfs, &rbyd_, rbyd.block, CFG->block_size) => 0;
BENCH_STOP("fetch");
Benches can also now report explicit results, for non-io measurements:
BENCH_RESULT("usage", i, STEP, rbyd.eoff);
The extra iter/size parameters to BENCH_START/BENCH_RESULT also allow
some extra information to be calculated post-bench. This information gets
tagged with an extra bench_agg field to help organize results in
plot.py/plotmpl.py (see the sketch after this list):
- bench_meas=<meas>+amor, bench_agg=raw - amortized results
- bench_meas=<meas>+div, bench_agg=raw - per-byte results
- bench_meas=<meas>+avg, bench_agg=avg - average over BENCH_SEED
- bench_meas=<meas>+min, bench_agg=min - minimum over BENCH_SEED
- bench_meas=<meas>+max, bench_agg=max - maximum over BENCH_SEED
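
For example, a downstream script can pick out one measurement at one
aggregation level by filtering on these fields. A minimal sketch, assuming a
bench.csv written by bench.py with the bench_meas/bench_agg/bench_iter/
bench_proged columns described above; the file name, measurement name, and
helper function are only illustrative, not part of plot.py:

    import csv

    # collect (bench_iter, bench_proged) points for one measurement at one
    # aggregation level, e.g. for plotting amortized prog cost
    def collect(path, meas, agg='raw'):
        xs, ys = [], []
        with open(path, newline='') as f:
            for row in csv.DictReader(f):
                if row.get('bench_meas') == meas and row.get('bench_agg') == agg:
                    xs.append(int(row['bench_iter']))
                    ys.append(float(row['bench_proged']))
        return xs, ys

    xs, ys = collect('bench.csv', 'fetch+amor')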
---
Also removed all bench.tomls for now. This may seem counterproductive in
a commit to improve benchmarking, but I'm not sure there's actual value
to keeping bench cases committed in tree.
These were always quick to fall out of date (at the time of this commit
most of the low-level bench.tomls, rbyd, btree, etc, no longer
compiled), and most benchmarks were one-off collections of scripts/data
with results too large/cumbersome to commit and keep updated in tree.
I think the better way to approach benchmarking is a separate repo
(multiple repos?) with all related scripts/state/code and results
committed into a hopefully reproducible snapshot. Keeping the
bench.tomls in that repo makes more sense in this model.
There may be some value to having benchmarks in CI in the future, but
for that to make sense they would need to actually fail on performance
regressions. How to do that isn't so clear. Anyway, we can always address
this in the future rather than now.
This commit is contained in:
  scripts/bench.py (165 changed lines)
@@ -944,6 +944,54 @@ class BenchOutput:
            for row in self.rows:
                self.writer.writerow(row)

    def avg(self):
        # compute min/max/avg
        ops = ['bench_readed', 'bench_proged', 'bench_erased']
        results = co.defaultdict(lambda: {
            'sums': {op: 0 for op in ops},
            'mins': {op: +m.inf for op in ops},
            'maxs': {op: -m.inf for op in ops},
            'count': 0})

        for row in self.rows:
            # we only care about results with a BENCH_SEED entry
            if 'BENCH_SEED' not in row:
                continue

            # figure out a key for each row, this is everything but the bench
            # results/seed reencoded as a big tuple-tuple for hashability
            key = (row['bench_meas'], tuple(sorted(
                (k, v) for k, v in row.items()
                if k != 'BENCH_SEED'
                    and k != 'bench_meas'
                    and k != 'bench_agg'
                    and k not in ops)))
            # find sum/min/max/etc
            result = results[key]
            for op in ops:
                result['sums'][op] += row[op]
                result['mins'][op] = min(result['mins'][op], row[op])
                result['maxs'][op] = max(result['maxs'][op], row[op])
            result['count'] += 1

        # append results to output
        for (meas, key), result in results.items():
            self.writerow({
                'bench_meas': meas+'+avg',
                'bench_agg': 'avg',
                **{k: v for k, v in key},
                **{op: result['sums'][op] / result['count'] for op in ops}})
            self.writerow({
                'bench_meas': meas+'+min',
                'bench_agg': 'bnd',
                **{k: v for k, v in key},
                **{op: result['mins'][op] for op in ops}})
            self.writerow({
                'bench_meas': meas+'+max',
                'bench_agg': 'bnd',
                **{k: v for k, v in key},
                **{op: result['maxs'][op] for op in ops}})

# A bench failure
class BenchFailure(Exception):
    def __init__(self, id, returncode, stdout, assert_=None):
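
As an aside, the grouping trick in avg() above is easier to see on toy data:
rows that differ only in BENCH_SEED collapse into one key built from their
remaining fields, and avg/min/max rows are derived per key. This is a
standalone illustration with made-up rows, not bench.py code:

    import collections as co

    rows = [
        {'case': 'bench_fetch', 'BENCH_SEED': 1, 'bench_readed': 100},
        {'case': 'bench_fetch', 'BENCH_SEED': 2, 'bench_readed': 140},
        {'case': 'bench_fetch', 'BENCH_SEED': 3, 'bench_readed': 120},
    ]

    groups = co.defaultdict(list)
    for row in rows:
        # key on everything except the seed and the result itself
        key = tuple(sorted((k, v) for k, v in row.items()
            if k not in ('BENCH_SEED', 'bench_readed')))
        groups[key].append(row['bench_readed'])

    for key, readeds in groups.items():
        print(dict(key),
            'avg=%.1f' % (sum(readeds)/len(readeds)),
            'min=%d' % min(readeds),
            'max=%d' % max(readeds))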
@@ -952,6 +1000,35 @@ class BenchFailure(Exception):
        self.stdout = stdout
        self.assert_ = assert_

# compute extra result stuff, this includes amortized and per-byte results
def bench_results(results):
    ops = ['readed', 'proged', 'erased']

    # first compute amortized results
    amors = {}
    for meas in set(meas for meas, _ in results.keys()):
        # keep a running sum
        sums = {op: 0 for op in ops}
        size = 0
        for i, (iter, result) in enumerate(sorted(
                (iter, result) for (meas_, iter), result in results.items()
                if meas_ == meas)):
            for op in ops:
                sums[op] += result.get(op, 0)
            size += result.get('size', 1)

            # find amortized results
            amors[meas+'+amor', iter] = {
                'size': result.get('size', 1),
                **{op: sums[op] / (i+1) for op in ops}}
            # also find per-byte results
            amors[meas+'+div', iter] = {
                'size': result.get('size', 1),
                **{op: result.get(op, 0) / size for op in ops}}

    return results | amors


def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):
    # get expected suite/case/perm counts
    (case_suites,
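
As an aside, the +amor and +div math in bench_results() above reduces to a
running sum: amortized results divide the cumulative ops by the number of
iterations so far, and per-byte results divide each iteration's ops by the
cumulative size so far. A toy numeric sketch with made-up numbers:

    # per-iteration results as (iter, size, proged)
    results = [(0, 4096, 100), (1, 4096, 300), (2, 4096, 200)]

    sum_, size = 0, 0
    for i, (iter_, size_, proged) in enumerate(results):
        sum_ += proged
        size += size_
        print('iter=%d proged+amor=%.1f proged+div=%.4f'
            % (iter_, sum_/(i+1), proged/size))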
@@ -970,13 +1047,17 @@ def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):
    killed = False

    pattern = re.compile('^(?:'
        '(?P<op>running|finished|skipped|powerloss)'
        '(?P<op>running|finished|skipped)'
            ' (?P<id>(?P<case>[^:]+)[^\s]*)'
            '(?: (?P<readed>\d+))?'
            '(?: (?P<proged>\d+))?'
            '(?: (?P<erased>\d+))?'
        '|' '(?P<path>[^:]+):(?P<lineno>\d+):(?P<op_>assert):'
            ' *(?P<message>.*)'
        '|' '(?P<op__>benched)'
            ' (?P<meas>[^\s]+)'
            ' (?P<iter>\d+)'
            ' (?P<size>\d+)'
            '(?: (?P<readed>[\d\.]+))?'
            '(?: (?P<proged>[\d\.]+))?'
            '(?: (?P<erased>[\d\.]+))?'
        ')$')
    locals = th.local()
    children = set()
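
As an aside, the new 'benched' alternative in the pattern above corresponds to
runner output lines of the form 'benched <meas> <iter> <size> [readed]
[proged] [erased]', presumably one per named BENCH_STOP/BENCH_RESULT. A
minimal parsing sketch using just that alternative, with a made-up line (this
is a simplified standalone regex, not the full pattern):

    import re

    benched = re.compile(
        r'^benched (?P<meas>[^\s]+) (?P<iter>\d+) (?P<size>\d+)'
        r'(?: (?P<readed>[\d\.]+))?'
        r'(?: (?P<proged>[\d\.]+))?'
        r'(?: (?P<erased>[\d\.]+))?$')

    m = benched.match('benched fetch 3 4096 120 0 0')
    print(m.group('meas'), int(m.group('iter')), int(m.group('size')),
        m.group('readed'), m.group('proged'), m.group('erased'))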
@@ -1004,6 +1085,8 @@ def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):
            last_id = None
            last_stdout = co.deque(maxlen=args.get('context', 5) + 1)
            last_assert = None
            if output_:
                last_results = {}
            try:
                while True:
                    # parse a line for state changes
@@ -1025,35 +1108,39 @@ def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):

                    m = pattern.match(line)
                    if m:
                        op = m.group('op') or m.group('op_')
                        op = m.group('op') or m.group('op_') or m.group('op__')
                        if op == 'running':
                            locals.seen_perms += 1
                            last_id = m.group('id')
                            last_stdout.clear()
                            last_assert = None
                            if output_:
                                last_results = {}
                        elif op == 'finished':
                            case = m.group('case')
                            suite = case_suites[case]
                            readed_ = int(m.group('readed'))
                            proged_ = int(m.group('proged'))
                            erased_ = int(m.group('erased'))
                            passed_suite_perms[suite] += 1
                            passed_case_perms[case] += 1
                            passed_perms += 1
                            readed += readed_
                            proged += proged_
                            erased += erased_
                            if output_:
                                # get defines and write to csv
                                defines = find_defines(
                                    runner, m.group('id'), **args)
                                output_.writerow({
                                    'suite': suite,
                                    'case': case,
                                    'bench_readed': readed_,
                                    'bench_proged': proged_,
                                    'bench_erased': erased_,
                                    **defines})
                                # compute extra measurements here
                                last_results = bench_results(last_results)
                                for (meas, iter), result in (
                                        last_results.items()):
                                    output_.writerow({
                                        'suite': suite,
                                        'case': case,
                                        **defines,
                                        'bench_meas': meas,
                                        'bench_agg': 'raw',
                                        'bench_iter': iter,
                                        'bench_size': result['size'],
                                        'bench_readed': result['readed'],
                                        'bench_proged': result['proged'],
                                        'bench_erased': result['erased']})
                        elif op == 'skipped':
                            locals.seen_perms += 1
                        elif op == 'assert':
@@ -1064,6 +1151,32 @@ def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):
                            # go ahead and kill the process, aborting takes a while
                            if args.get('keep_going'):
                                proc.kill()
                        elif op == 'benched':
                            meas = m.group('meas')
                            iter = int(m.group('iter'))
                            size = int(m.group('size'))
                            result = {'size': size}
                            for op in ['readed', 'proged', 'erased']:
                                if m.group(op) is None:
                                    result[op] = 0
                                elif '.' in m.group(op):
                                    result[op] = float(m.group(op))
                                else:
                                    result[op] = int(m.group(op))
                            # keep track of per-perm results
                            if output_:
                                # if we've already seen this measurement, sum
                                result_ = last_results.get((meas, iter))
                                if result_ is not None:
                                    result['readed'] += result_['readed']
                                    result['proged'] += result_['proged']
                                    result['erased'] += result_['erased']
                                    result['size'] += result_['size']
                                last_results[meas, iter] = result
                            # keep track of total for summary
                            readed += result['readed']
                            proged += result['proged']
                            erased += result['erased']
            except KeyboardInterrupt:
                raise BenchFailure(last_id, 1, list(last_stdout))
            finally:
@@ -1102,17 +1215,6 @@ def run_stage(name, runner, bench_ids, stdout_, trace_, output_, **args):
                start += locals.seen_perms*step

        except BenchFailure as failure:
            # keep track of failures
            if output_:
                case, _ = failure.id.split(':', 1)
                suite = case_suites[case]
                # get defines and write to csv
                defines = find_defines(runner, failure.id, **args)
                output_.writerow({
                    'suite': suite,
                    'case': case,
                    **defines})

            # race condition for multiple failures?
            if failures and not args.get('keep_going'):
                break
@@ -1236,7 +1338,8 @@ def run(runner, bench_ids=[], **args):
    if args.get('output'):
        output = BenchOutput(args['output'],
            ['suite', 'case'],
            ['bench_readed', 'bench_proged', 'bench_erased'])
            ['bench_meas', 'bench_iter', 'bench_size',
                'bench_readed', 'bench_proged', 'bench_erased'])

    # measure runtime
    start = time.time()
@@ -1287,6 +1390,8 @@ def run(runner, bench_ids=[], **args):
    except BrokenPipeError:
        pass
    if output:
        # compute averages
        output.avg()
        output.close()

    # show summary