Files
littlefs/scripts/summary.py
Christopher Haster 616b4e1c9e Tweaked scripts that consume .csv files to filter defines early
With the quantity of data being output by bench.py now, filtering ASAP
while parsing CSV files is a valuable optimization. And thanks to how
CSV files are structured, we can even avoid ever loading the full
contents into RAM.

This does end up with use filtering for defines redundantly in a few
places, but this is well worth the saved overhead from early filtering.

Also tried to clean up the plot.py/plotmpl.py's data folding path,
though that may have been wasted effort.
2023-11-03 14:30:22 -05:00

828 lines
25 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Script to summarize the outputs of other scripts. Operates on CSV files.
#
# Example:
# ./scripts/code.py lfs.o lfs_util.o -q -o lfs.code.csv
# ./scripts/data.py lfs.o lfs_util.o -q -o lfs.data.csv
# ./scripts/summary.py lfs.code.csv lfs.data.csv -q -o lfs.csv
# ./scripts/summary.py -Y lfs.csv -f code=code_size,data=data_size
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
#
import collections as co
import csv
import functools as ft
import itertools as it
import math as m
import os
import re
# supported merge operations
#
# this is a terrible way to express these
#
OPS = {
'sum': lambda xs: sum(xs[1:], start=xs[0]),
'prod': lambda xs: m.prod(xs[1:], start=xs[0]),
'min': min,
'max': max,
'mean': lambda xs: Float(sum(float(x) for x in xs) / len(xs)),
'stddev': lambda xs: (
lambda mean: Float(
m.sqrt(sum((float(x) - mean)**2 for x in xs) / len(xs)))
)(sum(float(x) for x in xs) / len(xs)),
'gmean': lambda xs: Float(m.prod(float(x) for x in xs)**(1/len(xs))),
'gstddev': lambda xs: (
lambda gmean: Float(
m.exp(m.sqrt(sum(m.log(float(x)/gmean)**2 for x in xs) / len(xs)))
if gmean else m.inf)
)(m.prod(float(x) for x in xs)**(1/len(xs))),
}
# integer fields
class Int(co.namedtuple('Int', 'x')):
__slots__ = ()
def __new__(cls, x=0):
if isinstance(x, Int):
return x
if isinstance(x, str):
try:
x = int(x, 0)
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = -m.inf
else:
raise
assert isinstance(x, int) or m.isinf(x), x
return super().__new__(cls, x)
def __str__(self):
if self.x == m.inf:
return ''
elif self.x == -m.inf:
return '-∞'
else:
return str(self.x)
def __int__(self):
assert not m.isinf(self.x)
return self.x
def __float__(self):
return float(self.x)
none = '%7s' % '-'
def table(self):
return '%7s' % (self,)
diff_none = '%7s' % '-'
diff_table = table
def diff_diff(self, other):
new = self.x if self else 0
old = other.x if other else 0
diff = new - old
if diff == +m.inf:
return '%7s' % '+∞'
elif diff == -m.inf:
return '%7s' % '-∞'
else:
return '%+7d' % diff
def ratio(self, other):
new = self.x if self else 0
old = other.x if other else 0
if m.isinf(new) and m.isinf(old):
return 0.0
elif m.isinf(new):
return +m.inf
elif m.isinf(old):
return -m.inf
elif not old and not new:
return 0.0
elif not old:
return 1.0
else:
return (new-old) / old
def __add__(self, other):
return self.__class__(self.x + other.x)
def __sub__(self, other):
return self.__class__(self.x - other.x)
def __mul__(self, other):
return self.__class__(self.x * other.x)
# float fields
class Float(co.namedtuple('Float', 'x')):
__slots__ = ()
def __new__(cls, x=0.0):
if isinstance(x, Float):
return x
if isinstance(x, str):
try:
x = float(x)
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = -m.inf
else:
raise
assert isinstance(x, float), x
return super().__new__(cls, x)
def __str__(self):
if self.x == m.inf:
return ''
elif self.x == -m.inf:
return '-∞'
else:
return '%.1f' % self.x
def __float__(self):
return float(self.x)
none = Int.none
table = Int.table
diff_none = Int.diff_none
diff_table = Int.diff_table
diff_diff = Int.diff_diff
ratio = Int.ratio
__add__ = Int.__add__
__sub__ = Int.__sub__
__mul__ = Int.__mul__
# fractional fields, a/b
class Frac(co.namedtuple('Frac', 'a,b')):
__slots__ = ()
def __new__(cls, a=0, b=None):
if isinstance(a, Frac) and b is None:
return a
if isinstance(a, str) and b is None:
a, b = a.split('/', 1)
if b is None:
b = a
return super().__new__(cls, Int(a), Int(b))
def __str__(self):
return '%s/%s' % (self.a, self.b)
def __float__(self):
return float(self.a)
none = '%11s %7s' % ('-', '-')
def table(self):
t = self.a.x/self.b.x if self.b.x else 1.0
return '%11s %7s' % (
self,
'%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%.1f%%' % (100*t))
diff_none = '%11s' % '-'
def diff_table(self):
return '%11s' % (self,)
def diff_diff(self, other):
new_a, new_b = self if self else (Int(0), Int(0))
old_a, old_b = other if other else (Int(0), Int(0))
return '%11s' % ('%s/%s' % (
new_a.diff_diff(old_a).strip(),
new_b.diff_diff(old_b).strip()))
def ratio(self, other):
new_a, new_b = self if self else (Int(0), Int(0))
old_a, old_b = other if other else (Int(0), Int(0))
new = new_a.x/new_b.x if new_b.x else 1.0
old = old_a.x/old_b.x if old_b.x else 1.0
return new - old
def __add__(self, other):
return self.__class__(self.a + other.a, self.b + other.b)
def __sub__(self, other):
return self.__class__(self.a - other.a, self.b - other.b)
def __mul__(self, other):
return self.__class__(self.a * other.a, self.b + other.b)
def __lt__(self, other):
self_t = self.a.x/self.b.x if self.b.x else 1.0
other_t = other.a.x/other.b.x if other.b.x else 1.0
return (self_t, self.a.x) < (other_t, other.a.x)
def __gt__(self, other):
return self.__class__.__lt__(other, self)
def __le__(self, other):
return not self.__gt__(other)
def __ge__(self, other):
return not self.__lt__(other)
# available types
TYPES = co.OrderedDict([
('int', Int),
('float', Float),
('frac', Frac)
])
def openio(path, mode='r', buffering=-1):
# allow '-' for stdin/stdout
if path == '-':
if 'r' in mode:
return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
else:
return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
else:
return open(path, mode, buffering)
def collect(csv_paths, renames=[], defines=[]):
# collect results from CSV files
results = []
for path in csv_paths:
try:
with openio(path) as f:
reader = csv.DictReader(f, restval='')
for r in reader:
# apply any renames
if renames:
# make a copy so renames can overlap
r_ = {}
for new_k, old_k in renames:
if old_k in r:
r_[new_k] = r[old_k]
r.update(r_)
# filter by matching defines
if not all(k in r and r[k] in vs for k, vs in defines):
continue
results.append(r)
except FileNotFoundError:
pass
return results
def infer(results, *,
by=None,
fields=None,
types={},
ops={},
renames=[],
**_):
# if fields not specified, try to guess from data
if fields is None:
fields = co.OrderedDict()
for r in results:
for k, v in r.items():
if (by is None or k not in by) and v.strip():
types_ = []
for t in fields.get(k, TYPES.values()):
try:
t(v)
types_.append(t)
except ValueError:
pass
fields[k] = types_
fields = list(k for k, v in fields.items() if v)
# deduplicate fields
fields = list(co.OrderedDict.fromkeys(fields).keys())
# if by not specified, guess it's anything not in fields and not a
# source of a rename
if by is None:
by = co.OrderedDict()
for r in results:
# also ignore None keys, these are introduced by csv.DictReader
# when header + row mismatch
by.update((k, True) for k in r.keys()
if k is not None
and k not in fields
and not any(k == old_k for _, old_k in renames))
by = list(by.keys())
# deduplicate fields
by = list(co.OrderedDict.fromkeys(by).keys())
# find best type for all fields
types_ = {}
for k in fields:
if k in types:
types_[k] = types[k]
else:
for t in TYPES.values():
for r in results:
if k in r and r[k].strip():
try:
t(r[k])
except ValueError:
break
else:
types_[k] = t
break
else:
print("error: no type matches field %r?" % k)
sys.exit(-1)
types = types_
# does folding change the type?
types_ = {}
for k, t in types.items():
types_[k] = ops.get(k, OPS['sum'])([t()]).__class__
# create result class
def __new__(cls, **r):
return cls.__mro__[1].__new__(cls,
**{k: r.get(k, '') for k in by},
**{k: r[k] if k in r and isinstance(r[k], list)
else [types[k](r[k])] if k in r
else []
for k in fields})
def __add__(self, other):
return self.__class__(
**{k: getattr(self, k) for k in by},
**{k: object.__getattribute__(self, k)
+ object.__getattribute__(other, k)
for k in fields})
def __getattribute__(self, k):
if k in fields:
if object.__getattribute__(self, k):
return ops.get(k, OPS['sum'])(object.__getattribute__(self, k))
else:
return None
return object.__getattribute__(self, k)
return type('Result', (co.namedtuple('Result', by + fields),), {
'__slots__': (),
'__new__': __new__,
'__add__': __add__,
'__getattribute__': __getattribute__,
'_by': by,
'_fields': fields,
'_sort': fields,
'_types': types_,
})
def fold(Result, results, *,
by=None,
defines=[],
**_):
if by is None:
by = Result._by
for k in it.chain(by or [], (k for k, _ in defines)):
if k not in Result._by and k not in Result._fields:
print("error: could not find field %r?" % k)
sys.exit(-1)
# filter by matching defines
if defines:
results_ = []
for r in results:
if all(getattr(r, k) in vs for k, vs in defines):
results_.append(r)
results = results_
# organize results into conflicts
folding = co.OrderedDict()
for r in results:
name = tuple(getattr(r, k) for k in by)
if name not in folding:
folding[name] = []
folding[name].append(r)
# merge conflicts
folded = []
for name, rs in folding.items():
folded.append(sum(rs[1:], start=rs[0]))
return folded
def table(Result, results, diff_results=None, *,
by=None,
fields=None,
sort=None,
summary=False,
all=False,
percent=False,
**_):
all_, all = all, __builtins__.all
if by is None:
by = Result._by
if fields is None:
fields = Result._fields
types = Result._types
# fold again
results = fold(Result, results, by=by)
if diff_results is not None:
diff_results = fold(Result, diff_results, by=by)
# organize by name
table = {
','.join(str(getattr(r, k) or '') for k in by): r
for r in results}
diff_table = {
','.join(str(getattr(r, k) or '') for k in by): r
for r in diff_results or []}
names = list(table.keys() | diff_table.keys())
# sort again, now with diff info, note that python's sort is stable
names.sort()
if diff_results is not None:
names.sort(key=lambda n: tuple(
types[k].ratio(
getattr(table.get(n), k, None),
getattr(diff_table.get(n), k, None))
for k in fields),
reverse=True)
if sort:
for k, reverse in reversed(sort):
names.sort(
key=lambda n: tuple(
(getattr(table[n], k),)
if getattr(table.get(n), k, None) is not None else ()
for k in ([k] if k else [
k for k in Result._sort if k in fields])),
reverse=reverse ^ (not k or k in Result._fields))
# build up our lines
lines = []
# header
header = []
header.append('%s%s' % (
','.join(by),
' (%d added, %d removed)' % (
sum(1 for n in table if n not in diff_table),
sum(1 for n in diff_table if n not in table))
if diff_results is not None and not percent else '')
if not summary else '')
if diff_results is None:
for k in fields:
header.append(k)
elif percent:
for k in fields:
header.append(k)
else:
for k in fields:
header.append('o'+k)
for k in fields:
header.append('n'+k)
for k in fields:
header.append('d'+k)
header.append('')
lines.append(header)
def table_entry(name, r, diff_r=None, ratios=[]):
entry = []
entry.append(name)
if diff_results is None:
for k in fields:
entry.append(getattr(r, k).table()
if getattr(r, k, None) is not None
else types[k].none)
elif percent:
for k in fields:
entry.append(getattr(r, k).diff_table()
if getattr(r, k, None) is not None
else types[k].diff_none)
else:
for k in fields:
entry.append(getattr(diff_r, k).diff_table()
if getattr(diff_r, k, None) is not None
else types[k].diff_none)
for k in fields:
entry.append(getattr(r, k).diff_table()
if getattr(r, k, None) is not None
else types[k].diff_none)
for k in fields:
entry.append(types[k].diff_diff(
getattr(r, k, None),
getattr(diff_r, k, None)))
if diff_results is None:
entry.append('')
elif percent:
entry.append(' (%s)' % ', '.join(
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios))
else:
entry.append(' (%s)' % ', '.join(
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios
if t)
if any(ratios) else '')
return entry
# entries
if not summary:
for name in names:
r = table.get(name)
if diff_results is None:
diff_r = None
ratios = None
else:
diff_r = diff_table.get(name)
ratios = [
types[k].ratio(
getattr(r, k, None),
getattr(diff_r, k, None))
for k in fields]
if not all_ and not any(ratios):
continue
lines.append(table_entry(name, r, diff_r, ratios))
# total
r = next(iter(fold(Result, results, by=[])), None)
if diff_results is None:
diff_r = None
ratios = None
else:
diff_r = next(iter(fold(Result, diff_results, by=[])), None)
ratios = [
types[k].ratio(
getattr(r, k, None),
getattr(diff_r, k, None))
for k in fields]
lines.append(table_entry('TOTAL', r, diff_r, ratios))
# find the best widths, note that column 0 contains the names and column -1
# the ratios, so those are handled a bit differently
widths = [
((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
for w, i in zip(
it.chain([23], it.repeat(7)),
range(len(lines[0])-1))]
# print our table
for line in lines:
print('%-*s %s%s' % (
widths[0], line[0],
' '.join('%*s' % (w, x)
for w, x in zip(widths[1:], line[1:-1])),
line[-1]))
def main(csv_paths, *,
by=None,
fields=None,
defines=[],
sort=None,
**args):
# separate out renames
renames = list(it.chain.from_iterable(
((k, v) for v in vs)
for k, vs in it.chain(by or [], fields or [])))
if by is not None:
by = [k for k, _ in by]
if fields is not None:
fields = [k for k, _ in fields]
# figure out types
types = {}
for t in TYPES.keys():
for k in args.get(t, []):
if k in types:
print("error: conflicting type for field %r?" % k)
sys.exit(-1)
types[k] = TYPES[t]
# rename types?
if renames:
types_ = {}
for new_k, old_k in renames:
if old_k in types:
types_[new_k] = types[old_k]
types.update(types_)
# figure out merge operations
ops = {}
for o in OPS.keys():
for k in args.get(o, []):
if k in ops:
print("error: conflicting op for field %r?" % k)
sys.exit(-1)
ops[k] = OPS[o]
# rename ops?
if renames:
ops_ = {}
for new_k, old_k in renames:
if old_k in ops:
ops_[new_k] = ops[old_k]
ops.update(ops_)
# find CSV files
results = collect(csv_paths, renames=renames, defines=defines)
# homogenize
Result = infer(results,
by=by,
fields=fields,
types=types,
ops=ops,
renames=renames)
results_ = []
for r in results:
if not any(k in r and r[k].strip()
for k in Result._fields):
continue
try:
results_.append(Result(**{
k: r[k] for k in Result._by + Result._fields
if k in r and r[k].strip()}))
except TypeError:
pass
results = results_
# fold
results = fold(Result, results, by=by, defines=defines)
# sort, note that python's sort is stable
results.sort()
if sort:
for k, reverse in reversed(sort):
results.sort(
key=lambda r: tuple(
(getattr(r, k),) if getattr(r, k) is not None else ()
for k in ([k] if k else Result._sort)),
reverse=reverse ^ (not k or k in Result._fields))
# write results to CSV
if args.get('output'):
with openio(args['output'], 'w') as f:
writer = csv.DictWriter(f, Result._by + Result._fields)
writer.writeheader()
for r in results:
# note we need to go through getattr to resolve lazy fields
writer.writerow({
k: getattr(r, k) for k in Result._by + Result._fields})
# find previous results?
if args.get('diff'):
diff_results = collect([args['diff']], renames=renames, defines=defines)
diff_results_ = []
for r in diff_results:
if not any(k in r and r[k].strip()
for k in Result._fields):
continue
try:
diff_results_.append(Result(**{
k: r[k] for k in Result._by + Result._fields
if k in r and r[k].strip()}))
except TypeError:
pass
diff_results = diff_results_
# fold
diff_results = fold(Result, diff_results, by=by, defines=defines)
# print table
if not args.get('quiet'):
table(Result, results,
diff_results if args.get('diff') else None,
by=by,
fields=fields,
sort=sort,
**args)
if __name__ == "__main__":
import argparse
import sys
parser = argparse.ArgumentParser(
description="Summarize measurements in CSV files.",
allow_abbrev=False)
parser.add_argument(
'csv_paths',
nargs='*',
help="Input *.csv files.")
parser.add_argument(
'-q', '--quiet',
action='store_true',
help="Don't show anything, useful with -o.")
parser.add_argument(
'-o', '--output',
help="Specify CSV file to store results.")
parser.add_argument(
'-d', '--diff',
help="Specify CSV file to diff against.")
parser.add_argument(
'-a', '--all',
action='store_true',
help="Show all, not just the ones that changed.")
parser.add_argument(
'-p', '--percent',
action='store_true',
help="Only show percentage change, not a full diff.")
parser.add_argument(
'-b', '--by',
action='append',
type=lambda x: (
lambda k,v=None: (k, v.split(',') if v is not None else ())
)(*x.split('=', 1)),
help="Group by this field. Can rename fields with new_name=old_name.")
parser.add_argument(
'-f', '--field',
dest='fields',
action='append',
type=lambda x: (
lambda k,v=None: (k, v.split(',') if v is not None else ())
)(*x.split('=', 1)),
help="Show this field. Can rename fields with new_name=old_name.")
parser.add_argument(
'-D', '--define',
dest='defines',
action='append',
type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
help="Only include results where this field is this value. May include "
"comma-separated options.")
class AppendSort(argparse.Action):
def __call__(self, parser, namespace, value, option):
if namespace.sort is None:
namespace.sort = []
namespace.sort.append((value, True if option == '-S' else False))
parser.add_argument(
'-s', '--sort',
nargs='?',
action=AppendSort,
help="Sort by this field.")
parser.add_argument(
'-S', '--reverse-sort',
nargs='?',
action=AppendSort,
help="Sort by this field, but backwards.")
parser.add_argument(
'-Y', '--summary',
action='store_true',
help="Only show the total.")
parser.add_argument(
'--int',
action='append',
help="Treat these fields as ints.")
parser.add_argument(
'--float',
action='append',
help="Treat these fields as floats.")
parser.add_argument(
'--frac',
action='append',
help="Treat these fields as fractions.")
parser.add_argument(
'--sum',
action='append',
help="Add these fields (the default).")
parser.add_argument(
'--prod',
action='append',
help="Multiply these fields.")
parser.add_argument(
'--min',
action='append',
help="Take the minimum of these fields.")
parser.add_argument(
'--max',
action='append',
help="Take the maximum of these fields.")
parser.add_argument(
'--mean',
action='append',
help="Average these fields.")
parser.add_argument(
'--stddev',
action='append',
help="Find the standard deviation of these fields.")
parser.add_argument(
'--gmean',
action='append',
help="Find the geometric mean of these fields.")
parser.add_argument(
'--gstddev',
action='append',
help="Find the geometric standard deviation of these fields.")
sys.exit(main(**{k: v
for k, v in vars(parser.parse_intermixed_args()).items()
if v is not None}))