Files
littlefs/scripts/summary.py
Christopher Haster 37f738cc71 Changed RFrac equality in scripts
Now, fractions are considered equal if they have the same ratio:

- 6/6 == 12/12 => True
- 3/6 == 3/12  => False
- 1/6 == 2/12  => True

It's interesting to note this implementation is actually more
numerically stable than float comparison, though that wasn't really the
goal.

The main reason for this is to allow other fields to take over when
sorting multi-field fractional data: cov (lines + branches), testmarks
(passed + time), etc. Before, sorting would usually stop after
mismatched fraction fields, which wasn't all that useful.
2024-05-28 15:19:11 -05:00

852 lines
25 KiB
Python
Executable File

#!/usr/bin/env python3
#
# Script to summarize the outputs of other scripts. Operates on CSV files.
#
# Example:
# ./scripts/code.py lfs.o lfs_util.o -q -o lfs.code.csv
# ./scripts/data.py lfs.o lfs_util.o -q -o lfs.data.csv
# ./scripts/summary.py lfs.code.csv lfs.data.csv -q -o lfs.csv
# ./scripts/summary.py -Y lfs.csv -f code=code_size,data=data_size
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
#
import collections as co
import csv
import functools as ft
import itertools as it
import math as m
import os
import re
# supported merge operations
#
# this is a terrible way to express these
#
OPS = {
'sum': lambda xs: sum(xs[1:], start=xs[0]),
'prod': lambda xs: m.prod(xs[1:], start=xs[0]),
'min': min,
'max': max,
'avg': lambda xs: RFloat(sum(float(x) for x in xs) / len(xs)),
'stddev': lambda xs: (
lambda avg: RFloat(
m.sqrt(sum((float(x) - avg)**2 for x in xs) / len(xs)))
)(sum(float(x) for x in xs) / len(xs)),
'gmean': lambda xs: RFloat(m.prod(float(x) for x in xs)**(1/len(xs))),
'gstddev': lambda xs: (
lambda gmean: RFloat(
m.exp(m.sqrt(sum(m.log(float(x)/gmean)**2 for x in xs) / len(xs)))
if gmean else m.inf)
)(m.prod(float(x) for x in xs)**(1/len(xs))),
}
# integer fields
class RInt(co.namedtuple('RInt', 'x')):
__slots__ = ()
def __new__(cls, x=0):
if isinstance(x, RInt):
return x
if isinstance(x, str):
try:
x = int(x, 0)
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = -m.inf
else:
raise
assert isinstance(x, int) or m.isinf(x), x
return super().__new__(cls, x)
def __str__(self):
if self.x == m.inf:
return ''
elif self.x == -m.inf:
return '-∞'
else:
return str(self.x)
def __int__(self):
assert not m.isinf(self.x)
return self.x
def __float__(self):
return float(self.x)
none = '%7s' % '-'
def table(self):
return '%7s' % (self,)
def diff(self, other):
new = self.x if self else 0
old = other.x if other else 0
diff = new - old
if diff == +m.inf:
return '%7s' % '+∞'
elif diff == -m.inf:
return '%7s' % '-∞'
else:
return '%+7d' % diff
def ratio(self, other):
new = self.x if self else 0
old = other.x if other else 0
if m.isinf(new) and m.isinf(old):
return 0.0
elif m.isinf(new):
return +m.inf
elif m.isinf(old):
return -m.inf
elif not old and not new:
return 0.0
elif not old:
return 1.0
else:
return (new-old) / old
def __add__(self, other):
return self.__class__(self.x + other.x)
def __sub__(self, other):
return self.__class__(self.x - other.x)
def __mul__(self, other):
return self.__class__(self.x * other.x)
# float fields
class RFloat(co.namedtuple('RFloat', 'x')):
__slots__ = ()
def __new__(cls, x=0.0):
if isinstance(x, RFloat):
return x
if isinstance(x, str):
try:
x = float(x)
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = -m.inf
else:
raise
assert isinstance(x, float), x
return super().__new__(cls, x)
def __str__(self):
if self.x == m.inf:
return ''
elif self.x == -m.inf:
return '-∞'
else:
return '%.1f' % self.x
def __float__(self):
return float(self.x)
none = RInt.none
table = RInt.table
def diff(self, other):
new = self.x if self else 0
old = other.x if other else 0
diff = new - old
if diff == +m.inf:
return '%7s' % '+∞'
elif diff == -m.inf:
return '%7s' % '-∞'
else:
return '%+7.1f' % diff
ratio = RInt.ratio
__add__ = RInt.__add__
__sub__ = RInt.__sub__
__mul__ = RInt.__mul__
# fractional fields, a/b
class RFrac(co.namedtuple('RFrac', 'a,b')):
__slots__ = ()
def __new__(cls, a=0, b=None):
if isinstance(a, RFrac) and b is None:
return a
if isinstance(a, str) and b is None:
a, b = a.split('/', 1)
if b is None:
b = a
return super().__new__(cls, RInt(a), RInt(b))
def __str__(self):
return '%s/%s' % (self.a, self.b)
def __float__(self):
return float(self.a)
none = '%11s' % '-'
def table(self):
return '%11s' % (self,)
def notes(self):
t = self.a.x/self.b.x if self.b.x else 1.0
return ['%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%.1f%%' % (100*t)]
def diff(self, other):
new_a, new_b = self if self else (RInt(0), RInt(0))
old_a, old_b = other if other else (RInt(0), RInt(0))
return '%11s' % ('%s/%s' % (
new_a.diff(old_a).strip(),
new_b.diff(old_b).strip()))
def ratio(self, other):
new_a, new_b = self if self else (RInt(0), RInt(0))
old_a, old_b = other if other else (RInt(0), RInt(0))
new = new_a.x/new_b.x if new_b.x else 1.0
old = old_a.x/old_b.x if old_b.x else 1.0
return new - old
def __add__(self, other):
return self.__class__(self.a + other.a, self.b + other.b)
def __sub__(self, other):
return self.__class__(self.a - other.a, self.b - other.b)
def __mul__(self, other):
return self.__class__(self.a * other.a, self.b + other.b)
def __eq__(self, other):
self_a, self_b = self if self.b.x else (RInt(1), RInt(1))
other_a, other_b = other if other.b.x else (RInt(1), RInt(1))
return self_a * other_b == other_a * self_b
def __ne__(self, other):
return not self.__eq__(other)
def __lt__(self, other):
self_a, self_b = self if self.b.x else (RInt(1), RInt(1))
other_a, other_b = other if other.b.x else (RInt(1), RInt(1))
return self_a * other_b < other_a * self_b
def __gt__(self, other):
return self.__class__.__lt__(other, self)
def __le__(self, other):
return not self.__gt__(other)
def __ge__(self, other):
return not self.__lt__(other)
# available types
TYPES = co.OrderedDict([
('int', RInt),
('float', RFloat),
('frac', RFrac)
])
def openio(path, mode='r', buffering=-1):
# allow '-' for stdin/stdout
if path == '-':
if 'r' in mode:
return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
else:
return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
else:
return open(path, mode, buffering)
def collect(csv_paths, renames=[], defines=[]):
# collect results from CSV files
fields = []
results = []
for path in csv_paths:
try:
with openio(path) as f:
reader = csv.DictReader(f, restval='')
fields.extend(
k for k in reader.fieldnames
if k not in fields)
for r in reader:
# apply any renames
if renames:
# make a copy so renames can overlap
r_ = {}
for new_k, old_k in renames:
if old_k in r:
r_[new_k] = r[old_k]
r.update(r_)
# filter by matching defines
if not all(k in r and r[k] in vs for k, vs in defines):
continue
results.append(r)
except FileNotFoundError:
pass
return fields, results
def infer(fields_, results,
by=None,
fields=None,
types={},
ops={},
renames=[],
defines=[]):
# if by not specified, guess it's anything not in fields/renames/defines
if by is None:
by = [
k for k in fields_
if k not in (fields or [])
and not any(k == old_k for _, old_k in renames)
and not any(k == k_ for k_, _ in defines)]
# if fields not specified, guess it's anything not in by/renames/defines
if fields is None:
fields = [
k for k in fields_
if k not in (by or [])
and not any(k == old_k for _, old_k in renames)
and not any(k == k_ for k_, _ in defines)]
# deduplicate by/fields
by = list(co.OrderedDict.fromkeys(by).keys())
fields = list(co.OrderedDict.fromkeys(fields).keys())
# find best type for all fields
types_ = {}
for k in fields:
if k in types:
types_[k] = types[k]
else:
for t in TYPES.values():
for r in results:
if k in r and r[k].strip():
try:
t(r[k])
except ValueError:
break
else:
types_[k] = t
break
else:
print("error: no type matches field %r?" % k,
file=sys.stderr)
sys.exit(-1)
types = types_
# does folding change the type?
types_ = {}
for k, t in types.items():
types_[k] = ops.get(k, OPS['sum'])([t()]).__class__
# create result class
def __new__(cls, **r):
return cls.__mro__[1].__new__(cls,
**{k: r.get(k, '') for k in by},
**{k: r[k] if k in r and isinstance(r[k], tuple)
else ([types[k](r[k])], 1) if k in r
else ([], 0)
for k in fields})
def __add__(self, other):
# reuse lists if possible
def extend(a, b):
if len(a[0]) == a[1]:
a[0].extend(b[0][:b[1]])
return (a[0], a[1] + b[1])
else:
return (a[0][:a[1]] + b[0][:b[1]], a[1] + b[1])
return self.__class__(
**{k: getattr(self, k) for k in by},
**{k: extend(
object.__getattribute__(self, k),
object.__getattribute__(other, k))
for k in fields})
def __getattribute__(self, k):
if k in fields:
v = object.__getattribute__(self, k)
if v[1]:
return ops.get(k, OPS['sum'])(v[0][:v[1]])
else:
return None
return object.__getattribute__(self, k)
return type('Result', (co.namedtuple('Result', by + fields),), {
'__slots__': (),
'__new__': __new__,
'__add__': __add__,
'__getattribute__': __getattribute__,
'_by': by,
'_fields': fields,
'_sort': fields,
'_types': types_,
})
def fold(Result, results, by=None, defines=[]):
if by is None:
by = Result._by
for k in it.chain(by or [], (k for k, _ in defines)):
if k not in Result._by and k not in Result._fields:
print("error: could not find field %r?" % k,
file=sys.stderr)
sys.exit(-1)
# filter by matching defines
if defines:
results_ = []
for r in results:
if all(getattr(r, k) in vs for k, vs in defines):
results_.append(r)
results = results_
# organize results into conflicts
folding = co.OrderedDict()
for r in results:
name = tuple(getattr(r, k) for k in by)
if name not in folding:
folding[name] = []
folding[name].append(r)
# merge conflicts
folded = []
for name, rs in folding.items():
folded.append(sum(rs[1:], start=rs[0]))
return folded
def table(Result, results, diff_results=None, *,
by=None,
fields=None,
sort=None,
summary=False,
all=False,
percent=False,
**_):
all_, all = all, __builtins__.all
if by is None:
by = Result._by
if fields is None:
fields = Result._fields
types = Result._types
# fold again
results = fold(Result, results, by=by)
if diff_results is not None:
diff_results = fold(Result, diff_results, by=by)
# organize by name
table = {
','.join(str(getattr(r, k) or '') for k in by): r
for r in results}
diff_table = {
','.join(str(getattr(r, k) or '') for k in by): r
for r in diff_results or []}
names = list(table.keys() | diff_table.keys())
# sort again, now with diff info, note that python's sort is stable
names.sort()
if diff_results is not None:
names.sort(key=lambda n: tuple(
types[k].ratio(
getattr(table.get(n), k, None),
getattr(diff_table.get(n), k, None))
for k in fields),
reverse=True)
if sort:
for k, reverse in reversed(sort):
names.sort(
key=lambda n: tuple(
(getattr(table[n], k),)
if getattr(table.get(n), k, None) is not None else ()
for k in ([k] if k else [
k for k in Result._sort if k in fields])),
reverse=reverse ^ (not k or k in Result._fields))
# build up our lines
lines = []
# header
header = []
header.append('%s%s' % (
','.join(by),
' (%d added, %d removed)' % (
sum(1 for n in table if n not in diff_table),
sum(1 for n in diff_table if n not in table))
if diff_results is not None and not percent else '')
if not summary else '')
if diff_results is None:
for k in fields:
header.append(k)
elif percent:
for k in fields:
header.append(k)
else:
for k in fields:
header.append('o'+k)
for k in fields:
header.append('n'+k)
for k in fields:
header.append('d'+k)
header.append('')
lines.append(header)
def table_entry(name, r, diff_r=None):
entry = []
entry.append(name)
if diff_results is None:
for k in fields:
entry.append(getattr(r, k).table()
if getattr(r, k, None) is not None
else types[k].none)
elif percent:
for k in fields:
entry.append(getattr(r, k).table()
if getattr(r, k, None) is not None
else types[k].none)
else:
for k in fields:
entry.append(getattr(diff_r, k).table()
if getattr(diff_r, k, None) is not None
else types[k].none)
for k in fields:
entry.append(getattr(r, k).table()
if getattr(r, k, None) is not None
else types[k].none)
for k in fields:
entry.append(types[k].diff(
getattr(r, k, None),
getattr(diff_r, k, None)))
if diff_results is None:
notes = []
for k in fields:
try:
notes.extend(getattr(r, k).notes())
except AttributeError:
pass
entry.append(' (%s)' % ', '.join(notes)
if notes else '')
else:
ratios = [
types[k].ratio(
getattr(r, k, None),
getattr(diff_r, k, None))
for k in fields]
entry.append(' (%s)' % ', '.join(
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios)
if percent or any(ratios) else '')
return entry
# entries
if not summary:
for name in names:
r = table.get(name)
if diff_results is None:
diff_r = None
else:
diff_r = diff_table.get(name)
if not all_ and not any(
types[k].ratio(
getattr(r, k, None),
getattr(diff_r, k, None))
for k in fields):
continue
lines.append(table_entry(name, r, diff_r))
# total
r = next(iter(fold(Result, results, by=[])), None)
if diff_results is None:
diff_r = None
else:
diff_r = next(iter(fold(Result, diff_results, by=[])), None)
lines.append(table_entry('TOTAL', r, diff_r))
# find the best widths, note that column 0 contains the names and
# column -1 the ratios/notes, so those are handled a bit differently
widths = [
((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
for w, i in zip(
it.chain([23], it.repeat(7)),
range(len(lines[0])-1))]
# print our table
for line in lines:
print('%-*s %s%s' % (
widths[0], line[0],
' '.join('%*s' % (w, x)
for w, x in zip(widths[1:], line[1:-1])),
line[-1]))
def main(csv_paths, *,
by=None,
fields=None,
defines=[],
sort=None,
**args):
# separate out renames
renames = list(it.chain.from_iterable(
((k, v) for v in vs)
for k, vs in it.chain(by or [], fields or [])))
if by is not None:
by = [k for k, _ in by]
if fields is not None:
fields = [k for k, _ in fields]
# figure out types
types = {}
for t in TYPES.keys():
for k in args.get(t, []):
if k in types:
print("error: conflicting type for field %r?" % k,
file=sys.stderr)
sys.exit(-1)
types[k] = TYPES[t]
# rename types?
if renames:
types_ = {}
for new_k, old_k in renames:
if old_k in types:
types_[new_k] = types[old_k]
types.update(types_)
# figure out merge operations
ops = {}
for o in OPS.keys():
for k in args.get(o, []):
if k in ops:
print("error: conflicting op for field %r?" % k,
file=sys.stderr)
sys.exit(-1)
ops[k] = OPS[o]
# rename ops?
if renames:
ops_ = {}
for new_k, old_k in renames:
if old_k in ops:
ops_[new_k] = ops[old_k]
ops.update(ops_)
if by is None and fields is None:
print("error: needs --by or --fields to figure out fields",
file=sys.stderr)
sys.exit(-1)
# find CSV files
fields_, results = collect(csv_paths, renames, defines)
# homogenize
Result = infer(fields_, results,
by=by,
fields=fields,
types=types,
ops=ops,
renames=renames,
defines=defines)
results_ = []
for r in results:
if not any(k in r and r[k].strip()
for k in Result._fields):
continue
try:
results_.append(Result(**{
k: r[k] for k in Result._by + Result._fields
if k in r and r[k].strip()}))
except TypeError:
pass
results = results_
# fold
results = fold(Result, results, by=by, defines=defines)
# sort, note that python's sort is stable
results.sort()
if sort:
for k, reverse in reversed(sort):
results.sort(
key=lambda r: tuple(
(getattr(r, k),) if getattr(r, k) is not None else ()
for k in ([k] if k else Result._sort)),
reverse=reverse ^ (not k or k in Result._fields))
# write results to CSV
if args.get('output'):
with openio(args['output'], 'w') as f:
writer = csv.DictWriter(f, Result._by + Result._fields)
writer.writeheader()
for r in results:
# note we need to go through getattr to resolve lazy fields
writer.writerow({
k: getattr(r, k) for k in Result._by + Result._fields})
# find previous results?
if args.get('diff'):
_, diff_results = collect([args['diff']], renames, defines)
diff_results_ = []
for r in diff_results:
if not any(k in r and r[k].strip()
for k in Result._fields):
continue
try:
diff_results_.append(Result(**{
k: r[k] for k in Result._by + Result._fields
if k in r and r[k].strip()}))
except TypeError:
pass
diff_results = diff_results_
# fold
diff_results = fold(Result, diff_results, by=by, defines=defines)
# print table
if not args.get('quiet'):
table(Result, results,
diff_results if args.get('diff') else None,
by=by,
fields=fields,
sort=sort,
**args)
if __name__ == "__main__":
import argparse
import sys
parser = argparse.ArgumentParser(
description="Summarize measurements in CSV files.",
allow_abbrev=False)
parser.add_argument(
'csv_paths',
nargs='*',
help="Input *.csv files.")
parser.add_argument(
'-q', '--quiet',
action='store_true',
help="Don't show anything, useful with -o.")
parser.add_argument(
'-o', '--output',
help="Specify CSV file to store results.")
parser.add_argument(
'-d', '--diff',
help="Specify CSV file to diff against.")
parser.add_argument(
'-a', '--all',
action='store_true',
help="Show all, not just the ones that changed.")
parser.add_argument(
'-p', '--percent',
action='store_true',
help="Only show percentage change, not a full diff.")
parser.add_argument(
'-b', '--by',
action='append',
type=lambda x: (
lambda k, vs=None: (
k.strip(),
tuple(v.strip() for v in vs.split(','))
if vs is not None else ())
)(*x.split('=', 1)),
help="Group by this field. Can rename fields with new_name=old_name.")
parser.add_argument(
'-f', '--field',
dest='fields',
action='append',
type=lambda x: (
lambda k, vs=None: (
k.strip(),
tuple(v.strip() for v in vs.split(','))
if vs is not None else ())
)(*x.split('=', 1)),
help="Show this field. Can rename fields with new_name=old_name.")
parser.add_argument(
'-D', '--define',
dest='defines',
action='append',
type=lambda x: (
lambda k, vs: (
k.strip(),
{v.strip() for v in vs.split(',')})
)(*x.split('=', 1)),
help="Only include results where this field is this value. May include "
"comma-separated options.")
class AppendSort(argparse.Action):
def __call__(self, parser, namespace, value, option):
if namespace.sort is None:
namespace.sort = []
namespace.sort.append((value, True if option == '-S' else False))
parser.add_argument(
'-s', '--sort',
nargs='?',
action=AppendSort,
help="Sort by this field.")
parser.add_argument(
'-S', '--reverse-sort',
nargs='?',
action=AppendSort,
help="Sort by this field, but backwards.")
parser.add_argument(
'-Y', '--summary',
action='store_true',
help="Only show the total.")
parser.add_argument(
'--int',
action='append',
help="Treat these fields as ints.")
parser.add_argument(
'--float',
action='append',
help="Treat these fields as floats.")
parser.add_argument(
'--frac',
action='append',
help="Treat these fields as fractions.")
parser.add_argument(
'--sum',
action='append',
help="Add these fields (the default).")
parser.add_argument(
'--prod',
action='append',
help="Multiply these fields.")
parser.add_argument(
'--min',
action='append',
help="Take the minimum of these fields.")
parser.add_argument(
'--max',
action='append',
help="Take the maximum of these fields.")
parser.add_argument(
'--avg', '--mean',
action='append',
help="Average these fields.")
parser.add_argument(
'--stddev',
action='append',
help="Find the standard deviation of these fields.")
parser.add_argument(
'--gmean',
action='append',
help="Find the geometric mean of these fields.")
parser.add_argument(
'--gstddev',
action='append',
help="Find the geometric standard deviation of these fields.")
sys.exit(main(**{k: v
for k, v in vars(parser.parse_intermixed_args()).items()
if v is not None}))