Added stddev/gmean/gstddev to summary.py

This commit is contained in:
Christopher Haster
2022-09-27 13:30:43 -05:00
parent 9507e6243c
commit a2fb7089dd
6 changed files with 266 additions and 230 deletions

View File

@@ -25,12 +25,26 @@ import re
CSV_PATHS = ['*.csv']
# supported merge operations
#
# this is a terrible way to express these
#
# Table of merge operations, keyed by operation name. Each value is a
# callable that takes a non-empty list of field values (xs) and returns
# the merged value.
OPS = {
# fold with +, seeded with the first element instead of 0
'add': lambda xs: sum(xs[1:], start=xs[0]),
# fold with *, seeded with the first element instead of 1
'mul': lambda xs: m.prod(xs[1:], start=xs[0]),
'min': min,
'max': max,
# sum of the fields divided by their count, using the field's own / operator
'avg': lambda xs: sum(xs[1:], start=xs[0]) / len(xs),
# same callables as 'add' and 'mul' above under different names
'sum': lambda xs: sum(xs[1:], start=xs[0]),
'prod': lambda xs: m.prod(xs[1:], start=xs[0]),
# NOTE: 'min' and 'max' appear twice in this literal; the later entries
# replace the earlier ones and bind the same builtins
'min': min,
'max': max,
# arithmetic mean of the float values, always returned as a FloatField
'mean': lambda xs: FloatField(sum(float(x) for x in xs) / len(xs)),
# population standard deviation (divides by len(xs)); the inner lambda is
# applied immediately so the mean is computed once and bound to a name
'stddev': lambda xs: (
lambda mean: FloatField(
m.sqrt(sum((float(x) - mean)**2 for x in xs) / len(xs)))
)(sum(float(x) for x in xs) / len(xs)),
# geometric mean: the len(xs)-th root of the product of the float values
'gmean': lambda xs: FloatField(m.prod(float(x) for x in xs)**(1/len(xs))),
# geometric standard deviation: exp of the root-mean-square of
# log(x/gmean); evaluates to m.inf when the geometric mean is zero
'gstddev': lambda xs: (
lambda gmean: FloatField(
m.exp(m.sqrt(sum(m.log(float(x)/gmean)**2 for x in xs) / len(xs)))
if gmean else m.inf)
)(m.prod(float(x) for x in xs)**(1/len(xs)))
}
@@ -47,7 +61,7 @@ def openio(path, mode='r'):
# integer fields
class IntField(co.namedtuple('IntField', 'x')):
__slots__ = ()
def __new__(cls, x):
def __new__(cls, x=0):
if isinstance(x, IntField):
return x
if isinstance(x, str):
@@ -56,13 +70,22 @@ class IntField(co.namedtuple('IntField', 'x')):
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = float('inf')
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = float('-inf')
x = -m.inf
else:
raise
assert isinstance(x, int) or m.isinf(x), x
return super().__new__(cls, x)
def __str__(self):
    """Render the wrapped value as text.

    Returns '∞' for positive infinity, '-∞' for negative infinity, and
    ``str`` of the wrapped value otherwise.
    """
    # positive infinity previously rendered as an empty string; '∞' mirrors
    # the '-∞' branch and is a spelling the string parser in __new__ accepts
    if self.x == m.inf:
        return '∞'
    elif self.x == -m.inf:
        return '-∞'
    else:
        return str(self.x)
def __int__(self):
    """Return the wrapped value; infinite values are rejected."""
    value = self.x
    # an infinity has no integer form, so fail instead of converting
    assert not m.isinf(value)
    return value
@@ -70,14 +93,6 @@ class IntField(co.namedtuple('IntField', 'x')):
def __float__(self):
    """Convert the wrapped value to a builtin float."""
    value = self.x
    return float(value)
def __str__(self):
    """Render the wrapped value as text.

    Returns '∞' for positive infinity, '-∞' for negative infinity, and
    ``str`` of the wrapped value otherwise.
    """
    # positive infinity previously rendered as an empty string; '∞' mirrors
    # the '-∞' branch below
    if self.x == float('inf'):
        return '∞'
    elif self.x == float('-inf'):
        return '-∞'
    else:
        return str(self.x)
none = '%7s' % '-'
def table(self):
    """Format this field as a right-aligned table cell at least 7 wide."""
    # go through str() explicitly so __str__ decides the text
    text = str(self)
    return text.rjust(7)
@@ -89,9 +104,9 @@ class IntField(co.namedtuple('IntField', 'x')):
new = self.x if self else 0
old = other.x if other else 0
diff = new - old
if diff == float('+inf'):
if diff == +m.inf:
return '%7s' % '+∞'
elif diff == float('-inf'):
elif diff == -m.inf:
return '%7s' % '-∞'
else:
return '%+7d' % diff
@@ -102,9 +117,9 @@ class IntField(co.namedtuple('IntField', 'x')):
if m.isinf(new) and m.isinf(old):
return 0.0
elif m.isinf(new):
return float('+inf')
return +m.inf
elif m.isinf(old):
return float('-inf')
return -m.inf
elif not old and not new:
return 0.0
elif not old:
@@ -115,6 +130,9 @@ class IntField(co.namedtuple('IntField', 'x')):
def __add__(self, other):
    """Return the sum of two fields as a new field of ``self``'s type."""
    # construct the result with type(self) rather than IntField: FloatField
    # reuses this method, and IntField's constructor asserts that its value
    # is an int or an infinity, so finite float results were rejected
    return type(self)(self.x + other.x)

def __sub__(self, other):
    """Return the difference of two fields as a new field of ``self``'s type."""
    # type(self) keeps FloatField results as FloatField (see __add__)
    return type(self)(self.x - other.x)

def __mul__(self, other):
    """Return the product of two fields as a new field of ``self``'s type."""
    # type(self) keeps FloatField results as FloatField (see __add__)
    return type(self)(self.x * other.x)
@@ -130,16 +148,10 @@ class IntField(co.namedtuple('IntField', 'x')):
def __ge__(self, other):
    """Return True unless ``self`` orders strictly before ``other``."""
    # defined as the negation of __lt__
    is_less = self.__lt__(other)
    return not is_less
def __truediv__(self, n):
    """Divide by ``n`` and wrap the ``round()``-ed quotient in an IntField.

    An infinite value is returned unchanged.
    """
    # guard clause: dividing an infinity leaves it as-is
    if m.isinf(self.x):
        return self
    quotient = self.x / n
    return IntField(round(quotient))
# float fields
class FloatField(co.namedtuple('FloatField', 'x')):
__slots__ = ()
def __new__(cls, x):
def __new__(cls, x=0.0):
if isinstance(x, FloatField):
return x
if isinstance(x, str):
@@ -148,24 +160,25 @@ class FloatField(co.namedtuple('FloatField', 'x')):
except ValueError:
# also accept +-∞ and +-inf
if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x):
x = float('inf')
x = m.inf
elif re.match('^\s*-\s*(?:∞|inf)\s*$', x):
x = float('-inf')
x = -m.inf
else:
raise
assert isinstance(x, float), x
return super().__new__(cls, x)
def __float__(self):
    """Convert the wrapped value to a builtin float."""
    value = self.x
    return float(value)
def __str__(self):
    """Render the wrapped float as text.

    Returns '∞' for positive infinity, '-∞' for negative infinity, and the
    value formatted with one decimal place otherwise.
    """
    # positive infinity previously rendered as an empty string; '∞' mirrors
    # the '-∞' branch and is a spelling the string parser in __new__ accepts
    if self.x == m.inf:
        return '∞'
    elif self.x == -m.inf:
        return '-∞'
    else:
        return '%.1f' % self.x
def __float__(self):
    """Convert the wrapped value to a builtin float."""
    value = self.x
    return float(value)
none = IntField.none
table = IntField.table
diff_none = IntField.diff_none
@@ -173,22 +186,17 @@ class FloatField(co.namedtuple('FloatField', 'x')):
diff_diff = IntField.diff_diff
ratio = IntField.ratio
__add__ = IntField.__add__
__sub__ = IntField.__sub__
__mul__ = IntField.__mul__
__lt__ = IntField.__lt__
__gt__ = IntField.__gt__
__le__ = IntField.__le__
__ge__ = IntField.__ge__
def __truediv__(self, n):
    """Divide by ``n`` and wrap the quotient in a FloatField.

    An infinite value is returned unchanged.
    """
    # guard clause: dividing an infinity leaves it as-is
    if m.isinf(self.x):
        return self
    quotient = self.x / n
    return FloatField(quotient)
# fractional fields, a/b
class FracField(co.namedtuple('FracField', 'a,b')):
__slots__ = ()
def __new__(cls, a, b=None):
def __new__(cls, a=0, b=None):
if isinstance(a, FracField) and b is None:
return a
if isinstance(a, str) and b is None:
@@ -200,6 +208,9 @@ class FracField(co.namedtuple('FracField', 'a,b')):
def __str__(self):
    """Render the fraction as ``a/b``."""
    numerator = str(self.a)
    denominator = str(self.b)
    return numerator + '/' + denominator
def __float__(self):
    """Return the numerator ``a`` as a float; ``b`` is not consulted."""
    numerator = self.a
    return float(numerator)
none = '%11s %7s' % ('-', '-')
def table(self):
if not self.b.x:
@@ -208,8 +219,8 @@ class FracField(co.namedtuple('FracField', 'a,b')):
t = self.a.x/self.b.x
return '%11s %7s' % (
self,
'∞%' if t == float('+inf')
else '-∞%' if t == float('-inf')
'∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%.1f%%' % (100*t))
diff_none = '%11s' % '-'
@@ -236,12 +247,15 @@ class FracField(co.namedtuple('FracField', 'a,b')):
def __add__(self, other):
    """Add two fractions component-wise (numerators and denominators separately)."""
    a = self.a + other.a
    b = self.b + other.b
    return FracField(a, b)
def __sub__(self, other):
    """Subtract two fractions component-wise (numerators and denominators separately)."""
    a = self.a - other.a
    b = self.b - other.b
    return FracField(a, b)
def __mul__(self, other):
    """Combine two fractions: numerators are multiplied, denominators are added."""
    a = self.a * other.a
    # NOTE(review): the denominators are summed, not multiplied -- confirm
    # this asymmetry with the numerator is intended
    b = self.b + other.b
    return FracField(a, b)
def __lt__(self, other):
    """Order fractions by the ratio a/b; a zero denominator counts as -inf."""
    def ratio_of(frac):
        # a zero denominator has no ratio, so it is ranked below everything
        denominator = frac.b.x
        if not denominator:
            return -m.inf
        return frac.a.x / denominator

    return ratio_of(self) < ratio_of(other)
def __gt__(self, other):
@@ -253,9 +267,6 @@ class FracField(co.namedtuple('FracField', 'a,b')):
def __ge__(self, other):
    """Return True unless ``self`` orders strictly before ``other``."""
    # defined as the negation of __lt__
    is_less = self.__lt__(other)
    return not is_less
def __truediv__(self, n):
    """Divide both the numerator and the denominator by ``n``."""
    a = self.a / n
    b = self.b / n
    return FracField(a, b)
# available types
TYPES = [IntField, FloatField, FracField]
@@ -314,7 +325,7 @@ def homogenize(results, *,
if k is not None
and k not in fields
and not any(k == old_k for _, old_k in renames))
by = list(by.keys())
by = list(by.keys())
# go ahead and clean up none values, these can have a few forms
results_ = []
@@ -357,6 +368,7 @@ def homogenize(results, *,
def fold(results, *,
by=[],
fields=[],
types=None,
ops={},
**_):
folding = co.OrderedDict()
@@ -375,7 +387,7 @@ def fold(results, *,
for k, vs in r.items():
if vs:
# sum fields by default
op = OPS[ops.get(k, 'add')]
op = OPS[ops.get(k, 'sum')]
r_[k] = op(vs)
# drop any rows without fields and any empty keys
@@ -384,14 +396,24 @@ def fold(results, *,
{k: v for k, v in zip(by, name) if v},
**r_))
return folded
# what is the type of merged fields?
if types is not None:
types_ = {}
for k in fields:
op = OPS[ops.get(k, 'sum')]
types_[k] = op([types[k]()]).__class__
if types is None:
return folded
else:
return types_, folded
def table(results, diff_results=None, *,
by=None,
fields=None,
types=None,
ops=None,
def table(results, total, diff_results=None, diff_total=None, *,
by=[],
fields=[],
types={},
ops={},
sort=None,
reverse_sort=None,
summary=False,
@@ -472,8 +494,8 @@ def table(results, diff_results=None, *,
if k in r else types[k].diff_none
for k in fields),
' (%s)' % ', '.join(
'+∞%' if t == float('+inf')
else '-∞%' if t == float('-inf')
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios)))
else:
@@ -488,19 +510,17 @@ def table(results, diff_results=None, *,
if k in r or k in diff_r else types[k].diff_none
for k in fields),
' (%s)' % ', '.join(
'+∞%' if t == float('+inf')
else '-∞%' if t == float('-inf')
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios
if t)
if any(ratios) else ''))
# print total
total = fold(results, by=[], fields=fields, ops=ops)
r = total[0] if total else {}
if diff_results is not None:
diff_total = fold(diff_results, by=[], fields=fields, ops=ops)
diff_r = diff_total[0] if diff_total else {}
r = total
if diff_total is not None:
diff_r = diff_total
ratios = [types[k].ratio(r.get(k), diff_r.get(k))
for k in fields]
@@ -516,8 +536,8 @@ def table(results, diff_results=None, *,
if k in r else types[k].diff_none
for k in fields),
' (%s)' % ', '.join(
'+∞%' if t == float('+inf')
else '-∞%' if t == float('-inf')
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios)))
else:
@@ -532,8 +552,8 @@ def table(results, diff_results=None, *,
if k in r or k in diff_r else types[k].diff_none
for k in fields),
' (%s)' % ', '.join(
'+∞%' if t == float('+inf')
else '-∞%' if t == float('-inf')
'+∞%' if t == +m.inf
else '-∞%' if t == -m.inf
else '%+.1f%%' % (100*t)
for t in ratios
if t)
@@ -597,9 +617,13 @@ def main(csv_paths, *,
by, fields, types, results = homogenize(results,
by=by, fields=fields, renames=renames, define=define)
# fold for total, note we do this with the raw data to avoid
# issues with lossy operations
total = fold(results, fields=fields, ops=ops)
total = total[0] if total else {}
# fold to remove duplicates
results = fold(results,
by=by, fields=fields, ops=ops)
types_, results = fold(results, by=by, fields=fields, types=types, ops=ops)
# write results to CSV
if args.get('output'):
@@ -624,19 +648,25 @@ def main(csv_paths, *,
_, _, _, diff_results = homogenize(diff_results,
by=by, fields=fields, renames=renames, define=define, types=types)
# fold for total, note we do this with the raw data to avoid
# issues with lossy operations
diff_total = fold(diff_results, fields=fields, ops=ops)
diff_total = diff_total[0] if diff_total else {}
# fold to remove duplicates
diff_results = fold(diff_results,
by=by, fields=fields, ops=ops)
diff_results = fold(diff_results, by=by, fields=fields, ops=ops)
# print table
if not args.get('quiet'):
table(
results,
total,
diff_results if args.get('diff') else None,
diff_total if args.get('diff') else None,
by=by,
fields=fields,
types=types_,
ops=ops,
types=types,
**args)
@@ -685,11 +715,11 @@ if __name__ == "__main__":
help="Only include rows where this field is this value. May include "
"comma-separated options.")
parser.add_argument(
'--add',
'--sum',
action='append',
help="Add these fields (the default).")
parser.add_argument(
'--mul',
'--prod',
action='append',
help="Multiply these fields.")
parser.add_argument(
@@ -701,9 +731,21 @@ if __name__ == "__main__":
action='append',
help="Take the maximum of these fields.")
parser.add_argument(
'--avg',
'--mean',
action='append',
help="Average these fields.")
parser.add_argument(
'--stddev',
action='append',
help="Find the standard deviation of these fields.")
parser.add_argument(
'--gmean',
action='append',
help="Find the geometric mean of these fields.")
parser.add_argument(
'--gstddev',
action='append',
help="Find the geometric standard deviation of these fields.")
parser.add_argument(
'-s', '--sort',
action='append',