#!/usr/bin/env python3 # # Script to manipulate CSV files. # # Example: # ./scripts/code.py lfs.o lfs_util.o -q -o lfs.code.csv # ./scripts/data.py lfs.o lfs_util.o -q -o lfs.data.csv # ./scripts/csv.py lfs.code.csv lfs.data.csv -q -o lfs.csv # ./scripts/csv.py -Y lfs.csv -f code=code_size,data=data_size # # Copyright (c) 2022, The littlefs authors. # SPDX-License-Identifier: BSD-3-Clause # # prevent local imports __import__('sys').path.pop(0) import collections as co import csv import functools as ft import itertools as it import math as mt import os import re # supported merge operations # # this is a terrible way to express these # OPS = { 'sum': lambda xs: sum(xs[1:], start=xs[0]), 'prod': lambda xs: mt.prod(xs[1:], start=xs[0]), 'min': min, 'max': max, 'avg': lambda xs: RFloat(sum(float(x) for x in xs) / len(xs)), 'stddev': lambda xs: ( lambda avg: RFloat( mt.sqrt(sum((float(x) - avg)**2 for x in xs) / len(xs))) )(sum(float(x) for x in xs) / len(xs)), 'gmean': lambda xs: RFloat(mt.prod(float(x) for x in xs)**(1/len(xs))), 'gstddev': lambda xs: ( lambda gmean: RFloat( mt.exp(mt.sqrt( sum(mt.log(float(x)/gmean)**2 for x in xs) / len(xs))) if gmean else mt.inf) )(mt.prod(float(x) for x in xs)**(1/len(xs))), } # integer fields class RInt(co.namedtuple('RInt', 'x')): __slots__ = () def __new__(cls, x=0): if isinstance(x, RInt): return x if isinstance(x, str): try: x = int(x, 0) except ValueError: # also accept +-∞ and +-inf if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x): x = mt.inf elif re.match('^\s*-\s*(?:∞|inf)\s*$', x): x = -mt.inf else: raise assert isinstance(x, int) or mt.isinf(x), x return super().__new__(cls, x) def __str__(self): if self.x == mt.inf: return '∞' elif self.x == -mt.inf: return '-∞' else: return str(self.x) def __int__(self): assert not mt.isinf(self.x) return self.x def __float__(self): return float(self.x) none = '%7s' % '-' def table(self): return '%7s' % (self,) def diff(self, other): new = self.x if self else 0 old = other.x if other else 0 diff = new - old if diff == +mt.inf: return '%7s' % '+∞' elif diff == -mt.inf: return '%7s' % '-∞' else: return '%+7d' % diff def ratio(self, other): new = self.x if self else 0 old = other.x if other else 0 if mt.isinf(new) and mt.isinf(old): return 0.0 elif mt.isinf(new): return +mt.inf elif mt.isinf(old): return -mt.inf elif not old and not new: return 0.0 elif not old: return +mt.inf else: return (new-old) / old def __add__(self, other): return self.__class__(self.x + other.x) def __sub__(self, other): return self.__class__(self.x - other.x) def __mul__(self, other): return self.__class__(self.x * other.x) # float fields class RFloat(co.namedtuple('RFloat', 'x')): __slots__ = () def __new__(cls, x=0.0): if isinstance(x, RFloat): return x if isinstance(x, str): try: x = float(x) except ValueError: # also accept +-∞ and +-inf if re.match('^\s*\+?\s*(?:∞|inf)\s*$', x): x = mt.inf elif re.match('^\s*-\s*(?:∞|inf)\s*$', x): x = -mt.inf else: raise assert isinstance(x, float), x return super().__new__(cls, x) def __str__(self): if self.x == mt.inf: return '∞' elif self.x == -mt.inf: return '-∞' else: return '%.1f' % self.x def __float__(self): return float(self.x) none = RInt.none table = RInt.table def diff(self, other): new = self.x if self else 0 old = other.x if other else 0 diff = new - old if diff == +mt.inf: return '%7s' % '+∞' elif diff == -mt.inf: return '%7s' % '-∞' else: return '%+7.1f' % diff ratio = RInt.ratio __add__ = RInt.__add__ __sub__ = RInt.__sub__ __mul__ = RInt.__mul__ # fractional fields, a/b class RFrac(co.namedtuple('RFrac', 'a,b')): __slots__ = () def __new__(cls, a=0, b=None): if isinstance(a, RFrac) and b is None: return a if isinstance(a, str) and b is None: a, b = a.split('/', 1) if b is None: b = a return super().__new__(cls, RInt(a), RInt(b)) def __str__(self): return '%s/%s' % (self.a, self.b) def __float__(self): return float(self.a) none = '%11s' % '-' def table(self): return '%11s' % (self,) def notes(self): t = self.a.x/self.b.x if self.b.x else 1.0 return ['∞%' if t == +mt.inf else '-∞%' if t == -mt.inf else '%.1f%%' % (100*t)] def diff(self, other): new_a, new_b = self if self else (RInt(0), RInt(0)) old_a, old_b = other if other else (RInt(0), RInt(0)) return '%11s' % ('%s/%s' % ( new_a.diff(old_a).strip(), new_b.diff(old_b).strip())) def ratio(self, other): new_a, new_b = self if self else (RInt(0), RInt(0)) old_a, old_b = other if other else (RInt(0), RInt(0)) new = new_a.x/new_b.x if new_b.x else 1.0 old = old_a.x/old_b.x if old_b.x else 1.0 return new - old def __add__(self, other): return self.__class__(self.a + other.a, self.b + other.b) def __sub__(self, other): return self.__class__(self.a - other.a, self.b - other.b) def __mul__(self, other): return self.__class__(self.a * other.a, self.b + other.b) def __eq__(self, other): self_a, self_b = self if self.b.x else (RInt(1), RInt(1)) other_a, other_b = other if other.b.x else (RInt(1), RInt(1)) return self_a * other_b == other_a * self_b def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other): self_a, self_b = self if self.b.x else (RInt(1), RInt(1)) other_a, other_b = other if other.b.x else (RInt(1), RInt(1)) return self_a * other_b < other_a * self_b def __gt__(self, other): return self.__class__.__lt__(other, self) def __le__(self, other): return not self.__gt__(other) def __ge__(self, other): return not self.__lt__(other) # available types TYPES = co.OrderedDict([ ('int', RInt), ('float', RFloat), ('frac', RFrac) ]) def openio(path, mode='r', buffering=-1): # allow '-' for stdin/stdout if path == '-': if 'r' in mode: return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering) else: return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering) else: return open(path, mode, buffering) def collect(csv_paths, renames=[], defines=[]): # collect results from CSV files fields = [] results = [] for path in csv_paths: try: with openio(path) as f: reader = csv.DictReader(f, restval='') fields.extend( k for k in reader.fieldnames if k not in fields) for r in reader: # apply any renames if renames: # make a copy so renames can overlap r_ = {} for new_k, old_k in renames: if old_k in r: r_[new_k] = r[old_k] r.update(r_) # filter by matching defines if not all(k in r and r[k] in vs for k, vs in defines): continue results.append(r) except FileNotFoundError: pass return fields, results def infer(fields_, results, by=None, fields=None, types={}, ops={}, renames=[], defines=[]): # if by not specified, guess it's anything not in fields/renames/defines if by is None: by = [k for k in fields_ if k not in (fields or []) and not any(k == old_k for _, old_k in renames) and not any(k == k_ for k_, _ in defines)] # if fields not specified, guess it's anything not in by/renames/defines if fields is None: fields = [k for k in fields_ if k not in (by or []) and not any(k == old_k for _, old_k in renames) and not any(k == k_ for k_, _ in defines)] # deduplicate by/fields by = list(co.OrderedDict.fromkeys(by).keys()) fields = list(co.OrderedDict.fromkeys(fields).keys()) # find best type for all fields types_ = {} for k in fields: if k in types: types_[k] = types[k] else: for t in TYPES.values(): for r in results: if k in r and r[k].strip(): try: t(r[k]) except ValueError: break else: types_[k] = t break else: print("error: no type matches field %r?" % k, file=sys.stderr) sys.exit(-1) types = types_ # does folding change the type? types_ = {} for k, t in types.items(): types_[k] = ops.get(k, OPS['sum'])([t()]).__class__ # create result class def __new__(cls, **r): return cls.__mro__[1].__new__(cls, **{k: r.get(k, '') for k in by}, **{k: r[k] if k in r and isinstance(r[k], tuple) else ([types[k](r[k])], 1) if k in r else ([], 0) for k in fields}) def __add__(self, other): # reuse lists if possible def extend(a, b): if len(a[0]) == a[1]: a[0].extend(b[0][:b[1]]) return (a[0], a[1] + b[1]) else: return (a[0][:a[1]] + b[0][:b[1]], a[1] + b[1]) return self.__class__( **{k: getattr(self, k) for k in by}, **{k: extend( object.__getattribute__(self, k), object.__getattribute__(other, k)) for k in fields}) def __getattribute__(self, k): if k in fields: v = object.__getattribute__(self, k) if v[1]: return ops.get(k, OPS['sum'])(v[0][:v[1]]) else: return None return object.__getattribute__(self, k) return type('Result', (co.namedtuple('Result', by + fields),), { '__slots__': (), '__new__': __new__, '__add__': __add__, '__getattribute__': __getattribute__, '_by': by, '_fields': fields, '_sort': fields, '_types': types_, }) def fold(Result, results, by=None, defines=[]): if by is None: by = Result._by for k in it.chain(by or [], (k for k, _ in defines)): if k not in Result._by and k not in Result._fields: print("error: could not find field %r?" % k, file=sys.stderr) sys.exit(-1) # filter by matching defines if defines: results_ = [] for r in results: if all(getattr(r, k) in vs for k, vs in defines): results_.append(r) results = results_ # organize results into conflicts folding = co.OrderedDict() for r in results: name = tuple(getattr(r, k) for k in by) if name not in folding: folding[name] = [] folding[name].append(r) # merge conflicts folded = [] for name, rs in folding.items(): folded.append(sum(rs[1:], start=rs[0])) return folded def table(Result, results, diff_results=None, *, by=None, fields=None, sort=None, summary=False, all=False, percent=False, **_): all_, all = all, __builtins__.all if by is None: by = Result._by if fields is None: fields = Result._fields types = Result._types # fold again results = fold(Result, results, by=by) if diff_results is not None: diff_results = fold(Result, diff_results, by=by) # organize by name table = { ','.join(str(getattr(r, k) or '') for k in by): r for r in results} diff_table = { ','.join(str(getattr(r, k) or '') for k in by): r for r in diff_results or []} names = [name for name in table.keys() | diff_table.keys() if diff_results is None or all_ or any( types[k].ratio( getattr(table.get(name), k, None), getattr(diff_table.get(name), k, None)) for k in fields)] # sort again, now with diff info, note that python's sort is stable names.sort() if diff_results is not None: names.sort( key=lambda n: tuple( types[k].ratio( getattr(table.get(n), k, None), getattr(diff_table.get(n), k, None)) for k in fields), reverse=True) if sort: for k, reverse in reversed(sort): names.sort( key=lambda n: tuple( (getattr(table[n], k),) if getattr(table.get(n), k, None) is not None else () for k in ( [k] if k else [ k for k in Result._sort if k in fields])), reverse=reverse ^ (not k or k in Result._fields)) # build up our lines lines = [] # header header = ['%s%s' % ( ','.join(by), ' (%d added, %d removed)' % ( sum(1 for n in table if n not in diff_table), sum(1 for n in diff_table if n not in table)) if diff_results is not None and not percent else '') if not summary else ''] if diff_results is None: for k in fields: header.append(k) elif percent: for k in fields: header.append(k) else: for k in fields: header.append('o'+k) for k in fields: header.append('n'+k) for k in fields: header.append('d'+k) lines.append(header) # entry helper def table_entry(name, r, diff_r=None): entry = [name] if diff_results is None: for k in fields: entry.append( (getattr(r, k).table(), getattr(getattr(r, k), 'notes', lambda: [])()) if getattr(r, k, None) is not None else types[k].none) elif percent: for k in fields: entry.append( (getattr(r, k).table() if getattr(r, k, None) is not None else types[k].none, (lambda t: ['+∞%'] if t == +mt.inf else ['-∞%'] if t == -mt.inf else ['%+.1f%%' % (100*t)])( types[k].ratio( getattr(r, k, None), getattr(diff_r, k, None))))) else: for k in fields: entry.append(getattr(diff_r, k).table() if getattr(diff_r, k, None) is not None else types[k].none) for k in fields: entry.append(getattr(r, k).table() if getattr(r, k, None) is not None else types[k].none) for k in fields: entry.append( (types[k].diff( getattr(r, k, None), getattr(diff_r, k, None)), (lambda t: ['+∞%'] if t == +mt.inf else ['-∞%'] if t == -mt.inf else ['%+.1f%%' % (100*t)] if t else [])( types[k].ratio( getattr(r, k, None), getattr(diff_r, k, None))))) return entry # entries if not summary: for name in names: r = table.get(name) if diff_results is None: diff_r = None else: diff_r = diff_table.get(name) lines.append(table_entry(name, r, diff_r)) # total r = next(iter(fold(Result, results, by=[])), None) if diff_results is None: diff_r = None else: diff_r = next(iter(fold(Result, diff_results, by=[])), None) lines.append(table_entry('TOTAL', r, diff_r)) # homogenize lines = [ [x if isinstance(x, tuple) else (x, []) for x in line] for line in lines] # find the best widths, note that column 0 contains the names and is # handled a bit differently widths = co.defaultdict(lambda: 7, {0: 23}) notes = co.defaultdict(lambda: 0) for line in lines: for i, x in enumerate(line): widths[i] = max(widths[i], ((len(x[0])+1+4-1)//4)*4-1) notes[i] = max(notes[i], 1+2*len(x[1])+sum(len(n) for n in x[1])) # print our table for line in lines: print('%-*s %s' % ( widths[0], line[0][0], ' '.join('%*s%-*s' % ( widths[i], x[0], notes[i], ' (%s)' % ', '.join(x[1]) if x[1] else '') for i, x in enumerate(line[1:], 1)))) def main(csv_paths, *, by=None, fields=None, defines=[], sort=None, **args): # separate out renames renames = list(it.chain.from_iterable( ((k, v) for v in vs) for k, vs in it.chain(by or [], fields or []))) if by is not None: by = [k for k, _ in by] if fields is not None: fields = [k for k, _ in fields] # figure out types types = {} for t in TYPES.keys(): for k in args.get(t, []): if k in types: print("error: conflicting type for field %r?" % k, file=sys.stderr) sys.exit(-1) types[k] = TYPES[t] # rename types? if renames: types_ = {} for new_k, old_k in renames: if old_k in types: types_[new_k] = types[old_k] types.update(types_) # figure out merge operations ops = {} for o in OPS.keys(): for k in args.get(o, []): if k in ops: print("error: conflicting op for field %r?" % k, file=sys.stderr) sys.exit(-1) ops[k] = OPS[o] # rename ops? if renames: ops_ = {} for new_k, old_k in renames: if old_k in ops: ops_[new_k] = ops[old_k] ops.update(ops_) if by is None and fields is None: print("error: needs --by or --fields to figure out fields", file=sys.stderr) sys.exit(-1) # use is just an alias if args.get('use'): csv_paths = csv_paths + [args['use']] # find CSV files fields_, results = collect(csv_paths, renames, defines) # homogenize Result = infer(fields_, results, by=by, fields=fields, types=types, ops=ops, renames=renames, defines=defines) results_ = [] for r in results: if not any(k in r and r[k].strip() for k in Result._fields): continue try: results_.append(Result(**{ k: r[k] for k in Result._by + Result._fields if k in r and r[k].strip()})) except TypeError: pass results = results_ # fold results = fold(Result, results, by=by, defines=defines) # sort, note that python's sort is stable results.sort() if sort: for k, reverse in reversed(sort): results.sort( key=lambda r: tuple( (getattr(r, k),) if getattr(r, k) is not None else () for k in ([k] if k else Result._sort)), reverse=reverse ^ (not k or k in Result._fields)) # write results to CSV if args.get('output'): with openio(args['output'], 'w') as f: writer = csv.DictWriter(f, Result._by + Result._fields) writer.writeheader() for r in results: # note we need to go through getattr to resolve lazy fields writer.writerow({ k: getattr(r, k) for k in Result._by + Result._fields}) # find previous results? if args.get('diff'): _, diff_results = collect([args['diff']], renames, defines) diff_results_ = [] for r in diff_results: if not any(k in r and r[k].strip() for k in Result._fields): continue try: diff_results_.append(Result(**{ k: r[k] for k in Result._by + Result._fields if k in r and r[k].strip()})) except TypeError: pass diff_results = diff_results_ # fold diff_results = fold(Result, diff_results, by=by, defines=defines) # print table if not args.get('quiet'): table(Result, results, diff_results if args.get('diff') else None, by=by, fields=fields, sort=sort, **args) if __name__ == "__main__": import argparse import sys parser = argparse.ArgumentParser( description="Script to manipulate CSV files.", allow_abbrev=False) parser.add_argument( 'csv_paths', nargs='*', help="Input *.csv files.") parser.add_argument( '-q', '--quiet', action='store_true', help="Don't show anything, useful with -o.") parser.add_argument( '-o', '--output', help="Specify CSV file to store results.") parser.add_argument( '-u', '--use', help="Don't parse anything, use this CSV file.") parser.add_argument( '-d', '--diff', help="Specify CSV file to diff against.") parser.add_argument( '-a', '--all', action='store_true', help="Show all, not just the ones that changed.") parser.add_argument( '-p', '--percent', action='store_true', help="Only show percentage change, not a full diff.") parser.add_argument( '-b', '--by', action='append', type=lambda x: ( lambda k, vs=None: ( k.strip(), tuple(v.strip() for v in vs.split(',')) if vs is not None else ()) )(*x.split('=', 1)), help="Group by this field. Can rename fields with " "new_name=old_name.") parser.add_argument( '-f', '--field', dest='fields', action='append', type=lambda x: ( lambda k, vs=None: ( k.strip(), tuple(v.strip() for v in vs.split(',')) if vs is not None else ()) )(*x.split('=', 1)), help="Show this field. Can rename fields with " "new_name=old_name.") parser.add_argument( '-D', '--define', dest='defines', action='append', type=lambda x: ( lambda k, vs: ( k.strip(), {v.strip() for v in vs.split(',')}) )(*x.split('=', 1)), help="Only include results where this field is this value. May " "include comma-separated options.") class AppendSort(argparse.Action): def __call__(self, parser, namespace, value, option): if namespace.sort is None: namespace.sort = [] namespace.sort.append((value, True if option == '-S' else False)) parser.add_argument( '-s', '--sort', nargs='?', action=AppendSort, help="Sort by this field.") parser.add_argument( '-S', '--reverse-sort', nargs='?', action=AppendSort, help="Sort by this field, but backwards.") parser.add_argument( '-Y', '--summary', action='store_true', help="Only show the total.") parser.add_argument( '--int', action='append', help="Treat these fields as ints.") parser.add_argument( '--float', action='append', help="Treat these fields as floats.") parser.add_argument( '--frac', action='append', help="Treat these fields as fractions.") parser.add_argument( '--sum', action='append', help="Add these fields (the default).") parser.add_argument( '--prod', action='append', help="Multiply these fields.") parser.add_argument( '--min', action='append', help="Take the minimum of these fields.") parser.add_argument( '--max', action='append', help="Take the maximum of these fields.") parser.add_argument( '--avg', '--mean', action='append', help="Average these fields.") parser.add_argument( '--stddev', action='append', help="Find the standard deviation of these fields.") parser.add_argument( '--gmean', action='append', help="Find the geometric mean of these fields.") parser.add_argument( '--gstddev', action='append', help="Find the geometric standard deviation of these fields.") sys.exit(main(**{k: v for k, v in vars(parser.parse_intermixed_args()).items() if v is not None}))