littlefs/scripts/perf.py

#!/usr/bin/env python3
#
# Script to aggregate and report Linux perf results.
#
# Example:
# ./scripts/perf.py -R -obench.perf ./runners/bench_runner
# ./scripts/perf.py bench.perf -Flfs.c -Flfs_util.c -Scycles
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
#

import bisect
import collections as co
import csv
import errno
import fcntl
import functools as ft
import glob
import itertools as it
import math as m
import multiprocessing as mp
import os
import re
import shlex
import shutil
import subprocess as sp
import sys
import tempfile
import zipfile


# glob pattern(s) for perf result files, not referenced in this part of
# the file, presumably the default input paths -- TODO confirm
PERF_PATHS = ['*.perf']
# perf invocation as an argv prefix, this is a list so it may carry
# extra args, default for record and collect_decompressed
PERF_TOOL = ['perf']
# comma-separated events handed to perf record via -e, these are the
# event names collect_decompressed knows how to map to result fields
PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
# default value for perf record's -F flag, only used when no explicit
# period is given
PERF_FREQ = 100
# objdump invocation as an argv prefix, collect appends ['-t', dso] and
# ['--dwarf=rawline', dso] to whatever objdump_tool it is given
OBJDUMP_TOOL = ['objdump']
# default (t0, t1) thresholds for annotate, as fractions of the hottest
# line, t0 selects which lines are shown/colored, t1 selects the
# stronger color
THRESHOLD = (0.5, 0.85)


# integer fields
class Int(co.namedtuple('Int', 'x')):
    # A single-value result field, x is either a Python int or a float
    # infinity.
    #
    # Strings are parsed with int(x, 0), so 0x/0o/0b prefixes work, and
    # "inf"/"∞" with an optional sign are also accepted.
    #
    # Note diff_diff and ratio are also called unbound from table with
    # self and/or other set to None, a missing operand is treated as 0.
    __slots__ = ()
    def __new__(cls, x=0):
        # already an Int? these are immutable so just reuse it
        if isinstance(x, Int):
            return x
        if isinstance(x, str):
            try:
                x = int(x, 0)
            except ValueError:
                # also accept +-∞ and +-inf
                if re.match(r'^\s*\+?\s*(?:∞|inf)\s*$', x):
                    x = m.inf
                elif re.match(r'^\s*-\s*(?:∞|inf)\s*$', x):
                    x = -m.inf
                else:
                    raise
        assert isinstance(x, int) or m.isinf(x), x
        return super().__new__(cls, x)

    def __str__(self):
        if self.x == m.inf:
            return '∞'
        elif self.x == -m.inf:
            return '-∞'
        else:
            return str(self.x)

    def __int__(self):
        # infinities have no integer representation
        assert not m.isinf(self.x)
        return self.x

    def __float__(self):
        return float(self.x)

    # placeholder for a missing value in a table
    none = '%7s' % '-'
    # render as a right-aligned 7-wide table cell
    def table(self):
        return '%7s' % (self,)

    # diff tables render values the same way as normal tables
    diff_none = '%7s' % '-'
    diff_table = table

    # render the signed difference self - other as a 7-wide table cell
    def diff_diff(self, other):
        new = self.x if self else 0
        old = other.x if other else 0
        # inf - inf is nan, which %d can't format, so treat equal values
        # (including equal infinities) as no change, this matches ratio
        if new == old:
            diff = 0
        else:
            diff = new - old
        if diff == +m.inf:
            return '%7s' % '+∞'
        elif diff == -m.inf:
            return '%7s' % '-∞'
        else:
            return '%+7d' % diff

    # relative change from other to self as a float, 1.0 == +100%
    def ratio(self, other):
        new = self.x if self else 0
        old = other.x if other else 0
        if m.isinf(new) and m.isinf(old):
            return 0.0
        elif m.isinf(new):
            return +m.inf
        elif m.isinf(old):
            return -m.inf
        elif not old and not new:
            return 0.0
        elif not old:
            # went from nothing to something, report as +100%
            return 1.0
        else:
            return (new-old) / old

    def __add__(self, other):
        return self.__class__(self.x + other.x)

    def __sub__(self, other):
        return self.__class__(self.x - other.x)

    def __mul__(self, other):
        return self.__class__(self.x * other.x)

# perf results
class PerfResult(co.namedtuple('PerfResult', [
        'file', 'function', 'line',
        'self_cycles',
        'self_bmisses', 'self_branches',
        'self_cmisses', 'self_caches',
        'cycles',
        'bmisses', 'branches',
        'cmisses', 'caches',
        'children', 'parents'])):
    # One measurement keyed by file/function/line.
    #
    # The self_* fields and their unprefixed counterparts are all Ints,
    # children and parents are sets that are unioned when results are
    # added together.

    # fields that identify a result
    _by = ['file', 'function', 'line']
    # fields that carry measurements, in constructor order
    #
    # NOTE(review): this shadows the _fields attribute namedtuple
    # generates, so namedtuple helpers that consult _fields see only
    # these names -- confirm nothing depends on the full list
    _fields = [
        'self_cycles',
        'self_bmisses', 'self_branches',
        'self_cmisses', 'self_caches',
        'cycles',
        'bmisses', 'branches',
        'cmisses', 'caches']
    # type of each measurement field
    _types = {
        'self_cycles': Int,
        'self_bmisses': Int, 'self_branches': Int,
        'self_cmisses': Int, 'self_caches': Int,
        'cycles': Int,
        'bmisses': Int, 'branches': Int,
        'cmisses': Int, 'caches': Int}

    __slots__ = ()
    def __new__(cls, file='', function='', line=0,
            self_cycles=0,
            self_bmisses=0, self_branches=0,
            self_cmisses=0, self_caches=0,
            cycles=0,
            bmisses=0, branches=0,
            cmisses=0, caches=0,
            children=None, parents=None):
        # children/parents default to None instead of set() so results
        # built with defaults don't all share one mutable set
        #
        # line goes through Int so strings (from csv) are accepted, the
        # measurement fields are all coerced to Int
        return super().__new__(cls, file, function, int(Int(line)),
            Int(self_cycles),
            Int(self_bmisses), Int(self_branches),
            Int(self_cmisses), Int(self_caches),
            Int(cycles),
            Int(bmisses), Int(branches),
            Int(cmisses), Int(caches),
            children if children is not None else set(),
            parents if parents is not None else set())

    # merge two results, keeping self's file/function/line, summing all
    # measurements, and unioning children/parents
    def __add__(self, other):
        return PerfResult(self.file, self.function, self.line,
            self.self_cycles + other.self_cycles,
            self.self_bmisses + other.self_bmisses,
            self.self_branches + other.self_branches,
            self.self_cmisses + other.self_cmisses,
            self.self_caches + other.self_caches,
            self.cycles + other.cycles,
            self.bmisses + other.bmisses,
            self.branches + other.branches,
            self.cmisses + other.cmisses,
            self.caches + other.caches,
            self.children | other.children,
            self.parents | other.parents)


# open a file, with '-' meaning stdin (read modes) or stdout (anything else)
#
# stdin/stdout are dup'd first so closing the returned file, e.g. via a
# with statement, doesn't close the process's real stdin/stdout
def openio(path, mode='r'):
    if path == '-':
        # check for any read mode ('r', 'rb', 'rt', ...), not just the
        # exact string 'r', and pass the requested mode through so
        # binary modes are respected
        if 'r' in mode:
            return os.fdopen(os.dup(sys.stdin.fileno()), mode)
        else:
            return os.fdopen(os.dup(sys.stdout.fileno()), mode)
    else:
        return open(path, mode)

# run perf as a subprocess, storing measurements into a zip file
#
# command is the argv to profile, output is the zip file to append to,
# returns perf's exit code, or errno.EOWNERDEAD if interrupted
def record(command, *,
        output=None,
        perf_freq=PERF_FREQ,
        perf_period=None,
        perf_events=PERF_EVENTS,
        perf_tool=PERF_TOOL,
        **args):
    # we need both something to run and somewhere to put the results
    if not command:
        print('error: no command specified?')
        sys.exit(-1)

    if not output:
        print('error: no output file specified?')
        sys.exit(-1)

    # create a temporary file for perf to write to, as far as I can tell
    # this is strictly needed because perf's pipe-mode only works with stdout
    with tempfile.NamedTemporaryFile('rb') as scratch:
        # an explicit period takes priority over the sampling frequency
        if perf_period is not None:
            sampling = ['-c%s' % perf_period]
        elif perf_freq is not None:
            sampling = ['-F%s' % perf_freq]
        else:
            sampling = []

        # figure out our perf invocation
        perf = perf_tool + ['record'] + sampling + [
            '-B',
            '-g',
            '--all-user',
            '-e%s' % perf_events,
            '-o%s' % scratch.name]

        # run our command
        try:
            if args.get('verbose'):
                print(' '.join(shlex.quote(c) for c in perf + command))
            err = sp.call(perf + command, close_fds=False)

        except KeyboardInterrupt:
            err = errno.EOWNERDEAD

        # synchronize access, the lock is held until the fd is closed
        # at the end of the with statement below
        fd = os.open(output, os.O_RDWR | os.O_CREAT)
        fcntl.flock(fd, fcntl.LOCK_EX)

        # copy measurements into our zip file, one entry per pid
        with os.fdopen(fd, 'r+b') as locked, \
                zipfile.ZipFile(locked, 'a',
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=1) as archive, \
                archive.open('perf.%d' % os.getpid(), 'w') as entry:
            shutil.copyfileobj(scratch, entry)

    # forward the return code
    return err


# run perf script on a single (already decompressed) perf data file and
# accumulate sample periods per (dso, symbol, address)
#
# returns a list of PerfResults where file/function/line hold the raw
# dso/symbol/address, these get mapped to source locations by collect
#
# note depth is accepted but not used here
def collect_decompressed(path, *,
        perf_tool=PERF_TOOL,
        everything=False,
        depth=0,
        **args):
    # sample header lines, one per sample
    sample_pattern = re.compile(
        r'(?P<comm>\w+)'
        r'\s+(?P<pid>\w+)'
        r'\s+(?P<time>[\w.]+):'
        r'\s*(?P<period>\w+)'
        r'\s+(?P<event>[^:]+):')
    # tab-indented stack frame lines following a sample header
    frame_pattern = re.compile(
        r'\s+(?P<addr>\w+)'
        r'\s+(?P<sym>[^\s]+)'
        r'\s+\((?P<dso>[^\)]+)\)')
    # perf event name -> result field name, samples for any other event
    # are ignored
    events = {
        'cycles':           'cycles',
        'branch-misses':    'bmisses',
        'branches':         'branches',
        'cache-misses':     'cmisses',
        'cache-references': 'caches'}

    # note perf_tool may contain extra args
    cmd = perf_tool + [
        'script',
        '-i%s' % path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        stderr=sp.PIPE if not args.get('verbose') else None,
        universal_newlines=True,
        errors='replace',
        close_fds=False)

    # state carried from the most recent sample header
    last_filtered = False
    last_has_frame = False
    last_event = ''
    last_period = 0
    # (dso, sym, addr) -> event -> (self, total)
    results = co.defaultdict(lambda: co.defaultdict(lambda: (0, 0)))

    for line in proc.stdout:
        # we need to process a lot of data, so wait to use regex as late
        # as possible
        if not line:
            continue
        if not line.startswith('\t'):
            match = sample_pattern.match(line)
            if match:
                last_event = match.group('event')
                last_filtered = last_event in events
                last_period = int(match.group('period'), 0)
                last_has_frame = False
        elif last_filtered:
            match = frame_pattern.match(line)
            if match:
                # filter out internal/kernel functions
                if not everything and (
                        match.group('sym').startswith('__')
                        or match.group('dso').startswith('/usr/lib')
                        or not match.group('sym')[:1].isalpha()):
                    continue

                name = (
                    match.group('dso'),
                    match.group('sym'),
                    int(match.group('addr'), 16))
                self_, total = results[name][last_event]
                if not last_has_frame:
                    # first frame we keep for this sample, so it gets
                    # the self cost, note filtered frames don't count
                    results[name][last_event] = (
                        self_ + last_period,
                        total + last_period)
                    last_has_frame = True
                else:
                    # callers only accumulate total cost
                    results[name][last_event] = (
                        self_,
                        total + last_period)

    proc.wait()
    if proc.returncode != 0:
        # if we captured stderr, show it before bailing
        if not args.get('verbose'):
            for line in proc.stderr:
                sys.stdout.write(line)
        sys.exit(-1)

    # rearrange results into result type
    results_ = []
    for name, r in results.items():
        results_.append(PerfResult(*name,
            **{'self_'+events[e]: s for e, (s, _) in r.items()},
            **{        events[e]: t for e, (_, t) in r.items()}))
    results = results_

    return results

# collect results from entry i of the zip file at path, extra keyword
# args are forwarded to collect_decompressed
def collect_job(path, i, **args):
    # decompress into a temporary file, this is to work around
    # some limitations of perf
    with zipfile.ZipFile(path) as archive, \
            archive.open(i) as src, \
            tempfile.NamedTemporaryFile('wb') as scratch:
        shutil.copyfileobj(src, scratch)
        # flush before the file is reopened by name
        scratch.flush()

        return collect_decompressed(scratch.name, **args)

# call a function packed as a (function, args, kwargs) triple, this
# lets a single-argument map (see collect) apply arbitrary calls
def starapply(args):
    func, positional, keywords = args
    return func(*positional, **keywords)

# collect PerfResults from a set of .perf zip files
#
# each zip entry is parsed in a worker process (collect_job), then the
# raw dso/symbol/address results are mapped to file+line using objdump's
# symbol table and dwarf line info
#
# sources, if given, limits results to exactly those source files,
# otherwise only files under the cwd are kept unless everything is set
def collect(paths, *,
        jobs=None,
        objdump_tool=None,
        sources=None,
        everything=False,
        **args):
    # fall back to the default objdump, otherwise the list concatenations
    # below would fail on None
    if objdump_tool is None:
        objdump_tool = OBJDUMP_TOOL

    symbol_pattern = re.compile(
        r'^(?P<addr>[0-9a-fA-F]+)\s.*\s(?P<name>[^\s]+)\s*$')
    line_pattern = re.compile(
        r'^\s+(?:'
            # matches dir/file table
            r'(?P<no>[0-9]+)\s+'
                r'(?:(?P<dir>[0-9]+)\s+)?'
                r'.*\s+'
                r'(?P<path>[^\s]+)'
            # matches line opcodes
            r'|' r'\[[^\]]*\]\s+'
                r'(?:'
                    r'(?P<op_special>Special)'
                    r'|' r'(?P<op_copy>Copy)'
                    r'|' r'(?P<op_end>End of Sequence)'
                    r'|' r'File .*?to (?:entry )?(?P<op_file>\d+)'
                    r'|' r'Line .*?to (?P<op_line>[0-9]+)'
                    r'|' r'(?:Address|PC) .*?to (?P<op_addr>[0x0-9a-fA-F]+)'
                    r'|' r'.' r')*'
            r')$', re.IGNORECASE)

    records = []
    for path in paths:
        # each .perf file is actually a zip file containing perf files from
        # multiple runs
        with zipfile.ZipFile(path) as z:
            records.extend((path, i) for i in z.infolist())

    # we're dealing with a lot of data but also surprisingly
    # parallelizable
    #
    # dsos caches the per-dso (syms, sym_at, line_at) tables
    dsos = {}
    results = []
    with mp.Pool(jobs or len(os.sched_getaffinity(0))) as p:
        for results_ in p.imap_unordered(
                starapply,
                ((collect_job, (path, i), dict(
                    everything=everything,
                    **args))
                    for path, i in records)):

            # organize by dso
            results__ = {}
            for r in results_:
                if r.file not in results__:
                    results__[r.file] = []
                results__[r.file].append(r)
            results_ = results__

            for dso, results_ in results_.items():
                if dso not in dsos:
                    # find file+line ranges for dsos
                    #
                    # do this here so we only process each dso once
                    syms = {}
                    sym_at = []
                    cmd = objdump_tool + ['-t', dso]
                    if args.get('verbose'):
                        print(' '.join(shlex.quote(c) for c in cmd))
                    proc = sp.Popen(cmd,
                        stdout=sp.PIPE,
                        stderr=sp.PIPE if not args.get('verbose') else None,
                        universal_newlines=True,
                        errors='replace',
                        close_fds=False)
                    for line in proc.stdout:
                        m = symbol_pattern.match(line)
                        if m:
                            name = m.group('name')
                            addr = int(m.group('addr'), 16)
                            # note multiple symbols can share a name
                            if name not in syms:
                                syms[name] = set()
                            syms[name].add(addr)
                            sym_at.append((addr, name))
                    proc.wait()
                    if proc.returncode != 0:
                        if not args.get('verbose'):
                            for line in proc.stderr:
                                sys.stdout.write(line)
                        # assume no debug-info on failure
                        pass

                    # sort and keep first when duplicates
                    sym_at.sort()
                    sym_at_ = []
                    for addr, name in sym_at:
                        if len(sym_at_) == 0 or sym_at_[-1][0] != addr:
                            sym_at_.append((addr, name))
                    sym_at = sym_at_

                    # state machine for dwarf line numbers, note that objdump's
                    # decodedline seems to have issues with multiple dir/file
                    # tables, which is why we need this
                    line_at = []
                    dirs = {}
                    files = {}
                    op_file = 1
                    op_line = 1
                    op_addr = 0
                    cmd = objdump_tool + ['--dwarf=rawline', dso]
                    if args.get('verbose'):
                        print(' '.join(shlex.quote(c) for c in cmd))
                    proc = sp.Popen(cmd,
                        stdout=sp.PIPE,
                        stderr=sp.PIPE if not args.get('verbose') else None,
                        universal_newlines=True,
                        errors='replace',
                        close_fds=False)
                    for line in proc.stdout:
                        m = line_pattern.match(line)
                        if m:
                            if m.group('no') and not m.group('dir'):
                                # found a directory entry
                                dirs[int(m.group('no'))] = m.group('path')
                            elif m.group('no'):
                                # found a file entry
                                dir = int(m.group('dir'))
                                if dir in dirs:
                                    files[int(m.group('no'))] = os.path.join(
                                        dirs[dir],
                                        m.group('path'))
                                else:
                                    files[int(m.group('no'))] = m.group('path')
                            else:
                                # found a state machine update
                                if m.group('op_file'):
                                    op_file = int(m.group('op_file'), 0)
                                if m.group('op_line'):
                                    op_line = int(m.group('op_line'), 0)
                                if m.group('op_addr'):
                                    op_addr = int(m.group('op_addr'), 0)

                                # these opcodes emit a row in the line table
                                if (m.group('op_special')
                                        or m.group('op_copy')
                                        or m.group('op_end')):
                                    line_at.append((
                                        op_addr,
                                        files.get(op_file, '?'),
                                        op_line))

                                # end of sequence resets the state machine
                                if m.group('op_end'):
                                    op_file = 1
                                    op_line = 1
                                    op_addr = 0
                    proc.wait()
                    if proc.returncode != 0:
                        if not args.get('verbose'):
                            for line in proc.stderr:
                                sys.stdout.write(line)
                        # assume no debug-info on failure
                        pass

                    # sort and keep first when duplicates
                    #
                    # I think dwarf requires this to be sorted but just in case
                    line_at.sort()
                    line_at_ = []
                    for addr, file, line in line_at:
                        if len(line_at_) == 0 or line_at_[-1][0] != addr:
                            line_at_.append((addr, file, line))
                    line_at = line_at_

                    # discard lines outside of the range of the containing
                    # function, these are introduced by dwarf for inlined
                    # functions but don't map to elf-level symbols
                    sym_at_ = []
                    for addr, sym in sym_at:
                        i = bisect.bisect(line_at, addr, key=lambda x: x[0])
                        if i > 0:
                            _, file, line = line_at[i-1]
                            sym_at_.append((file, line, sym))
                    sym_at_.sort()

                    line_at_ = []
                    for addr, file, line in line_at:
                        # only keep if sym-at-addr and sym-at-line match
                        i = bisect.bisect(
                            sym_at, addr, key=lambda x: x[0])
                        j = bisect.bisect(
                            sym_at_, (file, line), key=lambda x: (x[0], x[1]))
                        if i > 0 and j > 0 and (
                                sym_at[i-1][1] == sym_at_[j-1][2]):
                            line_at_.append((addr, file, line))
                    line_at = line_at_

                    dsos[dso] = (syms, sym_at, line_at)

                syms, _, line_at = dsos[dso]

                # first try to reverse ASLR
                #
                # for a result r and candidate offset d, yield the
                # difference between each known address of r's symbol and
                # r's shifted symbol base
                def deltas(r, d):
                    if '+' in r.function:
                        sym, off = r.function.split('+', 1)
                        off = int(off, 0)
                    else:
                        sym, off = r.function, 0
                    addr = r.line - off + d

                    for addr_ in syms.get(sym, []):
                        yield addr_ - addr

                # NOTE(review): the key below is a signed sum, not a sum
                # of absolute errors, so this appears to favor the largest
                # candidate delta -- confirm this is intended
                delta = min(
                    it.chain.from_iterable(
                        deltas(r, 0) for r in results_),
                    key=lambda d: sum(it.chain.from_iterable(
                        deltas(r, d) for r in results_)),
                    default=0)

                # then try to map addrs -> file+line
                for r in results_:
                    addr = r.line + delta
                    i = bisect.bisect(line_at, addr, key=lambda x: x[0])
                    if i > 0:
                        _, file, line = line_at[i-1]
                    else:
                        # no line info, guess a .c file from the dso name
                        file, line = re.sub(
                            r'(\.o)?$', '.c', r.file, count=1), 0

                    # ignore filtered sources
                    if sources is not None:
                        if not any(
                                os.path.abspath(file) == os.path.abspath(s)
                                for s in sources):
                            continue
                    else:
                        # default to only cwd
                        if not everything and not os.path.commonpath([
                                os.getcwd(),
                                os.path.abspath(file)]) == os.getcwd():
                            continue

                    # simplify path
                    if os.path.commonpath([
                            os.getcwd(),
                            os.path.abspath(file)]) == os.getcwd():
                        file = os.path.relpath(file)
                    else:
                        file = os.path.abspath(file)

                    # strip any +offset from the symbol name
                    function, *_ = r.function.split('+', 1)
                    results.append(PerfResult(file, function, line,
                        **{k: getattr(r, k) for k in PerfResult._fields}))

    return results


# merge results that share the same values for the by fields
#
# by defaults to Result._by, defines is an optional list of
# (field, allowed-values) pairs used to filter results first, merged
# results come back in first-seen order
def fold(Result, results, *,
        by=None,
        defines=None,
        **_):
    if by is None:
        by = Result._by

    # every field we group or filter on needs to exist
    requested = it.chain(by or [], (k for k, _ in defines or []))
    for field in requested:
        if not (field in Result._by or field in Result._fields):
            print("error: could not find field %r?" % field)
            sys.exit(-1)

    # filter by matching defines
    if defines is not None:
        results = [
            r for r in results
            if all(getattr(r, k) in vs for k, vs in defines)]

    # organize results into conflicts, note dicts keep insertion order
    buckets = {}
    for r in results:
        key = tuple(getattr(r, k) for k in by)
        buckets.setdefault(key, []).append(r)

    # merge conflicts, left to right
    folded = []
    for bucket in buckets.values():
        merged = bucket[0]
        for r in bucket[1:]:
            merged = merged + r
        folded.append(merged)

    return folded

# print a table of results, optionally diffed against diff_results
#
# by selects the fields that name each row, fields the columns to show,
# sort is a list of (field, reverse) pairs, summary limits output to the
# TOTAL row, all includes unchanged rows in a diff, and percent shows
# only the new values plus percentage changes
def table(Result, results, diff_results=None, *,
        by=None,
        fields=None,
        sort=None,
        summary=False,
        all=False,
        percent=False,
        **_):
    # the all parameter shadows the builtin, so grab the real one, note
    # we go through the builtins module since __builtins__ is a dict, not
    # a module, when this file is imported instead of run
    import builtins
    all_, all = all, builtins.all

    if by is None:
        by = Result._by
    if fields is None:
        fields = Result._fields
    types = Result._types

    # fold again
    results = fold(Result, results, by=by)
    if diff_results is not None:
        diff_results = fold(Result, diff_results, by=by)

    # organize by name
    table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in results}
    diff_table = {
        ','.join(str(getattr(r, k) or '') for k in by): r
        for r in diff_results or []}
    names = list(table.keys() | diff_table.keys())

    # sort again, now with diff info, note that python's sort is stable
    names.sort()
    if diff_results is not None:
        names.sort(key=lambda n: tuple(
            types[k].ratio(
                getattr(table.get(n), k, None),
                getattr(diff_table.get(n), k, None))
            for k in fields),
            reverse=True)
    if sort:
        for k, reverse in reversed(sort):
            names.sort(key=lambda n: (getattr(table[n], k),)
                if getattr(table.get(n), k, None) is not None else (),
                reverse=reverse ^ (not k or k in Result._fields))

    # render a ratio as a signed percentage
    def ratio_str(t):
        if t == +m.inf:
            return '+∞%'
        elif t == -m.inf:
            return '-∞%'
        else:
            return '%+.1f%%' % (100*t)

    # render one field of a possibly-missing result as a table cell
    def cell(r, k, diff=False):
        if getattr(r, k, None) is None:
            return types[k].diff_none if diff else types[k].none
        elif diff:
            return getattr(r, k).diff_table()
        else:
            return getattr(r, k).table()

    # build one row, shared by the per-name entries and the TOTAL row
    def table_entry(name, r, diff_r=None, ratios=()):
        entry = [name]
        if diff_results is None:
            # no diff, just the values
            for k in fields:
                entry.append(cell(r, k))
            entry.append('')
        elif percent:
            # new values followed by every ratio
            for k in fields:
                entry.append(cell(r, k, diff=True))
            entry.append(' (%s)' % ', '.join(
                ratio_str(t) for t in ratios))
        else:
            # old, new, and delta columns followed by non-zero ratios
            for k in fields:
                entry.append(cell(diff_r, k, diff=True))
            for k in fields:
                entry.append(cell(r, k, diff=True))
            for k in fields:
                entry.append(types[k].diff_diff(
                        getattr(r, k, None),
                        getattr(diff_r, k, None)))
            entry.append(' (%s)' % ', '.join(
                    ratio_str(t) for t in ratios if t)
                if any(ratios) else '')
        return entry

    # build up our lines
    lines = []

    # header
    line = []
    line.append('%s%s' % (
        ','.join(by),
        ' (%d added, %d removed)' % (
            sum(1 for n in table if n not in diff_table),
            sum(1 for n in diff_table if n not in table))
            if diff_results is not None and not percent else '')
        if not summary else '')
    if diff_results is None:
        for k in fields:
            line.append(k)
    elif percent:
        for k in fields:
            line.append(k)
    else:
        for k in fields:
            line.append('o'+k)
        for k in fields:
            line.append('n'+k)
        for k in fields:
            line.append('d'+k)
    line.append('')
    lines.append(line)

    # entries
    if not summary:
        for name in names:
            r = table.get(name)
            if diff_results is not None:
                diff_r = diff_table.get(name)
                ratios = [
                    types[k].ratio(
                        getattr(r, k, None),
                        getattr(diff_r, k, None))
                    for k in fields]
                # hide unchanged entries unless asked for everything
                if not any(ratios) and not all_:
                    continue
            else:
                diff_r, ratios = None, []

            lines.append(table_entry(name, r, diff_r, ratios))

    # total
    r = next(iter(fold(Result, results, by=[])), None)
    if diff_results is not None:
        diff_r = next(iter(fold(Result, diff_results, by=[])), None)
        ratios = [
            types[k].ratio(
                getattr(r, k, None),
                getattr(diff_r, k, None))
            for k in fields]
    else:
        diff_r, ratios = None, []

    lines.append(table_entry('TOTAL', r, diff_r, ratios))

    # find the best widths, note that column 0 contains the names and column -1
    # the ratios, so those are handled a bit differently
    widths = [
        ((max(it.chain([w], (len(l[i]) for l in lines)))+1+4-1)//4)*4-1
        for w, i in zip(
            it.chain([23], it.repeat(7)),
            range(len(lines[0])-1))]

    # print our table
    for line in lines:
        print('%-*s  %s%s' % (
            widths[0], line[0],
            ' '.join('%*s' % (w, x)
                for w, x in zip(widths[1:], line[1:-1])),
            line[-1]))


# print source files annotated with per-line measurements
#
# if annotate is set every line of every file is printed, otherwise only
# lines within args['context'] lines of a line at or above the lower
# threshold, threshold is None, one value, or a pair of fractions of the
# hottest line, branches/caches switch from cycles to branch/cache
# counts, args must also provide 'width' and 'color'
def annotate(Result, results, *,
        annotate=None,
        threshold=None,
        branches=False,
        caches=False,
        **args):
    # figure out the threshold
    if threshold is None:
        t0, t1 = THRESHOLD
    elif len(threshold) == 1:
        t0, t1 = threshold[0], threshold[0]
    else:
        t0, t1 = threshold
    t0, t1 = min(t0, t1), max(t0, t1)

    # figure out which field the thresholds apply to
    if not branches and not caches:
        tk = 'self_cycles'
    elif branches:
        tk = 'self_bmisses'
    else:
        tk = 'self_cmisses'

    # find max cycles
    max_ = max(it.chain((float(getattr(r, tk)) for r in results), [1]))

    # files to annotate, in first-seen order
    paths = list(co.OrderedDict.fromkeys(r.file for r in results))

    # flatten to line info, this doesn't depend on the path so only do
    # it once
    results = fold(Result, results, by=['file', 'line'])

    for path in paths:
        table = {r.line: r for r in results if r.file == path}

        # calculate spans to show
        spans = []
        if not annotate:
            last = None
            func = None
            for line, r in sorted(table.items()):
                if float(getattr(r, tk)) / max_ >= t0:
                    if last is not None and line - last.stop <= args['context']:
                        # close enough to the previous span, extend it
                        last = range(
                            last.start,
                            line+1+args['context'])
                    else:
                        if last is not None:
                            spans.append((last, func))
                        last = range(
                            line-args['context'],
                            line+1+args['context'])
                        func = r.function
            if last is not None:
                spans.append((last, func))

        with open(path) as f:
            skipped = False
            for i, line in enumerate(f):
                # skip lines not in spans?
                if not annotate and not any(i+1 in s for s, _ in spans):
                    skipped = True
                    continue

                if skipped:
                    skipped = False
                    # hunk header, show the function of the span we're
                    # entering, not just the first span in the file
                    print('%s@@ %s:%d: %s @@%s' % (
                        '\x1b[36m' if args['color'] else '',
                        path,
                        i+1,
                        next((f_ for s, f_ in spans if i+1 in s), None),
                        '\x1b[m' if args['color'] else ''))

                # build line
                if line.endswith('\n'):
                    line = line[:-1]

                r = table.get(i+1)
                if r is not None:
                    # only annotate lines with something to report
                    if not branches and not caches:
                        hit = float(r.self_cycles) > 0
                        note = '%s cycles' % r.self_cycles
                    elif branches:
                        hit = (float(r.self_bmisses) > 0
                            or float(r.self_branches) > 0)
                        note = '%s bmisses, %s branches' % (
                            r.self_bmisses, r.self_branches)
                    else:
                        hit = (float(r.self_cmisses) > 0
                            or float(r.self_caches) > 0)
                        note = '%s cmisses, %s caches' % (
                            r.self_cmisses, r.self_caches)

                    if hit:
                        line = '%-*s // %s' % (
                            args['width'],
                            line,
                            note)

                        if args['color']:
                            if float(getattr(r, tk)) / max_ >= t1:
                                line = '\x1b[1;31m%s\x1b[m' % line
                            elif float(getattr(r, tk)) / max_ >= t0:
                                line = '\x1b[35m%s\x1b[m' % line

                print(line)


def _load_perf_csv(path):
    """Load PerfResults from a CSV file in the format written by report.

    Grouping columns are read under their plain names (PerfResult._by) and
    measurement columns under a 'perf_' prefix (PerfResult._fields). Empty
    or missing cells are left out so PerfResult's own defaults apply. Rows
    for which PerfResult raises TypeError are skipped.
    """
    results = []
    with openio(path) as f:
        reader = csv.DictReader(f, restval='')
        for r in reader:
            try:
                results.append(PerfResult(
                    **{k: r[k] for k in PerfResult._by
                        if k in r and r[k].strip()},
                    **{k: r['perf_'+k] for k in PerfResult._fields
                        if 'perf_'+k in r and r['perf_'+k].strip()}))
            except TypeError:
                # best-effort: ignore rows that don't form a PerfResult
                pass
    return results


def report(perf_paths, *,
        by=None,
        fields=None,
        defines=None,
        sort=None,
        self=False,
        branches=False,
        caches=False,
        tree=False,
        depth=None,
        **args):
    """Aggregate perf results and print/export them.

    perf_paths -- paths to *.perf files; a directory is expanded to
        '<dir>/*.perf' and every entry is passed through glob.
    by -- fields to group results by (forwarded to fold). The table
        defaults to ['function'], the CSV output to PerfResult._by.
    fields -- fields to show in the table. Defaults to cycles, or
        bmisses/branches with branches, or cmisses/caches with caches,
        each prefixed with 'self_' when self is set.
    defines -- forwarded to fold to filter results.
    sort -- list of (field, reverse) pairs, applied from last to first
        so the first pair ends up as the primary sort key.
    tree, depth -- depth=None means infinite depth when tree is set and
        1 otherwise; depth=0 is shorthand for infinite depth. The
        resolved value is forwarded as args['depth'].
    args -- remaining options. This function reads 'color', 'use',
        'output', 'diff', 'quiet', 'annotate' and 'threshold'; everything
        is also forwarded to collect/annotate/table.

    Exits the process with -1 if no .perf files are found.
    """
    # sys is otherwise only imported under the __main__ guard, import it
    # here so report also works when this file is used as a module
    import sys

    # figure out what color should be
    if args.get('color') == 'auto':
        args['color'] = sys.stdout.isatty()
    elif args.get('color') == 'always':
        args['color'] = True
    else:
        args['color'] = False

    # it doesn't really make sense to not have a depth with tree,
    # so assume depth=inf if tree by default
    #
    # note depth is a named parameter, so it never shows up in args, we
    # need to look at the parameter itself here
    if depth is None:
        depth = m.inf if tree else 1
    elif depth == 0:
        depth = m.inf
    args['depth'] = depth
    # NOTE(review): tree is consumed here and not forwarded to
    # collect/annotate/table -- confirm none of them need it

    # find results
    if not args.get('use', None):
        # find .perf files
        paths = []
        for path in perf_paths:
            if os.path.isdir(path):
                path = path + '/*.perf'

            paths.extend(glob.glob(path))

        if not paths:
            print("error: no .perf files found in %r?" % perf_paths)
            sys.exit(-1)

        results = collect(paths, **args)
    else:
        results = _load_perf_csv(args['use'])

    # fold
    results = fold(PerfResult, results, by=by, defines=defines)

    # sort, note that python's sort is stable
    results.sort()
    if sort:
        for k, reverse in reversed(sort):
            results.sort(key=lambda r: (getattr(r, k),)
                if getattr(r, k) is not None else (),
                reverse=reverse ^ (not k or k in PerfResult._fields))

    # write results to CSV
    if args.get('output'):
        csv_by = by if by is not None else PerfResult._by
        with openio(args['output'], 'w') as f:
            writer = csv.DictWriter(f,
                csv_by + ['perf_'+k for k in PerfResult._fields])
            writer.writeheader()
            for r in results:
                writer.writerow(
                    {k: getattr(r, k) for k in csv_by}
                    | {'perf_'+k: getattr(r, k)
                        for k in PerfResult._fields})

    # find previous results?
    if args.get('diff'):
        try:
            diff_results = _load_perf_csv(args['diff'])
        except FileNotFoundError:
            # a missing diff file just means there is nothing to diff against
            diff_results = []

        # fold
        diff_results = fold(PerfResult, diff_results, by=by, defines=defines)

    # print table
    if not args.get('quiet'):
        if args.get('annotate') or args.get('threshold'):
            # annotate sources
            annotate(PerfResult, results,
                branches=branches,
                caches=caches,
                **args)
        else:
            # print table
            table(PerfResult, results,
                diff_results if args.get('diff') else None,
                by=by if by is not None else ['function'],
                fields=fields if fields is not None else [
                    'self_'+k if self else k
                    for k in (
                        ['cycles'] if not branches and not caches
                        else ['bmisses', 'branches'] if branches
                        else ['cmisses', 'caches'])],
                sort=sort,
                **args)


def main(**args):
    # -R/--record selects recording new perf data, otherwise we report
    # on existing data; either way all options are passed straight through
    handler = record if args.get('record') else report
    return handler(**args)


if __name__ == "__main__":
    import argparse
    import sys

    # bit of a hack, but parse_intermixed_args and REMAINDER are
    # incompatible, so we need to figure out what we want before running
    # argparse
    if '-R' in sys.argv or '--record' in sys.argv:
        nargs = argparse.REMAINDER
    else:
        nargs = '*'

    # register an 'ignore' conflict handler so the same flag (-o, --perf-tool)
    # can be declared in both the report options and the record options
    argparse.ArgumentParser._handle_conflict_ignore = lambda *_: None
    argparse._ArgumentGroup._handle_conflict_ignore = lambda *_: None
    parser = argparse.ArgumentParser(
        description="Aggregate and report Linux perf results.",
        allow_abbrev=False,
        conflict_handler='ignore')
    parser.add_argument(
        'perf_paths',
        nargs=nargs,
        help="Description of where to find *.perf files. May be a directory "
            "or a list of paths. Defaults to %r." % PERF_PATHS)
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Output commands that run behind the scenes.")
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help="Don't show anything, useful with -o.")
    parser.add_argument(
        '-o', '--output',
        help="Specify CSV file to store results.")
    parser.add_argument(
        '-u', '--use',
        help="Don't parse anything, use this CSV file.")
    parser.add_argument(
        '-d', '--diff',
        help="Specify CSV file to diff against.")
    parser.add_argument(
        '-a', '--all',
        action='store_true',
        help="Show all, not just the ones that changed.")
    parser.add_argument(
        '-p', '--percent',
        action='store_true',
        help="Only show percentage change, not a full diff.")
    parser.add_argument(
        '-b', '--by',
        action='append',
        choices=PerfResult._by,
        help="Group by this field.")
    parser.add_argument(
        '-f', '--field',
        dest='fields',
        action='append',
        choices=PerfResult._fields,
        help="Show this field.")
    parser.add_argument(
        '-D', '--define',
        dest='defines',
        action='append',
        # parses key=v1,v2,... into (key, {v1, v2, ...})
        type=lambda x: (lambda k,v: (k, set(v.split(','))))(*x.split('=', 1)),
        help="Only include results where this field is this value.")
    # -s and -S share one ordered list of (field, reverse) pairs so that
    # the relative order of the flags on the command line is preserved
    class AppendSort(argparse.Action):
        def __call__(self, parser, namespace, value, option):
            if namespace.sort is None:
                namespace.sort = []
            # option is the flag as spelled on the command line, so the
            # long form needs to be matched as well as the short form
            namespace.sort.append(
                (value, option in ('-S', '--reverse-sort')))
    parser.add_argument(
        '-s', '--sort',
        action=AppendSort,
        help="Sort by this field.")
    parser.add_argument(
        '-S', '--reverse-sort',
        action=AppendSort,
        help="Sort by this field, but backwards.")
    parser.add_argument(
        '-Y', '--summary',
        action='store_true',
        help="Only show the total.")
    parser.add_argument(
        '-F', '--source',
        dest='sources',
        action='append',
        help="Only consider definitions in this file. Defaults to anything "
            "in the current directory.")
    parser.add_argument(
        '--everything',
        action='store_true',
        help="Include builtin and libc specific symbols.")
    parser.add_argument(
        '--self',
        action='store_true',
        help="Show samples before propagation up the call-chain.")
    parser.add_argument(
        '--branches',
        action='store_true',
        help="Show branches and branch misses.")
    parser.add_argument(
        '--caches',
        action='store_true',
        help="Show cache accesses and cache misses.")
    parser.add_argument(
        '-A', '--annotate',
        action='store_true',
        help="Show source files annotated with perf info.")
    parser.add_argument(
        '-T', '--threshold',
        nargs='?',
        # parses t0,t1 into a tuple of floats, bare -T uses THRESHOLD
        type=lambda x: tuple(float(x) for x in x.split(',')),
        const=THRESHOLD,
        help="Show lines with samples above this threshold as a percent of "
            "all lines. Defaults to %s." % ','.join(str(t) for t in THRESHOLD))
    parser.add_argument(
        '-c', '--context',
        type=lambda x: int(x, 0),
        default=3,
        help="Show n additional lines of context. Defaults to 3.")
    parser.add_argument(
        '-W', '--width',
        type=lambda x: int(x, 0),
        default=80,
        help="Assume source is styled with this many columns. Defaults to 80.")
    parser.add_argument(
        '--color',
        choices=['never', 'always', 'auto'],
        default='auto',
        help="When to use terminal colors. Defaults to 'auto'.")
    parser.add_argument(
        '-j', '--jobs',
        nargs='?',
        type=lambda x: int(x, 0),
        const=0,
        help="Number of processes to use. 0 spawns one process per core.")
    parser.add_argument(
        '--perf-tool',
        type=lambda x: x.split(),
        help="Path to the perf tool to use. Defaults to %r." % PERF_TOOL)
    parser.add_argument(
        '--objdump-tool',
        type=lambda x: x.split(),
        default=OBJDUMP_TOOL,
        help="Path to the objdump tool to use. Defaults to %r." % OBJDUMP_TOOL)

    # record flags
    record_parser = parser.add_argument_group('record options')
    record_parser.add_argument(
        'command',
        nargs=nargs,
        help="Command to run.")
    record_parser.add_argument(
        '-R', '--record',
        action='store_true',
        help="Run a command and aggregate perf measurements.")
    record_parser.add_argument(
        '-o', '--output',
        help="Output file. Uses flock to synchronize. This is stored as a "
            "zip-file of multiple perf results.")
    record_parser.add_argument(
        '--perf-freq',
        help="perf sampling frequency. This is passed directly to perf. "
            "Defaults to %r." % PERF_FREQ)
    record_parser.add_argument(
        '--perf-period',
        help="perf sampling period. This is passed directly to perf.")
    record_parser.add_argument(
        '--perf-events',
        help="perf events to record. This is passed directly to perf. "
            "Defaults to %r." % PERF_EVENTS)
    record_parser.add_argument(
        '--perf-tool',
        type=lambda x: x.split(),
        help="Path to the perf tool to use. Defaults to %r." % PERF_TOOL)

    # avoid intermixed/REMAINDER conflict, see above
    if nargs == argparse.REMAINDER:
        args = parser.parse_args()
    else:
        args = parser.parse_intermixed_args()

    # perf_paths/command overlap, so need to do some munging here
    args.command = args.perf_paths
    args.perf_paths = args.perf_paths or PERF_PATHS

    # drop unset options so the keyword defaults of record/report apply
    sys.exit(main(**{k: v
        for k, v in vars(args).items()
        if v is not None}))