littlefs/scripts/perf.py

#!/usr/bin/env python3
#
# Script to aggregate and report Linux perf results.
#
# Example:
# ./scripts/perf.py --record -obench.perf ./runners/bench_runner
# ./scripts/perf.py bench.perf -j -Flfs.c -Flfs_util.c -Scycles
#
# Copyright (c) 2022, The littlefs authors.
# SPDX-License-Identifier: BSD-3-Clause
#

# prevent local imports
if __name__ == "__main__":
    __import__('sys').path.pop(0)

import bisect
import collections as co
import csv
import errno
import fcntl
import fnmatch
import functools as ft
import io
import itertools as it
import math as mt
import multiprocessing as mp
import os
import re
import shlex
import shutil
import subprocess as sp
import sys
import tempfile
import zipfile


# TODO support non-zip perf results?

PERF_PATH = ['perf']
PERF_EVENTS = 'cycles,branch-misses,branches,cache-misses,cache-references'
PERF_FREQ = 100
OBJDUMP_PATH = ['objdump']
THRESHOLD = (0.5, 0.85)


# integer fields
class CsvInt(co.namedtuple('CsvInt', 'a')):
    __slots__ = ()
    def __new__(cls, a=0):
        if isinstance(a, CsvInt):
            return a
        if isinstance(a, str):
            try:
                a = int(a, 0)
            except ValueError:
                # also accept +-∞ and +-inf
                if re.match('^\s*\+?\s*(?:∞|inf)\s*$', a):
                    a = mt.inf
                elif re.match('^\s*-\s*(?:∞|inf)\s*$', a):
                    a = -mt.inf
                else:
                    raise
        if not (isinstance(a, int) or mt.isinf(a)):
            a = int(a)
        return super().__new__(cls, a)

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.a)

    def __str__(self):
        if self.a == mt.inf:
            return '∞'
        elif self.a == -mt.inf:
            return '-∞'
        else:
            return str(self.a)

    def __csv__(self):
        if self.a == mt.inf:
            return 'inf'
        elif self.a == -mt.inf:
            return '-inf'
        else:
            return repr(self.a)

    def __bool__(self):
        return bool(self.a)

    def __int__(self):
        assert not mt.isinf(self.a)
        return self.a

    def __float__(self):
        return float(self.a)

    none = '%7s' % '-'
    def table(self):
        return '%7s' % (self,)

    def diff(self, other):
        new = self.a if self else 0
        old = other.a if other else 0
        diff = new - old
        if diff == +mt.inf:
            return '%7s' % '+∞'
        elif diff == -mt.inf:
            return '%7s' % '-∞'
        else:
            return '%+7d' % diff

    def ratio(self, other):
        new = self.a if self else 0
        old = other.a if other else 0
        if mt.isinf(new) and mt.isinf(old):
            return 0.0
        elif mt.isinf(new):
            return +mt.inf
        elif mt.isinf(old):
            return -mt.inf
        elif not old and not new:
            return 0.0
        elif not old:
            return +mt.inf
        else:
            return (new-old) / old

    def __pos__(self):
        return self.__class__(+self.a)

    def __neg__(self):
        return self.__class__(-self.a)

    def __abs__(self):
        return self.__class__(abs(self.a))

    def __add__(self, other):
        return self.__class__(self.a + other.a)

    def __sub__(self, other):
        return self.__class__(self.a - other.a)

    def __mul__(self, other):
        return self.__class__(self.a * other.a)

    def __truediv__(self, other):
        if not other:
            if self >= self.__class__(0):
                return self.__class__(+mt.inf)
            else:
                return self.__class__(-mt.inf)
        return self.__class__(self.a // other.a)

    def __mod__(self, other):
        return self.__class__(self.a % other.a)

# perf results
class PerfResult(co.namedtuple('PerfResult', [
        'z', 'file', 'function', 'line',
        'cycles', 'bmisses', 'branches', 'cmisses', 'caches',
        'children'])):
    _prefix = 'perf'
    _by = ['z', 'file', 'function', 'line']
    _fields = ['cycles', 'bmisses', 'branches', 'cmisses', 'caches']
    _sort = ['cycles', 'bmisses', 'cmisses', 'branches', 'caches']
    _types = {
            'cycles': CsvInt,
            'bmisses': CsvInt, 'branches': CsvInt,
            'cmisses': CsvInt, 'caches': CsvInt}
    _children = 'children'

    __slots__ = ()
    def __new__(cls, z=0, file='', function='', line=0,
            cycles=0, bmisses=0, branches=0, cmisses=0, caches=0,
            children=None):
        return super().__new__(cls, z, file, function, int(CsvInt(line)),
                CsvInt(cycles),
                CsvInt(bmisses), CsvInt(branches),
                CsvInt(cmisses), CsvInt(caches),
                children if children is not None else [])

    def __add__(self, other):
        return PerfResult(self.z, self.file, self.function, self.line,
                self.cycles + other.cycles,
                self.bmisses + other.bmisses,
                self.branches + other.branches,
                self.cmisses + other.cmisses,
                self.caches + other.caches,
                self.children + other.children)


# open with '-' for stdin/stdout
def openio(path, mode='r', buffering=-1):
    import os
    if path == '-':
        if 'r' in mode:
            return os.fdopen(os.dup(sys.stdin.fileno()), mode, buffering)
        else:
            return os.fdopen(os.dup(sys.stdout.fileno()), mode, buffering)
    else:
        return open(path, mode, buffering)

# run perf as a subprocess, storing measurements into a zip file
def record(command, *,
        output=None,
        perf_freq=PERF_FREQ,
        perf_period=None,
        perf_events=PERF_EVENTS,
        perf_path=PERF_PATH,
        **args):
    # create a temporary file for perf to write to, as far as I can tell
    # this is strictly needed because perf's pipe-mode only works with stdout
    with tempfile.NamedTemporaryFile('rb') as f:
        # figure out our perf invocation
        perf = perf_path + list(filter(None, [
                'record',
                '-F%s' % perf_freq
                    if perf_freq is not None
                    and perf_period is None else None,
                '-c%s' % perf_period
                    if perf_period is not None else None,
                '-B',
                '-g',
                '--all-user',
                '-e%s' % perf_events,
                '-o%s' % f.name]))

        # run our command
        try:
            if args.get('verbose'):
                print(' '.join(shlex.quote(c) for c in perf + command))
            err = sp.call(perf + command, close_fds=False)

        except KeyboardInterrupt:
            err = errno.EOWNERDEAD

        # synchronize access
        z = os.open(output, os.O_RDWR | os.O_CREAT)
        fcntl.flock(z, fcntl.LOCK_EX)

        # copy measurements into our zip file
        with os.fdopen(z, 'r+b') as z:
            with zipfile.ZipFile(z, 'a',
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=1) as z:
                with z.open('perf.%d' % os.getpid(), 'w') as g:
                    shutil.copyfileobj(f, g)

    # forward the return code
    return err


# try to only process each dso once
#
# note this only caches with the non-keyword arguments
def multiprocessing_cache(f):
    local_cache = {}
    manager = mp.Manager()
    global_cache = manager.dict()
    lock = mp.Lock()

    def multiprocessing_cache(*args, **kwargs):
        # check local cache?
        if args in local_cache:
            return local_cache[args]
        # check global cache?
        with lock:
            if args in global_cache:
                v = global_cache[args]
                local_cache[args] = v
                return v
            # fall back to calling the function
            v = f(*args, **kwargs)
            global_cache[args] = v
            local_cache[args] = v
            return v

    return multiprocessing_cache

class Sym(co.namedtuple('Sym', [
        'name', 'global_', 'section', 'addr', 'size'])):
    __slots__ = ()
    def __new__(cls, name, global_, section, addr, size):
        return super().__new__(cls, name, global_, section, addr, size)

    def __repr__(self):
        return '%s(%r, %r, %r, 0x%x, 0x%x)' % (
                self.__class__.__name__,
                self.name,
                self.global_,
                self.section,
                self.addr,
                self.size)

class SymInfo:
    def __init__(self, syms):
        self.syms = syms

    def get(self, k, d=None):
        # allow lookup by both symbol and address
        if isinstance(k, str):
            # organize by symbol, note multiple symbols can share a name
            if not hasattr(self, '_by_sym'):
                by_sym = {}
                for sym in self.syms:
                    if sym.name not in by_sym:
                        by_sym[sym.name] = []
                    if sym not in by_sym[sym.name]:
                        by_sym[sym.name].append(sym)
                self._by_sym = by_sym

            return self._by_sym.get(k, d)

        else:
            import bisect

            # organize by address
            if not hasattr(self, '_by_addr'):
                # sort and keep largest/first when duplicates
                syms = self.syms.copy()
                syms.sort(key=lambda x: (x.addr, -x.size))

                by_addr = []
                for sym in syms:
                    if (len(by_addr) == 0
                            or by_addr[-1].addr != sym.addr):
                        by_addr.append(sym)
                self._by_addr = by_addr

            # find sym by range
            i = bisect.bisect(self._by_addr, k,
                    key=lambda x: x.addr) - 1
            # check that we're actually in this sym's size
            if i > -1 and k < self._by_addr[i].addr+self._by_addr[i].size:
                return self._by_addr[i]
            else:
                return d

    def __getitem__(self, k):
        v = self.get(k)
        if v is None:
            raise KeyError(k)
        return v

    def __contains__(self, k):
        return self.get(k) is not None

    def __bool__(self):
        return bool(self.syms)

    def __len__(self):
        return len(self.syms)

    def __iter__(self):
        return iter(self.syms)

    def globals(self):
        return SymInfo([sym for sym in self.syms
                if sym.global_])

    def section(self, section):
        return SymInfo([sym for sym in self.syms
                # note we accept prefixes
                if s.startswith(section)])

@multiprocessing_cache
def collect_syms(obj_path, global_=False, sections=None, *,
        objdump_path=OBJDUMP_PATH,
        **args):
    symbol_pattern = re.compile(
            '^(?P<addr>[0-9a-fA-F]+)'
                ' (?P<scope>.).*'
                '\s+(?P<section>[^\s]+)'
                '\s+(?P<size>[0-9a-fA-F]+)'
                '\s+(?P<name>[^\s]+)\s*$')

    # find symbol addresses and sizes
    syms = []
    cmd = objdump_path + ['--syms', obj_path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
            stdout=sp.PIPE,
            universal_newlines=True,
            errors='replace',
            close_fds=False)
    for line in proc.stdout:
        m = symbol_pattern.match(line)
        if m:
            name = m.group('name')
            scope = m.group('scope')
            section = m.group('section')
            addr = int(m.group('addr'), 16)
            size = int(m.group('size'), 16)
            # skip non-globals?
            # l => local
            # g => global
            # u => unique global
            #   => neither
            # ! => local + global
            global__ = scope not in 'l '
            if global_ and not global__:
                continue
            # filter by section? note we accept prefixes
            if (sections is not None
                    and not any(section.startswith(prefix)
                        for prefix in sections)):
                continue
            # skip zero sized symbols
            if not size:
                continue
            # note multiple symbols can share a name
            syms.append(Sym(name, global__, section, addr, size))
    proc.wait()
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    return SymInfo(syms)

class Line(co.namedtuple('Line', ['file', 'line', 'addr'])):
    __slots__ = ()
    def __new__(cls, file, line, addr):
        return super().__new__(cls, file, line, addr)

    def __repr__(self):
        return '%s(%r, %r, 0x%x)' % (
                self.__class__.__name__,
                self.file,
                self.line,
                self.addr)

class LineInfo:
    def __init__(self, lines):
        self.lines = lines

    def get(self, k, d=None):
        # allow lookup by both address and file+line tuple
        if not isinstance(k, tuple):
            import bisect

            # organize by address
            if not hasattr(self, '_by_addr'):
                # sort and keep first when duplicates
                lines = self.lines.copy()
                lines.sort(key=lambda x: (x.addr, x.file, x.line))

                by_addr = []
                for line in lines:
                    if (len(by_addr) == 0
                            or by_addr[-1].addr != line.addr):
                        by_addr.append(line)
                self._by_addr = by_addr

            # find file+line by addr
            i = bisect.bisect(self._by_addr, k,
                    key=lambda x: x.addr) - 1
            if i > -1:
                return self._by_addr[i]
            else:
                return d

        else:
            import bisect

            # organize by file+line
            if not hasattr(self, '_by_line'):
                # sort and keep first when duplicates
                lines = self.lines.copy()
                lines.sort()

                by_line = []
                for line in lines:
                    if (len(by_line) == 0
                            or by_line[-1].file != line.file
                            or by_line[-1].line != line.line):
                        by_line.append(line)
                self._by_line = by_line

            # find addr by file+line tuple
            i = bisect.bisect(self._by_line, k,
                    key=lambda x: (x.file, x.line)) - 1
            # make sure file at least matches!
            if i > -1 and self._by_line[i].file == k[0]:
                return self._by_line[i]
            else:
                return d

    def __getitem__(self, k):
        v = self.get(k)
        if v is None:
            raise KeyError(k)
        return v

    def __contains__(self, k):
        return self.get(k) is not None

    def __bool__(self):
        return bool(self.lines)

    def __len__(self):
        return len(self.lines)

    def __iter__(self):
        return iter(self.lines)

@multiprocessing_cache
def collect_dwarf_lines(obj_path, *,
        objdump_path=OBJDUMP_PATH,
        **args):
    line_pattern = re.compile(
            # matches dir/file table
            '^\s*(?P<no>[0-9]+)'
                    '(?:\s+(?P<dir>[0-9]+))?'
                    '.*\s+(?P<path>[^\s]+)\s*$'
                # matches line opcodes
                '|' '^\s*\[[^\]]*\]' '(?:'
                    '\s+(?P<op_special>Special)'
                        '|' '\s+(?P<op_copy>Copy)'
                        '|' '\s+(?P<op_end>End of Sequence)'
                        '|' '\s+File.*?to.*?(?P<op_file>[0-9]+)'
                        '|' '\s+Line.*?to.*?(?P<op_line>[0-9]+)'
                        '|' '\s+(?:Address|PC)'
                            '\s+.*?to.*?(?P<op_addr>[0xX0-9a-fA-F]+)'
                        '|' '\s+[^\s]+' ')+\s*$',
            re.IGNORECASE)

    # state machine for dwarf line numbers, note that objdump's
    # decodedline seems to have issues with multiple dir/file
    # tables, which is why we need this
    lines = []
    dirs = co.OrderedDict()
    files = co.OrderedDict()
    op_file = 1
    op_line = 1
    op_addr = 0
    cmd = objdump_path + ['--dwarf=rawline', obj_path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
            stdout=sp.PIPE,
            universal_newlines=True,
            errors='replace',
            close_fds=False)
    for line in proc.stdout:
        m = line_pattern.match(line)
        if m:
            if m.group('no') and not m.group('dir'):
                # found a directory entry
                dirs[int(m.group('no'))] = m.group('path')
            elif m.group('no'):
                # found a file entry
                dir = int(m.group('dir'))
                if dir in dirs:
                    files[int(m.group('no'))] = os.path.join(
                            dirs[dir],
                            m.group('path'))
                else:
                    files[int(m.group('no'))] = m.group('path')
            else:
                # found a state machine update
                if m.group('op_file'):
                    op_file = int(m.group('op_file'), 0)
                if m.group('op_line'):
                    op_line = int(m.group('op_line'), 0)
                if m.group('op_addr'):
                    op_addr = int(m.group('op_addr'), 0)

                if (m.group('op_special')
                        or m.group('op_copy')
                        or m.group('op_end')):
                    file = os.path.abspath(files.get(op_file, '?'))
                    lines.append(Line(file, op_line, op_addr))

                if m.group('op_end'):
                    op_file = 1
                    op_line = 1
                    op_addr = 0
    proc.wait()
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    return LineInfo(lines)


def collect_decompressed(path, *,
        perf_path=PERF_PATH,
        sources=None,
        everything=False,
        no_strip=False,
        propagate=0,
        depth=1,
        **args):
    sample_pattern = re.compile(
            '(?P<comm>\w+)'
            '\s+(?P<pid>\w+)'
            '\s+(?P<time>[\w.]+):'
            '\s*(?P<period>\w+)'
            '\s+(?P<event>[^:]+):')
    frame_pattern = re.compile(
            '\s+(?P<addr>\w+)'
            '\s+(?P<sym>[^\s\+]+)(?:\+(?P<off>\w+))?'
            '\s+\((?P<dso>[^\)]+)\)')
    events = {
            'cycles':           'cycles',
            'branch-misses':    'bmisses',
            'branches':         'branches',
            'cache-misses':     'cmisses',
            'cache-references': 'caches'}

    # note perf_path may contain extra args
    cmd = perf_path + [
            'script',
            '-i%s' % path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    proc = sp.Popen(cmd,
        stdout=sp.PIPE,
        universal_newlines=True,
        errors='replace',
        close_fds=False)

    last_filtered = False
    last_event = ''
    last_period = 0
    last_stack = []
    deltas = co.defaultdict(lambda: {})
    syms_ = co.defaultdict(lambda: {})
    at_cache = {}
    results = {}

    def commit():
        # tail-recursively propagate measurements
        for i in range(len(last_stack)):
            results_ = results
            for j in reversed(range(i+1)):
                if i+1-j > depth:
                    break

                # propagate
                name = last_stack[j]
                if name not in results_:
                    results_[name] = (co.defaultdict(lambda: 0), {})
                results_[name][0][last_event] += last_period

                # recurse
                results_ = results_[name][1]

    for line in proc.stdout:
        # we need to process a lot of data, so wait to use regex as late
        # as possible
        if not line.startswith('\t'):
            if last_filtered:
                commit()
            last_filtered = False

            if line:
                m = sample_pattern.match(line)
                if m and m.group('event') in events:
                    last_filtered = True
                    last_event = m.group('event')
                    last_period = int(m.group('period'), 0)
                    last_stack = []

        elif last_filtered:
            m = frame_pattern.match(line)
            if m:
                # filter out internal/kernel functions
                if not everything and (
                        m.group('sym').startswith('__')
                            or m.group('sym').startswith('0')
                            or m.group('sym').startswith('-')
                            or m.group('sym').startswith('[')
                            or m.group('dso').startswith('/usr/lib')):
                    continue

                dso = m.group('dso')
                sym = m.group('sym')
                off = int(m.group('off'), 0) if m.group('off') else 0
                addr_ = int(m.group('addr'), 16)

                # get the syms/lines for the dso, this is cached
                syms = collect_syms(dso,
                        sections=['.text'],
                        **args)
                lines = collect_dwarf_lines(dso, **args)

                # ASLR is tricky, we have symbols+offsets, but static symbols
                # means we may have multiple options for each symbol.
                #
                # To try to solve this, we use previous seen symbols to build
                # confidence for the correct ASLR delta. This means we may
                # guess incorrectly for early symbols, but this will only affect
                # a few samples.
                if sym in syms:
                    sym_addr_ = addr_ - off

                    # track possible deltas?
                    for sym_ in syms[sym]:
                        delta = sym_.addr - sym_addr_
                        if delta not in deltas[dso]:
                            deltas[dso][delta] = sum(
                                    abs(a_+delta - a)
                                        for s, (a_, _) in syms_[dso].items()
                                        for a, _ in syms[s])
                    for delta in deltas[dso].keys():
                        deltas[dso][delta] += abs(sym_addr_+delta - sym_.addr)
                    syms_[dso][sym] = sym_addr_, sym_.size

                    # guess the best delta
                    delta, _ = min(deltas[dso].items(),
                            key=lambda x: (x[1], x[0]))
                    addr = addr_ + delta

                    # cached?
                    if (dso,addr) in at_cache:
                        cached = at_cache[(dso,addr)]
                        if cached is None:
                            # cache says to skip
                            continue
                        file, line = cached
                    else:
                        # find file+line
                        line_ = lines.get(addr)
                        if line_ is not None:
                            file, line = line_.file, line_.line
                        else:
                            file, line = re.sub('(\.o)?$', '.c', dso, 1), 0

                        # ignore filtered sources
                        if sources is not None:
                            if not any(
                                    os.path.abspath(file) == os.path.abspath(s)
                                        for s in sources):
                                at_cache[(dso,addr)] = None
                                continue
                        else:
                            # default to only cwd
                            if not everything and not os.path.commonpath([
                                    os.getcwd(),
                                    os.path.abspath(file)]) == os.getcwd():
                                at_cache[(dso,addr)] = None
                                continue

                        # simplify path
                        if os.path.commonpath([
                                os.getcwd(),
                                os.path.abspath(file)]) == os.getcwd():
                            file = os.path.relpath(file)
                        else:
                            file = os.path.abspath(file)

                        at_cache[(dso,addr)] = file, line
                else:
                    file, line = re.sub('(\.o)?$', '.c', dso, 1), 0

                # strip compiler suffixes
                if not no_strip:
                    sym = sym.split('.', 1)[0]

                last_stack.append((file, sym, line))

                # stop propogating?
                if propagate and len(last_stack) >= propagate:
                    commit()
                    last_filtered = False
    if last_filtered:
        commit()

    proc.wait()
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    # rearrange results into result type
    def to_results(results, z):
        results_ = []
        for name, (r, children) in results.items():
            results_.append(PerfResult(z, *name,
                    **{events[k]: v for k, v in r.items()},
                    children=to_results(children, z+1)))
        return results_

    return to_results(results, 0)

def collect_job(path, i, **args):
    # decompress into a temporary file, this is to work around
    # some limitations of perf
    with zipfile.ZipFile(path) as z:
        with z.open(i) as f:
            with tempfile.NamedTemporaryFile('wb') as g:
                shutil.copyfileobj(f, g)
                g.flush()

                return collect_decompressed(g.name, **args)

def starapply(args):
    f, args, kwargs = args
    return f(*args, **kwargs)

def collect_perf(perf_paths, *,
        jobs=None,
        **args):
    # automatic job detection?
    if jobs == 0:
        jobs = len(os.sched_getaffinity(0))

    records = []
    for path in perf_paths:
        # each .perf file is actually a zip file containing perf files from
        # multiple runs
        with zipfile.ZipFile(path) as z:
            records.extend((path, i) for i in z.infolist())

    # we're dealing with a lot of data but also surprisingly
    # parallelizable
    if jobs is not None:
        results = []
        with mp.Pool(jobs) as p:
            for results_ in p.imap_unordered(
                    starapply,
                    ((collect_job, (path, i), args)
                        for path, i in records)):
                results.extend(results_)
    else:
        results = []
        for path, i in records:
            results.extend(collect_job(path, i, **args))

    return results


# common folding/tabling/read/write code

class Rev(co.namedtuple('Rev', 'a')):
    __slots__ = ()
    # yes we need all of these because we're a namedtuple
    def __lt__(self, other):
        return self.a > other.a
    def __gt__(self, other):
        return self.a < other.a
    def __le__(self, other):
        return self.a >= other.a
    def __ge__(self, other):
        return self.a <= other.a

def fold(Result, results, *,
        by=None,
        defines=[],
        sort=None,
        depth=1,
        **_):
    # stop when depth hits zero
    if depth == 0:
        return []

    # organize by by
    if by is None:
        by = Result._by

    for k in it.chain(by or [], (k for k, _ in defines)):
        if k not in Result._by and k not in Result._fields:
            print("error: could not find field %r?" % k,
                    file=sys.stderr)
            sys.exit(-1)

    # filter by matching defines
    if defines:
        results_ = []
        for r in results:
            if all(any(fnmatch.fnmatchcase(str(getattr(r, k, '')), v)
                        for v in vs)
                    for k, vs in defines):
                results_.append(r)
        results = results_

    # organize results into conflicts
    folding = co.OrderedDict()
    for r in results:
        name = tuple(getattr(r, k) for k in by)
        if name not in folding:
            folding[name] = []
        folding[name].append(r)

    # merge conflicts
    folded = []
    for name, rs in folding.items():
        folded.append(sum(rs[1:], start=rs[0]))

    # sort, note that python's sort is stable
    folded.sort(key=lambda r: (
            # sort by explicit sort fields
            tuple((Rev
                        if reverse ^ (not k or k in Result._fields)
                        else lambda x: x)(
                    tuple((getattr(r, k_),)
                            if getattr(r, k_) is not None
                            else ()
                        for k_ in ([k] if k else Result._sort)))
                for k, reverse in (sort or [])),
            # sort by result
            r))

    # recurse if we have recursive results
    if hasattr(Result, '_children'):
        folded = [r._replace(**{
                Result._children: fold(
                        Result, getattr(r, Result._children),
                        by=by,
                        # only filter defines at the top level!
                        sort=sort,
                        depth=depth-1)})
                    for r in folded]

    return folded

def hotify(Result, results, *,
        enumerates=None,
        depth=1,
        hot=None,
        **_):
    # note! hotifying risks confusion if you don't enumerate/have a
    # z field, since it will allow folding across recursive boundaries

    # hotify only makes sense for recursive results
    assert hasattr(Result, '_children')

    results_ = []
    for r in results:
        hot_ = []
        def recurse(results_, depth_):
            nonlocal hot_
            if not results_:
                return

            # find the hottest result
            r = min(results_, key=lambda r:
                    tuple((Rev
                                if reverse ^ (not k or k in Result._fields)
                                else lambda x: x)(
                            tuple((getattr(r, k_),)
                                    if getattr(r, k_) is not None
                                    else ()
                                for k_ in ([k] if k else Result._sort)))
                        for k, reverse in it.chain(hot, [(None, False)])))

            hot_.append(r._replace(**(
                    # enumerate?
                    ({e: len(hot_) for e in enumerates}
                            if enumerates is not None
                            else {})
                        | {Result._children: []})))

            # recurse?
            if depth_ > 1:
                recurse(getattr(r, Result._children),
                        depth_-1)

        recurse(getattr(r, Result._children), depth-1)
        results_.append(r._replace(**{Result._children: hot_}))

    return results_

def table(Result, results, diff_results=None, *,
        by=None,
        fields=None,
        sort=None,
        labels=None,
        depth=1,
        hot=None,
        percent=False,
        all=False,
        compare=None,
        no_header=False,
        small_header=False,
        no_total=False,
        small_table=False,
        summary=False,
        **_):
    import builtins
    all_, all = all, builtins.all

    if by is None:
        by = Result._by
    if fields is None:
        fields = Result._fields
    types = Result._types

    # organize by name
    table = {
            ','.join(str(getattr(r, k)
                        if getattr(r, k) is not None
                        else '')
                    for k in by): r
                for r in results}
    diff_table = {
            ','.join(str(getattr(r, k)
                        if getattr(r, k) is not None
                        else '')
                    for k in by): r
                for r in diff_results or []}

    # lost results? this only happens if we didn't fold by the same
    # by field, which is an error and risks confusing results
    assert len(table) == len(results)
    if diff_results is not None:
        assert len(diff_table) == len(diff_results)

    # find compare entry if there is one
    if compare:
        compare_ = min(
            (n for n in table.keys()
                if all(fnmatch.fnmatchcase(k, c)
                    for k, c in it.zip_longest(n.split(','), compare,
                        fillvalue=''))),
            default=compare)
        compare_r = table.get(compare_)

    # build up our lines
    lines = []

    # header
    if not no_header:
        header = ['%s%s' % (
                    ','.join(labels if labels is not None else by),
                    ' (%d added, %d removed)' % (
                            sum(1 for n in table if n not in diff_table),
                            sum(1 for n in diff_table if n not in table))
                        if diff_results is not None and not percent else '')
                if not small_header and not small_table and not summary
                    else '']
        if diff_results is None or percent:
            for k in fields:
                header.append(k)
        else:
            for k in fields:
                header.append('o'+k)
            for k in fields:
                header.append('n'+k)
            for k in fields:
                header.append('d'+k)
        lines.append(header)

    # delete these to try to catch typos below, we need to rebuild
    # these tables at each recursive layer
    del table
    del diff_table

    # entry helper
    def table_entry(name, r, diff_r=None):
        # prepend name
        entry = [name]

        # normal entry?
        if ((compare is None or r == compare_r)
                and diff_results is None):
            for k in fields:
                entry.append(
                        (getattr(r, k).table(),
                                getattr(getattr(r, k), 'notes', lambda: [])())
                            if getattr(r, k, None) is not None
                            else types[k].none)
        # compare entry?
        elif diff_results is None:
            for k in fields:
                entry.append(
                        (getattr(r, k).table()
                                if getattr(r, k, None) is not None
                                else types[k].none,
                            (lambda t: ['+∞%'] if t == +mt.inf
                                    else ['-∞%'] if t == -mt.inf
                                    else ['%+.1f%%' % (100*t)])(
                                types[k].ratio(
                                    getattr(r, k, None),
                                    getattr(compare_r, k, None)))))
        # percent entry?
        elif percent:
            for k in fields:
                entry.append(
                        (getattr(r, k).table()
                                if getattr(r, k, None) is not None
                                else types[k].none,
                            (lambda t: ['+∞%'] if t == +mt.inf
                                    else ['-∞%'] if t == -mt.inf
                                    else ['%+.1f%%' % (100*t)])(
                                types[k].ratio(
                                    getattr(r, k, None),
                                    getattr(diff_r, k, None)))))
        # diff entry?
        else:
            for k in fields:
                entry.append(getattr(diff_r, k).table()
                        if getattr(diff_r, k, None) is not None
                        else types[k].none)
            for k in fields:
                entry.append(getattr(r, k).table()
                        if getattr(r, k, None) is not None
                        else types[k].none)
            for k in fields:
                entry.append(
                        (types[k].diff(
                                getattr(r, k, None),
                                getattr(diff_r, k, None)),
                            (lambda t: ['+∞%'] if t == +mt.inf
                                    else ['-∞%'] if t == -mt.inf
                                    else ['%+.1f%%' % (100*t)] if t
                                    else [])(
                                types[k].ratio(
                                    getattr(r, k, None),
                                    getattr(diff_r, k, None)))))

        # append any notes
        if hasattr(Result, '_notes') and r is not None:
            notes = sorted(getattr(r, Result._notes))
            if isinstance(entry[-1], tuple):
                entry[-1] = (entry[-1][0], entry[-1][1] + notes)
            else:
                entry[-1] = (entry[-1], notes)

        return entry

    # recursive entry helper
    def table_recurse(results_, diff_results_,
            depth_,
            prefixes=('', '', '', '')):
        # build the children table at each layer
        table_ = {
                ','.join(str(getattr(r, k)
                            if getattr(r, k) is not None
                            else '')
                        for k in by): r
                    for r in results_}
        diff_table_ = {
                ','.join(str(getattr(r, k)
                            if getattr(r, k) is not None
                            else '')
                        for k in by): r
                    for r in diff_results_ or []}
        names_ = [n
                for n in table_.keys() | diff_table_.keys()
                if diff_results is None
                    or all_
                    or any(
                        types[k].ratio(
                                getattr(table_.get(n), k, None),
                                getattr(diff_table_.get(n), k, None))
                            for k in fields)]

        # sort again, now with diff info, note that python's sort is stable
        names_.sort(key=lambda n: (
                # sort by explicit sort fields
                next(
                    tuple((Rev
                                    if reverse ^ (not k or k in Result._fields)
                                    else lambda x: x)(
                                tuple((getattr(r_, k_),)
                                        if getattr(r_, k_) is not None
                                        else ()
                                    for k_ in ([k] if k else Result._sort)))
                            for k, reverse in (sort or []))
                        for r_ in [table_.get(n), diff_table_.get(n)]
                        if r_ is not None),
                # sort by ratio if diffing
                Rev(tuple(types[k].ratio(
                            getattr(table_.get(n), k, None),
                            getattr(diff_table_.get(n), k, None))
                        for k in fields))
                    if diff_results is not None
                    else (),
                # move compare entry to the top, note this can be
                # overridden by explicitly sorting by fields
                (table_.get(n) != compare_r,
                        # sort by ratio if comparing
                        Rev(tuple(
                            types[k].ratio(
                                    getattr(table_.get(n), k, None),
                                    getattr(compare_r, k, None))
                                for k in fields)))
                    if compare
                    else (),
                # sort by result
                (table_[n],) if n in table_ else (),
                # and finally by name (diffs may be missing results)
                n))

        for i, name in enumerate(names_):
            # find comparable results
            r = table_.get(name)
            diff_r = diff_table_.get(name)

            # figure out a good label
            if labels is not None:
                label = next(
                        ','.join(str(getattr(r_, k)
                                    if getattr(r_, k) is not None
                                    else '')
                                for k in labels)
                            for r_ in [r, diff_r]
                            if r_ is not None)
            else:
                label = name

            # build line
            line = table_entry(label, r, diff_r)

            # add prefixes
            line = [x if isinstance(x, tuple) else (x, []) for x in line]
            line[0] = (prefixes[0+(i==len(names_)-1)] + line[0][0], line[0][1])
            lines.append(line)

            # recurse?
            if name in table_ and depth_ > 1:
                table_recurse(
                        getattr(r, Result._children),
                        getattr(diff_r, Result._children, None),
                        depth_-1,
                        (prefixes[2+(i==len(names_)-1)] + "|-> ",
                         prefixes[2+(i==len(names_)-1)] + "'-> ",
                         prefixes[2+(i==len(names_)-1)] + "|   ",
                         prefixes[2+(i==len(names_)-1)] + "    "))

    # build entries
    if not summary:
        table_recurse(results, diff_results, depth)

    # total
    if not no_total and not (small_table and not summary):
        r = next(iter(fold(Result, results, by=[])), None)
        if diff_results is None:
            diff_r = None
        else:
            diff_r = next(iter(fold(Result, diff_results, by=[])), None)
        lines.append(table_entry('TOTAL', r, diff_r))

    # homogenize
    lines = [[x if isinstance(x, tuple) else (x, []) for x in line]
            for line in lines]

    # find the best widths, note that column 0 contains the names and is
    # handled a bit differently
    widths = co.defaultdict(lambda: 7, {0: 7})
    nwidths = co.defaultdict(lambda: 0)
    for line in lines:
        for i, x in enumerate(line):
            widths[i] = max(widths[i], ((len(x[0])+1+4-1)//4)*4-1)
            if i != len(line)-1:
                nwidths[i] = max(nwidths[i], 1+sum(2+len(n) for n in x[1]))

    # print our table
    for line in lines:
        print('%-*s  %s' % (
                widths[0], line[0][0],
                ' '.join('%*s%-*s' % (
                        widths[i], x[0],
                        nwidths[i], ' (%s)' % ', '.join(x[1]) if x[1] else '')
                    for i, x in enumerate(line[1:], 1))))

def read_csv(path, Result, *,
        depth=1,
        prefix=None,
        **_):
    # prefix? this only applies to field fields
    if prefix is None:
        if hasattr(Result, '_prefix'):
            prefix = '%s_' % Result._prefix
        else:
            prefix = ''

    by = Result._by
    fields = Result._fields

    with openio(path, 'r') as f:
        # csv or json? assume json starts with [
        is_json = (f.buffer.peek(1)[:1] == b'[')

        # read csv?
        if not is_json:
            results = []
            reader = csv.DictReader(f, restval='')
            for r in reader:
                if not any(prefix+k in r and r[prefix+k].strip()
                        for k in fields):
                    continue
                try:
                    # note this allows by/fields to overlap
                    results.append(Result(**(
                            {k: r[k] for k in by
                                    if k in r
                                        and r[k].strip()}
                                | {k: r[prefix+k] for k in fields
                                    if prefix+k in r
                                        and r[prefix+k].strip()})))
                except TypeError:
                    pass
            return results

        # read json?
        else:
            import json
            def unjsonify(results, depth_):
                results_ = []
                for r in results:
                    if not any(prefix+k in r and r[prefix+k].strip()
                            for k in fields):
                        continue
                    try:
                        # note this allows by/fields to overlap
                        results_.append(Result(**(
                                {k: r[k] for k in by
                                        if k in r
                                            and r[k] is not None}
                                    | {k: r[prefix+k] for k in fields
                                        if prefix+k in r
                                            and r[prefix+k] is not None}
                                    | ({Result._children: unjsonify(
                                            r[Result._children],
                                            depth_-1)}
                                        if hasattr(Result, '_children')
                                            and Result._children in r
                                            and r[Result._children] is not None
                                            and depth_ > 1
                                        else {})
                                    | ({Result._notes: set(r[Result._notes])}
                                        if hasattr(Result, '_notes')
                                            and Result._notes in r
                                            and r[Result._notes] is not None
                                        else {}))))
                    except TypeError:
                        pass
                return results_
            return unjsonify(json.load(f), depth)

def write_csv(path, Result, results, *,
        json=False,
        by=None,
        fields=None,
        depth=1,
        prefix=None,
        **_):
    # prefix? this only applies to field fields
    if prefix is None:
        if hasattr(Result, '_prefix'):
            prefix = '%s_' % Result._prefix
        else:
            prefix = ''

    if by is None:
        by = Result._by
    if fields is None:
        fields = Result._fields

    with openio(path, 'w') as f:
        # write csv?
        if not json:
            writer = csv.DictWriter(f, list(
                    co.OrderedDict.fromkeys(it.chain(
                        by,
                        (prefix+k for k in fields))).keys()))
            writer.writeheader()
            for r in results:
                # note this allows by/fields to overlap
                writer.writerow(
                        {k: getattr(r, k)
                                for k in by
                                if getattr(r, k) is not None}
                            | {prefix+k: getattr(r, k).__csv__()
                                for k in fields
                                if getattr(r, k) is not None})

        # write json?
        else:
            import json
            # the neat thing about json is we can include recursive results
            def jsonify(results, depth_):
                results_ = []
                for r in results:
                    # note this allows by/fields to overlap
                    results_.append(
                            {k: getattr(r, k)
                                    for k in by
                                    if getattr(r, k) is not None}
                                | {prefix+k: getattr(r, k).__csv__()
                                    for k in fields
                                    if getattr(r, k) is not None}
                                | ({Result._children: jsonify(
                                        getattr(r, Result._children),
                                        depth_-1)}
                                    if hasattr(Result, '_children')
                                        and getattr(r, Result._children)
                                        and depth_ > 1
                                    else {})
                                | ({Result._notes: list(
                                        getattr(r, Result._notes))}
                                    if hasattr(Result, '_notes')
                                        and getattr(r, Result._notes)
                                    else {}))
                return results_
            json.dump(jsonify(results, depth), f,
                    separators=(',', ':'))


def annotate(Result, results, *,
        annotate=None,
        threshold=None,
        branches=False,
        caches=False,
        **args):
    # figure out the threshold
    if threshold is None:
        t0, t1 = THRESHOLD
    elif len(threshold) == 1:
        t0, t1 = threshold[0], threshold[0]
    else:
        t0, t1 = threshold
    t0, t1 = min(t0, t1), max(t0, t1)

    if not branches and not caches:
        tk = 'cycles'
    elif branches:
        tk = 'bmisses'
    else:
        tk = 'cmisses'

    # find max cycles
    max_ = max(it.chain((float(getattr(r, tk)) for r in results), [1]))

    for path in co.OrderedDict.fromkeys(r.file for r in results).keys():
        # flatten to line info
        results = fold(Result, results, by=['file', 'line'])
        table = {r.line: r for r in results if r.file == path}

        # calculate spans to show
        if not annotate:
            spans = []
            last = None
            func = None
            for line, r in sorted(table.items()):
                if float(getattr(r, tk)) / max_ >= t0:
                    if last is not None and line - last.stop <= args['context']:
                        last = range(
                                last.start,
                                line+1+args['context'])
                    else:
                        if last is not None:
                            spans.append((last, func))
                        last = range(
                                line-args['context'],
                                line+1+args['context'])
                        func = r.function
            if last is not None:
                spans.append((last, func))

        with open(path) as f:
            skipped = False
            for i, line in enumerate(f):
                # skip lines not in spans?
                if not annotate and not any(i+1 in s for s, _ in spans):
                    skipped = True
                    continue

                if skipped:
                    skipped = False
                    print('%s@@ %s:%d: %s @@%s' % (
                            '\x1b[36m' if args['color'] else '',
                            path,
                            i+1,
                            next(iter(f for _, f in spans)),
                            '\x1b[m' if args['color'] else ''))

                # build line
                if line.endswith('\n'):
                    line = line[:-1]

                r = table.get(i+1)
                if r is not None and (
                        float(r.cycles) > 0
                            if not branches and not caches
                            else float(r.bmisses) > 0 or float(r.branches) > 0
                            if branches
                            else float(r.cmisses) > 0 or float(r.caches) > 0):
                    line = '%-*s // %s' % (
                            args['width'],
                            line,
                            '%s cycles' % r.cycles
                                if not branches and not caches
                                else '%s bmisses, %s branches' % (
                                    r.bmisses, r.branches)
                                if branches
                                else '%s cmisses, %s caches' % (
                                    r.cmisses, r.caches))

                    if args['color']:
                        if float(getattr(r, tk)) / max_ >= t1:
                            line = '\x1b[1;31m%s\x1b[m' % line
                        elif float(getattr(r, tk)) / max_ >= t0:
                            line = '\x1b[35m%s\x1b[m' % line

                print(line)


def main_(perf_paths, *,
        by=None,
        fields=None,
        defines=[],
        sort=None,
        branches=False,
        caches=False,
        depth=None,
        hot=None,
        **args):
    # figure out what color should be
    if args.get('color') == 'auto':
        args['color'] = sys.stdout.isatty()
    elif args.get('color') == 'always':
        args['color'] = True
    else:
        args['color'] = False

    # figure out what fields we're interested in
    labels = None
    if by is None:
        if (args.get('annotate')
                or args.get('threshold')
                or args.get('output')
                or args.get('output_json')):
            by = PerfResult._by
        elif depth is not None or hot is not None:
            by = ['z', 'function']
            labels = ['function']
        else:
            by = ['function']

    if fields is None:
        if (args.get('annotate')
                or args.get('threshold')
                or args.get('output')
                or args.get('output_json')):
            fields = PerfResult._fields
        elif not branches and not caches:
            fields = ['cycles']
        elif branches:
            fields = ['bmisses', 'branches']
        else:
            fields = ['cmisses', 'caches']

    # figure out depth
    if depth is None:
        depth = mt.inf if hot else 1
    elif depth == 0:
        depth = mt.inf

    # find sizes
    if not args.get('use', None):
        # not enough info?
        if not perf_paths:
            print("error: no *.perf files?",
                    file=sys.stderr)
            sys.exit(1)

        # collect info
        results = collect_perf(perf_paths,
                depth=depth,
                **args)

    else:
        results = read_csv(args['use'], PerfResult,
                depth=depth,
                **args)

    # fold
    results = fold(PerfResult, results,
            by=by,
            defines=defines,
            depth=depth)

    # hotify?
    if hot:
        results = hotify(PerfResult, results,
                depth=depth,
                hot=hot)

    # find previous results?
    diff_results = None
    if args.get('diff'):
        try:
            diff_results = read_csv(
                    args.get('diff'),
                    PerfResult,
                    depth=depth,
                    **args)
        except FileNotFoundError:
            diff_results = []

        # fold
        diff_results = fold(PerfResult, diff_results,
                by=by,
                defines=defines,
                depth=depth)

        # hotify?
        if hot:
            diff_results = hotify(PerfResult, diff_results,
                    depth=depth,
                    hot=hot)

    # print table
    if (args.get('annotate')
            or args.get('threshold')):
        annotate(PerfResult, results,
                branches=branches,
                caches=caches,
                **args)
    # write results to JSON
    elif args.get('output_json'):
        write_csv(args['output_json'], PerfResult, results, json=True,
                by=by,
                fields=fields,
                depth=depth,
                **args)
    # write results to CSV
    elif args.get('output'):
        write_csv(args['output'], PerfResult, results,
                by=by,
                fields=fields,
                depth=depth,
                **args)
    elif not args.get('quiet'):
        # print table
        table(PerfResult, results, diff_results,
                by=by,
                fields=fields,
                sort=sort,
                labels=labels,
                depth=depth,
                **args)


def main(**args):
    if args.get('record'):
        return record(**args)
    else:
        return main_(**args)


if __name__ == "__main__":
    import argparse
    import sys

    # bit of a hack, but parse_intermixed_args and REMAINDER are
    # incompatible, so we need to figure out what we want before running
    # argparse
    if '--record' in sys.argv:
        nargs = argparse.REMAINDER
    else:
        nargs = '*'

    argparse.ArgumentParser._handle_conflict_ignore = lambda *_: None
    argparse._ArgumentGroup._handle_conflict_ignore = lambda *_: None
    parser = argparse.ArgumentParser(
            description="Aggregate and report Linux perf results.",
            allow_abbrev=False,
            conflict_handler='ignore')
    parser.add_argument(
            'perf_paths',
            nargs=nargs,
            help="Input *.perf files.")
    parser.add_argument(
            '-v', '--verbose',
            action='store_true',
            help="Output commands that run behind the scenes.")
    parser.add_argument(
            '-q', '--quiet',
            action='store_true',
            help="Don't show anything, useful when checking for errors.")
    parser.add_argument(
            '-o', '--output',
            help="Specify CSV file to store results.")
    parser.add_argument(
            '-O', '--output-json',
            help="Specify JSON file to store results. This may contain "
                "recursive info.")
    parser.add_argument(
            '-u', '--use',
            help="Don't parse anything, use this CSV/JSON file.")
    parser.add_argument(
            '-d', '--diff',
            help="Specify CSV/JSON file to diff against.")
    parser.add_argument(
            '-p', '--percent',
            action='store_true',
            help="Only show percentage change, not a full diff.")
    parser.add_argument(
            '-c', '--compare',
            type=lambda x: tuple(v.strip() for v in x.split(',')),
            help="Compare results to the row matching this by pattern.")
    parser.add_argument(
            '-a', '--all',
            action='store_true',
            help="Show all, not just the ones that changed.")
    parser.add_argument(
            '-b', '--by',
            action='append',
            choices=PerfResult._by,
            help="Group by this field.")
    parser.add_argument(
            '-f', '--field',
            dest='fields',
            action='append',
            choices=PerfResult._fields,
            help="Show this field.")
    parser.add_argument(
            '-D', '--define',
            dest='defines',
            action='append',
            type=lambda x: (
                lambda k, vs: (
                    k.strip(),
                    {v.strip() for v in vs.split(',')})
                )(*x.split('=', 1)),
            help="Only include results where this field is this value. May "
                "include comma-separated options and globs.")
    class AppendSort(argparse.Action):
        def __call__(self, parser, namespace, value, option):
            if namespace.sort is None:
                namespace.sort = []
            namespace.sort.append((value, option in {'-S', '--reverse-sort'}))
    parser.add_argument(
            '-s', '--sort',
            nargs='?',
            action=AppendSort,
            help="Sort by this field.")
    parser.add_argument(
            '-S', '--reverse-sort',
            nargs='?',
            action=AppendSort,
            help="Sort by this field, but backwards.")
    parser.add_argument(
            '-z', '--depth',
            nargs='?',
            type=lambda x: int(x, 0),
            const=0,
            help="Depth of function calls to show. 0 shows all calls unless "
                "we find a cycle. Defaults to 0.")
    parser.add_argument(
            '-g', '--propagate',
            type=lambda x: int(x, 0),
            help="Depth to propagate samples up the call-stack. 0 propagates "
                "up to the entry point, 1 does no propagation. Defaults to 0.")
    class AppendHot(argparse.Action):
        def __call__(self, parser, namespace, value, option):
            if namespace.hot is None:
                namespace.hot = []
            namespace.hot.append((value, option in {'-R', '--reverse-hot'}))
    parser.add_argument(
            '-r', '--hot',
            nargs='?',
            action=AppendHot,
            help="Show only the hot path for each function call. Can "
                "optionally provide fields like sort.")
    parser.add_argument(
            '-R', '--reverse-hot',
            nargs='?',
            action=AppendHot,
            help="Like -r/--hot, but backwards.")
    parser.add_argument(
            '--no-header',
            action='store_true',
            help="Don't show the header.")
    parser.add_argument(
            '--small-header',
            action='store_true',
            help="Don't show by field names.")
    parser.add_argument(
            '--no-total',
            action='store_true',
            help="Don't show the total.")
    parser.add_argument(
            '-Q', '--small-table',
            action='store_true',
            help="Equivalent to --small-header + --no-total.")
    parser.add_argument(
            '-Y', '--summary',
            action='store_true',
            help="Only show the total.")
    parser.add_argument(
            '--prefix',
            help="Prefix to use for fields in CSV/JSON output. Defaults "
                "to %r." % ("%s_" % PerfResult._prefix))
    parser.add_argument(
            '-F', '--source',
            dest='sources',
            action='append',
            help="Only consider definitions in this file. Defaults to "
                "anything in the current directory.")
    parser.add_argument(
            '-!', '--everything',
            action='store_true',
            help="Include builtin and libc specific symbols.")
    parser.add_argument(
            '-x', '--no-strip',
            action='store_true',
            help="Don't strip compiler optimization suffixes from symbols.")
    parser.add_argument(
            '--branches',
            action='store_true',
            help="Show branches and branch misses.")
    parser.add_argument(
            '--caches',
            action='store_true',
            help="Show cache accesses and cache misses.")
    parser.add_argument(
            '-A', '--annotate',
            action='store_true',
            help="Show source files annotated with coverage info.")
    parser.add_argument(
            '-T', '--threshold',
            nargs='?',
            type=lambda x: tuple(float(x) for x in x.split(',')),
            const=THRESHOLD,
            help="Show lines with samples above this threshold as a percent "
                "of all lines. Defaults to "
                "%s." % ','.join(str(t) for t in THRESHOLD))
    parser.add_argument(
            '-C', '--context',
            type=lambda x: int(x, 0),
            default=3,
            help="Show n additional lines of context. Defaults to 3.")
    parser.add_argument(
            '-W', '--width',
            type=lambda x: int(x, 0),
            default=80,
            help="Assume source is styled with this many columns. Defaults "
                "to 80.")
    parser.add_argument(
            '--color',
            choices=['never', 'always', 'auto'],
            default='auto',
            help="When to use terminal colors. Defaults to 'auto'.")
    parser.add_argument(
            '-j', '--jobs',
            nargs='?',
            type=lambda x: int(x, 0),
            const=0,
            help="Number of processes to use. 0 spawns one process per core.")
    parser.add_argument(
            '--perf-path',
            type=lambda x: x.split(),
            help="Path to the perf executable, may include flags. "
                "Defaults to %r." % PERF_PATH)
    parser.add_argument(
            '--objdump-path',
            type=lambda x: x.split(),
            default=OBJDUMP_PATH,
            help="Path to the objdump executable, may include flags. "
                "Defaults to %r." % OBJDUMP_PATH)

    # record flags
    record_parser = parser.add_argument_group('record options')
    record_parser.add_argument(
            'command',
            nargs=nargs,
            help="Command to run.")
    record_parser.add_argument(
            '--record',
            action='store_true',
            help="Run a command and aggregate perf measurements.")
    record_parser.add_argument(
            '-o', '--output',
            help="Output file. Uses flock to synchronize. This is stored as a "
                "zip-file of multiple perf results.")
    record_parser.add_argument(
            '--perf-freq',
            help="perf sampling frequency. This is passed directly to perf. "
                "Defaults to %r." % PERF_FREQ)
    record_parser.add_argument(
            '--perf-period',
            help="perf sampling period. This is passed directly to perf.")
    record_parser.add_argument(
            '--perf-events',
            help="perf events to record. This is passed directly to perf. "
                "Defaults to %r." % PERF_EVENTS)
    record_parser.add_argument(
            '--perf-path',
            type=lambda x: x.split(),
            help="Path to the perf executable, may include flags. "
                "Defaults to %r." % PERF_PATH)

    # avoid intermixed/REMAINDER conflict, see above
    if nargs == argparse.REMAINDER:
        args = parser.parse_args()
    else:
        args = parser.parse_intermixed_args()

    # perf_paths/command overlap, so need to do some munging here
    args.command = args.perf_paths
    if args.record:
        if not args.command:
            print('error: no command specified?',
                    file=sys.stderr)
            sys.exit(-1)
        if not args.output:
            print('error: no output file specified?',
                    file=sys.stderr)
            sys.exit(-1)

    sys.exit(main(**{k: v
            for k, v in vars(args).items()
            if v is not None}))