diff --git a/Makefile b/Makefile
index d4a310f6..681fa569 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,6 @@ SRC  ?= $(filter-out $(wildcard *.t.* *.b.*),$(wildcard *.c))
 OBJ  := $(SRC:%.c=$(BUILDDIR)/%.o)
 DEP  := $(SRC:%.c=$(BUILDDIR)/%.d)
 ASM  := $(SRC:%.c=$(BUILDDIR)/%.s)
-CI   := $(SRC:%.c=$(BUILDDIR)/%.ci)
 GCDA := $(SRC:%.c=$(BUILDDIR)/%.t.a.gcda)
 
 TESTS ?= $(wildcard tests/*.toml)
@@ -35,7 +34,6 @@ TEST_C     := $(TESTS:%.toml=$(BUILDDIR)/%.t.c) \
 TEST_A     := $(TEST_C:%.t.c=%.t.a.c)
 TEST_OBJ   := $(TEST_A:%.t.a.c=%.t.a.o)
 TEST_DEP   := $(TEST_A:%.t.a.c=%.t.a.d)
-TEST_CI	   := $(TEST_A:%.t.a.c=%.t.a.ci)
 TEST_GCNO  := $(TEST_A:%.t.a.c=%.t.a.gcno)
 TEST_GCDA  := $(TEST_A:%.t.a.c=%.t.a.gcda)
 TEST_PERF  := $(TEST_RUNNER:%=%.perf)
@@ -52,7 +50,6 @@ BENCH_C     := $(BENCHES:%.toml=$(BUILDDIR)/%.b.c) \
 BENCH_A     := $(BENCH_C:%.b.c=%.b.a.c)
 BENCH_OBJ   := $(BENCH_A:%.b.a.c=%.b.a.o)
 BENCH_DEP   := $(BENCH_A:%.b.a.c=%.b.a.d)
-BENCH_CI    := $(BENCH_A:%.b.a.c=%.b.a.ci)
 BENCH_GCNO  := $(BENCH_A:%.b.a.c=%.b.a.gcno)
 BENCH_GCDA  := $(BENCH_A:%.b.a.c=%.b.a.gcda)
 BENCH_PERF  := $(BENCH_RUNNER:%=%.perf)
@@ -70,7 +67,6 @@ GDB           ?= gdb
 PERF          ?= perf
 PRETTYASSERTS ?= ./scripts/prettyasserts.py
 
-CFLAGS += -fcallgraph-info=su
 CFLAGS += -g3
 CFLAGS += -I.
 CFLAGS += -std=c99 -Wall -Wextra -pedantic
@@ -117,6 +113,7 @@ PERFBDFLAGS += $(filter -j%,$(MAKEFLAGS))
 ifneq ($(OBJDUMP),objdump)
 CODEFLAGS    += --objdump-path="$(OBJDUMP)"
 DATAFLAGS    += --objdump-path="$(OBJDUMP)"
+STACKFLAGS	 += --objdump-path="$(OBJDUMP)"
 CTXFLAGS	 += --objdump-path="$(OBJDUMP)"
 STRUCTSFLAGS += --objdump-path="$(OBJDUMP)"
 PERFFLAGS    += --objdump-path="$(OBJDUMP)"
@@ -234,12 +231,12 @@ data-diff: $(OBJ)
 ## Find the per-function stack usage
 .PHONY: stack
 stack: STACKFLAGS+=-S
-stack: $(CI) $(BUILDDIR)/lfs.stack.csv
-	./scripts/stack.py $(CI) $(STACKFLAGS)
+stack: $(OBJ) $(BUILDDIR)/lfs.stack.csv
+	./scripts/stack.py $(OBJ) $(STACKFLAGS)
 
 ## Compare per-function stack usage
 .PHONY: stack-diff
-stack-diff: $(CI)
+stack-diff: $(OBJ)
 	./scripts/stack.py $^ $(STACKFLAGS) -d $(BUILDDIR)/lfs.stack.csv
 
 ## Find the per-function context
@@ -250,7 +247,7 @@ ctx: $(OBJ) $(BUILDDIR)/lfs.ctx.csv
 
 ## Compare per-function context
 .PHONY: ctx-diff
-ctx-diff: $(CI)
+ctx-diff: $(OBJ)
 	./scripts/ctx.py $^ $(CTXFLAGS) -d $(BUILDDIR)/lfs.ctx.csv
 
 ## Find function sizes
@@ -274,16 +271,16 @@ funcs: \
 ## Compare function sizes
 .PHONY: funcs-diff
 funcs-diff: SHELL=/bin/bash
-funcs-diff: $(OBJ) $(CI)
+funcs-diff: $(OBJ)
 	$(strip ./scripts/csv.py \
 		<(./scripts/csv.py \
-			<(./scripts/code.py $(OBJ) -q $(CODEFLAGS) -o-) \
+			<(./scripts/code.py $^ -q $(CODEFLAGS) -o-) \
 			-fcode=size -q -o-) \
 		<(./scripts/csv.py \
-			<(./scripts/stack.py $(CI) -q $(STACKFLAGS) -o-) \
+			<(./scripts/stack.py $^ -q $(STACKFLAGS) -o-) \
 			-fstack='max(limit)' -q -o-) \
 		<(./scripts/csv.py \
-			<(./scripts/ctx.py $(OBJ) -q $(CTXFLAGS) -o-) \
+			<(./scripts/ctx.py $^ -q $(CTXFLAGS) -o-) \
 			-fctx='max(size)' -q -o-) \
 		-bfunction -fcode -fstack='max(stack)' -fctx='max(ctx)' \
 		$(SUMMARYFLAGS) -d <(./scripts/csv.py \
@@ -375,20 +372,20 @@ summary sizes: \
 ## Compare compile-time sizes
 .PHONY: summary-diff sizes-diff
 summary-diff sizes-diff: SHELL=/bin/bash
-summary-diff sizes-diff: $(OBJ) $(CI)
+summary-diff sizes-diff: $(OBJ)
 	$(strip ./scripts/csv.py \
 		<(./scripts/csv.py \
 			<(./scripts/csv.py \
-				<(./scripts/code.py $(OBJ) -q $(CODEFLAGS) -o-) \
+				<(./scripts/code.py $^ -q $(CODEFLAGS) -o-) \
 				-fcode=size -q -o-) \
 			<(./scripts/csv.py \
-				<(./scripts/data.py $(OBJ) -q $(DATAFLAGS) -o-) \
+				<(./scripts/data.py $^ -q $(DATAFLAGS) -o-) \
 				-fdata=size -q -o-) \
 			<(./scripts/csv.py \
-				<(./scripts/stack.py $(CI) -q $(STACKFLAGS) -o-) \
+				<(./scripts/stack.py $^ -q $(STACKFLAGS) -o-) \
 				-fstack='max(limit)' -q -o-) \
 			<(./scripts/csv.py \
-				<(./scripts/ctx.py $(OBJ) -q $(CTXFLAGS) -o-) \
+				<(./scripts/ctx.py $^ -q $(CTXFLAGS) -o-) \
 				-fctx='max(size)' -q -o-) \
 			-fcode -fdata -fstack='max(stack)' -fctx='max(ctx)' \
 			-bbuild='"AFTER"' -q -o-) \
@@ -507,7 +504,7 @@ $(BUILDDIR)/lfs.code.csv: $(OBJ)
 $(BUILDDIR)/lfs.data.csv: $(OBJ)
 	./scripts/data.py $^ -q $(DATAFLAGS) -o $@
 
-$(BUILDDIR)/lfs.stack.csv: $(CI)
+$(BUILDDIR)/lfs.stack.csv: $(OBJ)
 	./scripts/stack.py $^ -q $(STACKFLAGS) -o $@
 
 $(BUILDDIR)/lfs.ctx.csv: $(OBJ)
@@ -543,9 +540,7 @@ $(BUILDDIR)/runners/test_runner: $(TEST_OBJ)
 $(BUILDDIR)/runners/bench_runner: $(BENCH_OBJ)
 	$(CC) $(CFLAGS) $^ $(LFLAGS) -o $@
 
-# our main build rule generates .o, .d, and .ci files, the latter
-# used for stack analysis
-$(BUILDDIR)/%.o $(BUILDDIR)/%.ci: %.c
+$(BUILDDIR)/%.o: %.c
 	$(CC) -c -MMD $(CFLAGS) $< -o $(BUILDDIR)/$*.o
 
 $(BUILDDIR)/%.s: %.c
@@ -587,13 +582,11 @@ clean:
 	rm -f $(OBJ)
 	rm -f $(DEP)
 	rm -f $(ASM)
-	rm -f $(CI)
 	rm -f $(TEST_RUNNER)
 	rm -f $(TEST_A)
 	rm -f $(TEST_C)
 	rm -f $(TEST_OBJ)
 	rm -f $(TEST_DEP)
-	rm -f $(TEST_CI)
 	rm -f $(TEST_GCNO)
 	rm -f $(TEST_GCDA)
 	rm -f $(TEST_PERF)
@@ -604,7 +597,6 @@ clean:
 	rm -f $(BENCH_C)
 	rm -f $(BENCH_OBJ)
 	rm -f $(BENCH_DEP)
-	rm -f $(BENCH_CI)
 	rm -f $(BENCH_GCNO)
 	rm -f $(BENCH_GCDA)
 	rm -f $(BENCH_PERF)
diff --git a/scripts/stack.py b/scripts/stack.py
index b2a2520d..7bcf71bb 100755
--- a/scripts/stack.py
+++ b/scripts/stack.py
@@ -16,9 +16,14 @@ __import__('sys').path.pop(0)
 import collections as co
 import csv
 import itertools as it
+import functools as ft
 import math as mt
 import os
 import re
+import subprocess as sp
+
+
+OBJDUMP_PATH = ['objdump']
 
 
 # integer fields
@@ -125,25 +130,30 @@ class RInt(co.namedtuple('RInt', 'x')):
 
 # size results
 class StackResult(co.namedtuple('StackResult', [
-        'file', 'function', 'frame', 'limit', 'children'])):
+        'file', 'function',
+        'frame', 'limit',
+        'children', 'notes'])):
     _by = ['file', 'function']
     _fields = ['frame', 'limit']
     _sort = ['limit', 'frame']
     _types = {'frame': RInt, 'limit': RInt}
     _children = 'children'
+    _notes = 'notes'
 
     __slots__ = ()
     def __new__(cls, file='', function='', frame=0, limit=0,
-            children=None):
+            children=None, notes=None):
         return super().__new__(cls, file, function,
                 RInt(frame), RInt(limit),
-                children if children is not None else [])
+                children if children is not None else [],
+                notes if notes is not None else [])
 
     def __add__(self, other):
         return StackResult(self.file, self.function,
                 self.frame + other.frame,
                 max(self.limit, other.limit),
-                self.children + other.children)
+                self.children + other.children,
+                self.notes + other.notes)
 
 
 def openio(path, mode='r', buffering=-1):
@@ -156,151 +166,777 @@ def openio(path, mode='r', buffering=-1):
     else:
         return open(path, mode, buffering)
 
-def collect(ci_paths, *,
+class Sym(co.namedtuple('Sym', [
+        'name', 'global_', 'section', 'addr', 'size'])):
+    __slots__ = ()
+    def __new__(cls, name, global_, section, addr, size):
+        return super().__new__(cls, name, global_, section, addr, size)
+
+    def __repr__(self):
+        return '%s(%r, %r, %r, 0x%x, 0x%x)' % (
+                self.__class__.__name__,
+                self.name,
+                self.global_,
+                self.section,
+                self.addr,
+                self.size)
+
+class SymInfo:
+    def __init__(self, syms):
+        self.syms = syms
+
+    def get(self, k, d=None):
+        # allow lookup by both symbol and address
+        if isinstance(k, str):
+            # organize by symbol, note multiple symbols can share a name
+            if not hasattr(self, '_by_sym'):
+                by_sym = {}
+                for sym in self.syms:
+                    if sym.name not in by_sym:
+                        by_sym[sym.name] = []
+                    if sym not in by_sym[sym.name]:
+                        by_sym[sym.name].append(sym)
+                self._by_sym = by_sym
+
+            return self._by_sym.get(k, d)
+
+        else:
+            import bisect
+
+            # organize by address
+            if not hasattr(self, '_by_addr'):
+                # sort and keep largest/first when duplicates
+                syms = self.syms.copy()
+                syms.sort(key=lambda x: (x.addr, -x.size))
+
+                by_addr = []
+                for sym in syms:
+                    if (len(by_addr) == 0
+                            or by_addr[-1].addr != sym.addr):
+                        by_addr.append(sym)
+                self._by_addr = by_addr
+
+            # find sym by range
+            i = bisect.bisect(self._by_addr, k,
+                    key=lambda x: x.addr)
+            # check that we're actually in this sym's size
+            if i > 0 and k < self._by_addr[i-1].addr+self._by_addr[i-1].size:
+                return self._by_addr[i-1]
+            else:
+                return d
+
+    def __getitem__(self, k):
+        v = self.get(k)
+        if v is None:
+            raise KeyError(k)
+        return v
+
+    def __contains__(self, k):
+        return self.get(k) is not None
+
+    def __len__(self):
+        return len(self.syms)
+
+    def __iter__(self):
+        return iter(self.syms)
+
+    def globals(self):
+        return SymInfo([sym for sym in self.syms
+                if sym.global_])
+
+    def section(self, section):
+        return SymInfo([sym for sym in self.syms
+                # note we accept prefixes
+                if s.startswith(section)])
+
+def collect_syms(obj_path, global_=False, sections=None, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    symbol_pattern = re.compile(
+            '^(?P<addr>[0-9a-fA-F]+)'
+                ' (?P<scope>.).*'
+                '\s+(?P<section>[^\s]+)'
+                '\s+(?P<size>[0-9a-fA-F]+)'
+                '\s+(?P<name>[^\s]+)\s*$')
+
+    # find symbol addresses and sizes
+    syms = []
+    cmd = objdump_path + ['--syms', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    proc = sp.Popen(cmd,
+            stdout=sp.PIPE,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False)
+    for line in proc.stdout:
+        m = symbol_pattern.match(line)
+        if m:
+            name = m.group('name')
+            scope = m.group('scope')
+            section = m.group('section')
+            addr = int(m.group('addr'), 16)
+            size = int(m.group('size'), 16)
+            # skip non-globals?
+            # l => local
+            # g => global
+            # u => unique global
+            #   => neither
+            # ! => local + global
+            global__ = scope not in 'l '
+            if global_ and not global__:
+                continue
+            # filter by section? note we accept prefixes
+            if (sections is not None
+                    and not any(section.startswith(prefix)
+                        for prefix in sections)):
+                continue
+            # skip zero sized symbols
+            if not size:
+                continue
+            # note multiple symbols can share a name
+            syms.append(Sym(name, global__, section, addr, size))
+    proc.wait()
+    if proc.returncode != 0:
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    return SymInfo(syms)
+
+def collect_dwarf_files(obj_path, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    line_pattern = re.compile(
+            '^\s*(?P<no>[0-9]+)'
+                '(?:\s+(?P<dir>[0-9]+))?'
+                '.*\s+(?P<path>[^\s]+)\s*$')
+
+    # find source paths
+    dirs = co.OrderedDict()
+    files = co.OrderedDict()
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=rawline', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    proc = sp.Popen(cmd,
+            stdout=sp.PIPE,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False)
+    for line in proc.stdout:
+        # note that files contain references to dirs, which we
+        # dereference as soon as we see them as each file table
+        # follows a dir table
+        m = line_pattern.match(line)
+        if m:
+            if not m.group('dir'):
+                # found a directory entry
+                dirs[int(m.group('no'))] = m.group('path')
+            else:
+                # found a file entry
+                dir = int(m.group('dir'))
+                if dir in dirs:
+                    files[int(m.group('no'))] = os.path.join(
+                            dirs[dir],
+                            m.group('path'))
+                else:
+                    files[int(m.group('no'))] = m.group('path')
+    proc.wait()
+    if proc.returncode != 0:
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    # simplify paths
+    files_ = co.OrderedDict()
+    for no, file in files.items():
+        if os.path.commonpath([
+                    os.getcwd(),
+                    os.path.abspath(file)]) == os.getcwd():
+            files_[no] = os.path.relpath(file)
+        else:
+            files_[no] = os.path.abspath(file)
+    files = files_
+
+    return files
+
+# each dwarf entry can have attrs and children entries
+class DwarfEntry:
+    def __init__(self, level, off, tag, ats={}, children=[]):
+        self.level = level
+        self.off = off
+        self.tag = tag
+        self.ats = ats or {}
+        self.children = children or []
+
+    def get(self, k, d=None):
+        return self.ats.get(k, d)
+
+    def __getitem__(self, k):
+        return self.ats[k]
+
+    def __contains__(self, k):
+        return k in self.ats
+
+    def __repr__(self):
+        return '%s(%d, 0x%x, %r, %r)' % (
+                self.__class__.__name__,
+                self.level,
+                self.off,
+                self.tag,
+                self.ats)
+
+    @ft.cached_property
+    def name(self):
+        if 'DW_AT_name' in self:
+            name = self['DW_AT_name'].split(':')[-1].strip()
+            # prefix with struct/union/enum
+            if self.tag == 'DW_TAG_structure_type':
+                name = 'struct ' + name
+            elif self.tag == 'DW_TAG_union_type':
+                name = 'union ' + name
+            elif self.tag == 'DW_TAG_enumeration_type':
+                name = 'enum ' + name
+            return name
+        else:
+            return None
+
+    @ft.cached_property
+    def addr(self):
+        if (self.tag == 'DW_TAG_subprogram'
+                and 'DW_AT_low_pc' in self):
+            return int(self['DW_AT_low_pc'], 0)
+        else:
+            return None
+
+    @ft.cached_property
+    def size(self):
+        if (self.tag == 'DW_TAG_subprogram'
+                and 'DW_AT_high_pc' in self):
+            # this looks wrong, but high_pc does store the size,
+            # for whatever reason
+            return int(self['DW_AT_high_pc'], 0)
+        else:
+            return None
+
+    def info(self, tags=None):
+        # recursively flatten children
+        def flatten(entry):
+            for child in entry.children:
+                # filter if requested
+                if tags is None or child.tag in tags:
+                    yield child
+
+                yield from flatten(child)
+
+        return DwarfInfo(co.OrderedDict(
+                (child.off, child) for child in flatten(self)))
+
+# a collection of dwarf entries
+class DwarfInfo:
+    def __init__(self, entries):
+        self.entries = entries
+
+    def get(self, k, d=None):
+        # allow lookup by offset, symbol, or dwarf name
+        if not isinstance(k, str) and not hasattr(k, 'addr'):
+            return self.entries.get(k, d)
+
+        elif hasattr(k, 'addr'):
+            import bisect
+
+            # organize by address
+            if not hasattr(self, '_by_addr'):
+                # sort and keep largest/first when duplicates
+                entries = [entry
+                        for entry in self.entries.values()
+                        if entry.addr is not None
+                            and entry.size is not None]
+                entries.sort(key=lambda x: (x.addr, -x.size))
+
+                by_addr = []
+                for entry in entries:
+                    if (len(by_addr) == 0
+                            or by_addr[-1].addr != entry.addr):
+                        by_addr.append(entry)
+                self._by_addr = by_addr
+
+            # find entry by range
+            i = bisect.bisect(self._by_addr, k.addr,
+                    key=lambda x: x.addr)
+            # check that we're actually in this entry's size
+            if (i > 0
+                    and k.addr
+                        < self._by_addr[i-1].addr
+                            + self._by_addr[i-1].size):
+                return self._by_addr[i-1]
+            else:
+                # fallback to lookup by name
+                return self.get(k.name, d)
+
+        else:
+            # organize entries by name
+            if not hasattr(self, '_by_name'):
+                self._by_name = {}
+                for entry in self.entries.values():
+                    if entry.name is not None:
+                        self._by_name[entry.name] = entry
+
+            # exact match? do a quick lookup
+            if k in self._by_name:
+                return self._by_name[k]
+            # find the best matching dwarf entry with a simple
+            # heuristic
+            #
+            # this can be different from the actual symbol because
+            # of optimization passes
+            else:
+                def key(entry):
+                    i = entry.name.find(k)
+                    if i == -1:
+                        return None
+                    return (i, len(entry.name)-(i+len(k)), entry.name)
+                return min(
+                        filter(key, self._by_name.values()),
+                        key=key,
+                        default=d)
+
+    def __getitem__(self, k):
+        v = self.get(k)
+        if v is None:
+            raise KeyError(k)
+        return v
+
+    def __contains__(self, k):
+        return self.get(k) is not None
+
+    def __len__(self):
+        return len(self.entries)
+
+    def __iter__(self):
+        return iter(self.entries.values())
+
+def collect_dwarf_info(obj_path, tags=None, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    info_pattern = re.compile(
+            '^\s*<(?P<level>[^>]*)>'
+                    '\s*<(?P<off>[^>]*)>'
+                    '.*\(\s*(?P<tag>[^)]*?)\s*\)\s*$'
+                '|' '^\s*<(?P<off_>[^>]*)>'
+                    '\s*(?P<at>[^>:]*?)'
+                    '\s*:(?P<v>.*)\s*$')
+
+    # collect dwarf entries
+    info = co.OrderedDict()
+    entry = None
+    levels = {}
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=info', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    proc = sp.Popen(cmd,
+            stdout=sp.PIPE,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False)
+    for line in proc.stdout:
+        # state machine here to find dwarf entries
+        m = info_pattern.match(line)
+        if m:
+            if m.group('tag'):
+                entry = DwarfEntry(
+                    level=int(m.group('level'), 0),
+                    off=int(m.group('off'), 16),
+                    tag=m.group('tag').strip(),
+                )
+                # keep track of unfiltered entries
+                if tags is None or entry.tag in tags:
+                    info[entry.off] = entry
+                # store entry in parent
+                levels[entry.level] = entry
+                if entry.level-1 in levels:
+                    levels[entry.level-1].children.append(entry)
+            elif m.group('at'):
+                if entry:
+                    entry.ats[m.group('at').strip()] = (
+                            m.group('v').strip())
+    proc.wait()
+    if proc.returncode != 0:
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    # resolve abstract origins
+    for entry in info.values():
+        if 'DW_AT_abstract_origin' in entry:
+            off = int(entry['DW_AT_abstract_origin'].strip('<>'), 0)
+            origin = info[off]
+            assert 'DW_AT_abstract_origin' not in origin, (
+                    "Recursive abstract origin?")
+
+            for k, v in origin.ats.items():
+                if k not in entry.ats:
+                    entry.ats[k] = v
+
+    return DwarfInfo(info)
+
+class Frame(co.namedtuple('Sym', ['addr', 'frame'])):
+    __slots__ = ()
+    def __new__(cls, addr, frame):
+        return super().__new__(cls, addr, frame)
+
+    def __repr__(self):
+        return '%s(0x%x, %d)' % (
+                self.__class__.__name__,
+                self.addr,
+                self.frame)
+
+class FrameInfo:
+    def __init__(self, frames):
+        self.frames = frames
+
+    def get(self, k, d=None):
+        import bisect
+
+        # organize by address
+        if not hasattr(self, '_by_addr'):
+            # sort and keep largest when duplicates
+            frames = self.frames.copy()
+            frames.sort(key=lambda x: (x.addr, -x.frame))
+
+            by_addr = []
+            for frame in frames:
+                if (len(by_addr) == 0
+                        or by_addr[-1].addr != frame.addr):
+                    by_addr.append(frame)
+            self._by_addr = by_addr
+
+        # allow lookup by addr or range of addrs
+        if not isinstance(k, slice):
+            # find frame by addr
+            i = bisect.bisect(self._by_addr, k,
+                    key=lambda x: x.addr)
+            if i > 0:
+                return self._by_addr[i-1]
+            else:
+                return d
+
+        else:
+            # find frame by range
+            if k.start is None:
+                start = 0
+            else:
+                start = max(
+                        bisect.bisect(self._by_addr, k.start,
+                            key=lambda x: x.addr) - 1,
+                        0)
+            if k.stop is None:
+                stop = len(self._by_addr)
+            else:
+                stop = bisect.bisect(self._by_addr, k.stop,
+                        key=lambda x: x.addr)
+
+            return FrameInfo(self._by_addr[start:stop])
+
+    def __getitem__(self, k):
+        v = self.get(k)
+        if v is None:
+            raise KeyError(k)
+        return v
+
+    def __contains__(self, k):
+        return self.get(k) is not None
+
+    def __len__(self):
+        return len(self.frames)
+
+    def __iter__(self):
+        return iter(self.frames)
+
+def collect_dwarf_frames(obj_path, tags=None, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    frame_pattern = re.compile(
+            '^\s*(?P<cie_off>[0-9a-fA-F]+)'
+                    '\s+(?P<cie_size>[0-9a-fA-F]+)'
+                    '\s+(?P<cie_id>[0-9a-fA-F]+)'
+                    '\s+CIE\s*$'
+                '|' '^\s*(?P<fde_off>[0-9a-fA-F]+)'
+                    '\s+(?P<fde_size>[0-9a-fA-F]+)'
+                    '\s+(?P<fde_id>[0-9a-fA-F]+)'
+                    '\s+FDE'
+                    '\s+cie=(?P<fde_cie>[0-9a-fA-F]+)'
+                    '\s+pc=(?P<fde_pc_lo>[0-9a-fA-F]+)'
+                        '\.\.(?P<fde_pc_hi>[0-9a-fA-F]+)\s*$'
+                '|' '^\s*(?P<op>DW_CFA_[^\s:]*)\s*:?'
+                    '\s*(?P<change>.*?)\s*$')
+
+    # collect frame info
+    #
+    # Frame info is encoded in a state machine stored in fde/cie
+    # entries. fde entries can share cie entries, otherwise they are
+    # mostly the same.
+    #
+    cies = co.OrderedDict()
+    fdes = co.OrderedDict()
+    entry = None
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=frames', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    proc = sp.Popen(cmd,
+            stdout=sp.PIPE,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False)
+    for line in proc.stdout:
+        # state machine here to find fde/cie entries
+        m = frame_pattern.match(line)
+        if m:
+            # start cie?
+            if m.group('cie_off'):
+                entry = {
+                        'type': 'cie',
+                        'off': int(m.group('cie_off'), 16),
+                        'ops': []}
+                cies[entry['off']] = entry
+
+            # start fde?
+            elif m.group('fde_off'):
+                entry = {
+                        'type': 'fde',
+                        'off': int(m.group('fde_off'), 16),
+                        'cie': int(m.group('fde_cie'), 16),
+                        'pc': (
+                            int(m.group('fde_pc_lo'), 16),
+                            int(m.group('fde_pc_hi'), 16)),
+                        'ops': []}
+                fdes[entry['off']] = entry
+
+            # found op?
+            elif m.group('op'):
+                entry['ops'].append((m.group('op'), m.group('change')))
+
+            else:
+                assert False
+    proc.wait()
+    if proc.returncode != 0:
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    # execute the state machine
+    frames = []
+    for _, fde in fdes.items():
+        cie = cies[fde['cie']]
+
+        cfa_loc = fde['pc'][0]
+        cfa_stack = []
+        for op, change in it.chain(cie['ops'], fde['ops']):
+            # advance location
+            if op in {
+                    'DW_CFA_advance_loc',
+                    'DW_CFA_advance_loc1',
+                    'DW_CFA_advance_loc2',
+                    'DW_CFA_advance_loc4'}:
+                cfa_loc = int(change.split('to')[-1], 16)
+            # change cfa offset
+            elif op in {
+                    'DW_CFA_def_cfa',
+                    'DW_CFA_def_cfa_offset'}:
+                cfa_off = int(change.split('ofs')[-1], 0)
+                frames.append(Frame(cfa_loc, cfa_off))
+            # push state, because of course we need a stack
+            elif op == 'DW_CFA_remember_state':
+                cfa_stack.append(cfa_off)
+            # pop state
+            elif op == 'DW_CFA_restore_state':
+                cfa_off = cfa_stack.pop()
+            # ignore these
+            elif op in {
+                    'DW_CFA_nop',
+                    'DW_CFA_offset',
+                    'DW_CFA_restore'}:
+                pass
+            else:
+                assert False, "Unknown frame op? %r" % op
+
+    return FrameInfo(frames)
+
+def collect(obj_paths, *,
         sources=None,
         everything=False,
         **args):
-    # parse the vcg format
-    k_pattern = re.compile('([a-z]+)\s*:', re.DOTALL)
-    v_pattern = re.compile('(?:"(.*?)"|([a-z]+))', re.DOTALL)
-    def parse_vcg(rest):
-        def parse_vcg(rest):
-            node = []
-            while True:
-                rest = rest.lstrip()
-                m_ = k_pattern.match(rest)
-                if not m_:
-                    return (node, rest)
-                k, rest = m_.group(1), rest[m_.end(0):]
+    funcs = []
+    globals = co.OrderedDict()
+    for obj_path in obj_paths:
+        # find relevant symbols
+        syms = collect_syms(obj_path,
+                sections=['.text'],
+                **args)
 
-                rest = rest.lstrip()
-                if rest.startswith('{'):
-                    v, rest = parse_vcg(rest[1:])
-                    assert rest[0] == '}', "unexpected %r" % rest[0:1]
-                    rest = rest[1:]
-                    node.append((k, v))
-                else:
-                    m_ = v_pattern.match(rest)
-                    assert m_, "unexpected %r" % rest[0:1]
-                    v, rest = m_.group(1) or m_.group(2), rest[m_.end(0):]
-                    node.append((k, v))
+        # find source paths
+        files = collect_dwarf_files(obj_path, **args)
 
-        node, rest = parse_vcg(rest)
-        assert rest == '', "unexpected %r" % rest[0:1]
-        return node
+        # find dwarf info, we only care about functions
+        info = collect_dwarf_info(obj_path,
+                tags={'DW_TAG_subprogram'},
+                **args)
 
-    # collect into functions
-    callgraph = co.defaultdict(lambda: (None, None, 0, set()))
-    f_pattern = re.compile(
-            r'([^\\]*)\\n([^:]*)[^\\]*\\n([0-9]+) bytes \((.*)\)')
-    for path in ci_paths:
-        with open(path) as f:
-            vcg = parse_vcg(f.read())
-        for k, graph in vcg:
-            if k != 'graph':
+        # find frame info
+        frames = collect_dwarf_frames(obj_path, **args)
+
+        # find the max stack frame for each function
+        locals = co.OrderedDict()
+        for sym in syms:
+            # discard internal functions
+            if not everything and sym.name.startswith('__'):
                 continue
-            for k, info in graph:
-                if k == 'node':
-                    info = dict(info)
-                    m_ = f_pattern.match(info['label'])
-                    if m_:
-                        function, file, size, type = m_.groups()
-                        if (not args.get('quiet')
-                                and 'static' not in type
-                                and 'bounded' not in type):
-                            print("warning: found non-static stack "
-                                    "for %s (%s, %s)" % (
-                                        function, type, size))
-                        _, _, _, targets = callgraph[info['title']]
-                        callgraph[info['title']] = (
-                                file, function, int(size), targets)
-                elif k == 'edge':
-                    info = dict(info)
-                    _, _, _, targets = callgraph[info['sourcename']]
-                    targets.add(info['targetname'])
-                else:
+
+            # find best matching dwarf entry, this may have a slightly
+            # different name due to optimizations
+            entry = info.get(sym)
+
+            # if we have no file guess from obj path
+            if entry is not None and 'DW_AT_decl_file' in entry:
+                file = files.get(int(entry['DW_AT_decl_file']), '?')
+            else:
+                file = re.sub('(\.o)?$', '.c', obj_path, 1)
+
+            # ignore filtered sources
+            if sources is not None:
+                if not any(os.path.abspath(file) == os.path.abspath(s)
+                        for s in sources):
+                    continue
+            else:
+                # default to only cwd
+                if not everything and not os.path.commonpath([
+                        os.getcwd(),
+                        os.path.abspath(file)]) == os.getcwd():
                     continue
 
-    callgraph_ = co.defaultdict(lambda: (None, None, 0, set()))
-    for source, (s_file, s_function, frame, targets) in callgraph.items():
-        # discard internal functions
-        if not everything and s_function.startswith('__'):
-            continue
-        # ignore filtered sources
-        if sources is not None:
-            if not any(os.path.abspath(s_file) == os.path.abspath(s)
-                    for s in sources):
-                continue
-        else:
-            # default to only cwd
-            if not everything and not os.path.commonpath([
-                    os.getcwd(),
-                    os.path.abspath(s_file)]) == os.getcwd():
+            # find the stack frames for each function
+            frames_ = frames[sym.addr:sym.addr+sym.size]
+
+            func = {'file': file,
+                    'sym': sym,
+                    'entry': entry,
+                    'frames': frames_,
+                    'calls': []}
+            funcs.append(func)
+
+            # keep track of locals/globals
+            if sym.global_:
+                globals[sym.name] = func
+            if entry is not None:
+                locals[entry.off] = func
+
+        # link local function calls via dwarf entries
+        for caller in locals.values():
+            if not caller['entry']:
                 continue
 
-        # smiplify path
-        if os.path.commonpath([
-                os.getcwd(),
-                os.path.abspath(s_file)]) == os.getcwd():
-            s_file = os.path.relpath(s_file)
-        else:
-            s_file = os.path.abspath(s_file)
+            for call in caller['entry'].info(
+                    tags={'DW_TAG_call_site'}):
+                if ('DW_AT_call_return_pc' not in call
+                        or 'DW_AT_call_origin' not in call):
+                    continue
 
-        callgraph_[source] = (s_file, s_function, frame, targets)
-    callgraph = callgraph_
+                # note DW_AT_call_return_pc refers to the address
+                # _after_ the call
+                #
+                # we change this to the last byte in the call
+                # instruction, which is a bit weird, but should at least
+                # map to the right stack frame
+                addr = int(call['DW_AT_call_return_pc'], 0) - 1
+                off = int(call['DW_AT_call_origin'].strip('<>'), 0)
 
-    if not everything:
-        callgraph_ = co.defaultdict(lambda: (None, None, 0, set()))
-        for source, (s_file, s_function, frame, targets) in callgraph.items():
-            # discard filtered sources
-            if sources is not None and not any(
-                    os.path.abspath(s_file) == os.path.abspath(s)
-                    for s in sources):
-                continue
-            # discard internal functions
-            if s_function.startswith('__'):
-                continue
-            callgraph_[source] = (s_file, s_function, frame, targets)
-        callgraph = callgraph_
+                # callee in locals?
+                if off in locals:
+                    callee = locals[off]
+                else:
+                    # if not, just keep track of the symbol and try to link
+                    # during the global pass
+                    callee = info[off]
+                    if callee.name is None:
+                        continue
+                    callee = callee.name
 
-    # find maximum stack size recursively, this requires also detecting cycles
-    # (in case of recursion)
-    def find_limit(source, seen=set()):
-        if not hasattr(find_limit, 'cache'):
-            find_limit.cache = {}
-        if source in find_limit.cache:
-            return find_limit.cache[source]
+                caller['calls'].append((addr, callee))
 
-        if source not in callgraph:
-            return 0
-        _, _, frame, targets = callgraph[source]
+    # link global function calls via symbol
+    for caller in funcs:
+        calls_ = []
+        for addr, callee in caller['calls']:
+            if isinstance(callee, str):
+                if callee in globals:
+                    calls_.append((addr, globals[callee]))
+            else:
+                calls_.append((addr, callee))
+        caller['calls'] = calls_
 
-        limit = 0
-        for target in targets:
-            # found a cycle?
-            if target in seen:
-                return mt.inf
-            limit_ = find_limit(target, seen | {target})
-            limit = max(limit, limit_)
+    # recursive+cached limit finder
+    def limitof(func, seen=set()):
+        # found a cycle? stop here
+        if id(func) in seen:
+            return 0, 0
+        # cached?
+        if not hasattr(limitof, 'cache'):
+            limitof.cache = {}
+        if id(func) in limitof.cache:
+            return limitof.cache[id(func)]
 
-        find_limit.cache[source] = frame + limit
-        return frame + limit
+        # find max stack frame
+        frame = max((frame.frame for frame in func['frames']), default=0)
+
+        # find stack limit recursively
+        limit = frame
+        for addr, callee in func['calls']:
+            if args.get('no_shrinkwrap'):
+                frame_ = frame
+            else:
+                # use stack frame at call site
+                frame_ = func['frames'][addr].frame
+
+            _, limit_ = limitof(callee, seen | {id(func)})
+
+            limit = max(limit, frame_ + limit_)
+
+        limitof.cache[id(func)] = frame, limit
+        return frame, limit
+
+    # recursive+cached children finder
+    def childrenof(func, seen=set()):
+        # found a cycle? stop here
+        if id(func) in seen:
+            return [], ['cycle detected']
+        # cached?
+        if not hasattr(childrenof, 'cache'):
+            childrenof.cache = {}
+        if id(func) in childrenof.cache:
+            return childrenof.cache[id(func)]
+
+        # find children recursively
+        children = []
+        for addr, callee in func['calls']:
+            file_ = callee['file']
+            name_ = callee['sym'].name
+            frame_, limit_ = limitof(callee, seen | {id(func)})
+            children_, notes_ = childrenof(callee, seen | {id(func)})
+            children.append(StackResult(file_, name_, frame_, limit_,
+                    children=children_,
+                    notes=notes_))
+
+        childrenof.cache[id(func)] = children, []
+        return children, []
 
     # build results
-    results = {}
-    for source, (s_file, s_function, frame, _) in callgraph.items():
-        limit = find_limit(source)
-        results[source] = StackResult(s_file, s_function, frame, limit)
+    results = []
+    for func in funcs:
+        file = func['file']
+        name = func['sym'].name
+        frame, limit = limitof(func)
+        children, notes = childrenof(func)
 
-    # connect parents to their children, this may create a fully cyclic graph
-    # in the case of recursion
-    for source, (_, _, _, targets) in callgraph.items():
-        results[source].children.extend(
-                results[target]
-                    for target in targets
-                    if target in results)
+        results.append(StackResult(file, name, frame, limit,
+                children=children,
+                notes=notes))
 
-    return list(results.values())
+    return results
 
 
 def fold(Result, results, by=None, defines=[]):
@@ -682,7 +1318,7 @@ def table(Result, results, diff_results=None, *,
                     for i, x in enumerate(line[1:], 1))))
 
 
-def main(ci_paths,
+def main(obj_paths,
         by=None,
         fields=None,
         defines=[],
@@ -696,7 +1332,7 @@ def main(ci_paths,
 
     # find sizes
     if not args.get('use', None):
-        results = collect(ci_paths, **args)
+        results = collect(obj_paths, **args)
     else:
         results = []
         with openio(args['use']) as f:
@@ -798,9 +1434,9 @@ if __name__ == "__main__":
             description="Find stack usage at the function level.",
             allow_abbrev=False)
     parser.add_argument(
-            'ci_paths',
+            'obj_paths',
             nargs='*',
-            help="Input *.ci files.")
+            help="Input *.o files.")
     parser.add_argument(
             '-v', '--verbose',
             action='store_true',
@@ -880,6 +1516,11 @@ if __name__ == "__main__":
             '--everything',
             action='store_true',
             help="Include builtin and libc specific symbols.")
+    parser.add_argument(
+            '--no-shrinkwrap',
+            action='store_true',
+            help="Ignore the effects of shrinkwrap optimizations (assume one "
+                "big frame per function).")
     parser.add_argument(
             '-z', '--depth',
             nargs='?',
@@ -896,6 +1537,12 @@ if __name__ == "__main__":
             '-e', '--error-on-recursion',
             action='store_true',
             help="Error if any functions are recursive.")
+    parser.add_argument(
+            '--objdump-path',
+            type=lambda x: x.split(),
+            default=OBJDUMP_PATH,
+            help="Path to the objdump executable, may include flags. "
+                "Defaults to %r." % OBJDUMP_PATH)
     sys.exit(main(**{k: v
             for k, v in vars(parser.parse_intermixed_args()).items()
             if v is not None}))