scripts: Simplified result->file mapping, dropped collect_dwarf_files

This reverts per-result source file mapping, and tears out a bunch of
messy dwarf parsing code. Results from the same .o file are now mapped
to the same source file.

This was just way too much complexity for slightly better result->file
mapping, which risked losing results accidentally mapped to the wrong
file.

---

I was originally going to revert all the way back to relying strictly on
the .o name and --build-dir (490e1c4) (this is the simplest solution),
but after poking around in dwarf-info a bit, I realized we do have
access to the original source file in DW_TAG_compile_unit's
DW_AT_comp_dir + DW_AT_name.

This is much simpler/more robust than parsing objdump --dwarf=rawline,
and avoids needing --build-dir in a bunch of scripts.

---

This also reverts stack.py to rely only on the .ci files. These seem as
reliable as DW_TAG_compile_unit while simplifying things significantly.

Symbol mapping used to be a problem, but this was fixed by using the
symbol in the title field instead of the label field (which strips some
optimization suffixes?)
This commit is contained in:
Christopher Haster
2024-12-11 18:45:46 -06:00
parent dad3367e9e
commit 4c87d59c7b
9 changed files with 190 additions and 1222 deletions

View File

@@ -229,6 +229,9 @@ class SymInfo:
def __contains__(self, k):
# k counts as present only when self.get(k) returns something other
# than None
return self.get(k) is not None
def __bool__(self):
# truthiness mirrors the truthiness of the underlying self.syms
return bool(self.syms)
def __len__(self):
# length is simply the length of the underlying self.syms
return len(self.syms)
@@ -297,61 +300,6 @@ def collect_syms(obj_path, global_=False, sections=None, *,
return SymInfo(syms)
def collect_dwarf_files(obj_path, *,
        objdump_path=OBJDUMP_PATH,
        **args):
    """Map dwarf file numbers to source paths for one object file.

    Runs objdump with --dwarf=rawline on obj_path and scans its output
    for directory-table and file-table entries. File entries that
    reference a known directory number are joined with that directory.

    Args:
        obj_path: Path of the object file handed to objdump.
        objdump_path: The objdump command as a list; it may already
            contain extra arguments.
        **args: Extra options; only 'verbose' is read here, which
            prints the command before running it.

    Returns:
        An OrderedDict mapping file number -> path, where paths under
        the current working directory are made relative and all other
        paths are made absolute.

    Raises:
        subprocess.CalledProcessError: If objdump exits non-zero.
    """
    # raw strings, otherwise \s is an invalid string escape
    line_pattern = re.compile(
            r'^\s*(?P<no>[0-9]+)'
            r'(?:\s+(?P<dir>[0-9]+))?'
            r'.*\s+(?P<path>[^\s]+)\s*$')

    # find source paths
    dirs = co.OrderedDict()
    files = co.OrderedDict()
    # note objdump-path may contain extra args
    cmd = objdump_path + ['--dwarf=rawline', obj_path]
    if args.get('verbose'):
        print(' '.join(shlex.quote(c) for c in cmd))
    # the context manager closes the pipe and waits for the process,
    # even if parsing raises
    with sp.Popen(cmd,
            stdout=sp.PIPE,
            universal_newlines=True,
            errors='replace',
            close_fds=False) as proc:
        for line in proc.stdout:
            # note that files contain references to dirs, which we
            # dereference as soon as we see them as each file table
            # follows a dir table
            m = line_pattern.match(line)
            if not m:
                continue
            no = int(m.group('no'))
            if not m.group('dir'):
                # found a directory entry
                dirs[no] = m.group('path')
                continue
            # found a file entry
            dir_no = int(m.group('dir'))
            if dir_no in dirs:
                files[no] = os.path.join(dirs[dir_no], m.group('path'))
            else:
                files[no] = m.group('path')
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    # simplify paths, relative if inside cwd, absolute otherwise
    cwd = os.getcwd()
    simplified = co.OrderedDict()
    for no, file in files.items():
        if os.path.commonpath([cwd, os.path.abspath(file)]) == cwd:
            simplified[no] = os.path.relpath(file)
        else:
            simplified[no] = os.path.abspath(file)

    return simplified
# each dwarf entry can have attrs and children entries
class DwarfEntry:
def __init__(self, level, off, tag, ats={}, children=[]):
@@ -393,79 +341,16 @@ class DwarfEntry:
else:
return None
@ft.cached_property
def addr(self):
    """Address of this entry, or None.

    Only DW_TAG_subprogram entries carrying DW_AT_low_pc have an
    address; the attribute is parsed with int(..., 0) so the base is
    taken from the literal's prefix.
    """
    if self.tag != 'DW_TAG_subprogram':
        return None
    if 'DW_AT_low_pc' not in self:
        return None
    return int(self['DW_AT_low_pc'], 0)
@ft.cached_property
def size(self):
    """Size of this entry, or None.

    Only DW_TAG_subprogram entries carrying DW_AT_high_pc have a size.
    """
    if self.tag != 'DW_TAG_subprogram':
        return None
    if 'DW_AT_high_pc' not in self:
        return None
    # not a typo: high_pc is read as the size here, not as an end
    # address
    return int(self['DW_AT_high_pc'], 0)
def info(self, tags=None):
    """Collect this entry's descendants into a DwarfInfo.

    Descendants are visited depth-first, parents before their
    children, and keyed by their offset. If tags is given, only
    entries whose tag is in tags are kept, but filtered-out entries
    are still descended into.
    """
    # walk every descendant, regardless of filtering
    def descendants(entry):
        for child in entry.children:
            yield child
            yield from descendants(child)

    selected = co.OrderedDict()
    for entry in descendants(self):
        # filter if requested
        if tags is None or entry.tag in tags:
            selected[entry.off] = entry

    return DwarfInfo(selected)
# a collection of dwarf entries
class DwarfInfo:
def __init__(self, entries):
self.entries = entries
def get(self, k, d=None):
# allow lookup by offset, symbol, or dwarf name
if not isinstance(k, str) and not hasattr(k, 'addr'):
# allow lookup by offset or dwarf name
if not isinstance(k, str):
return self.entries.get(k, d)
elif hasattr(k, 'addr'):
import bisect
# organize by address
if not hasattr(self, '_by_addr'):
# sort and keep largest/first when duplicates
entries = [entry
for entry in self.entries.values()
if entry.addr is not None
and entry.size is not None]
entries.sort(key=lambda x: (x.addr, -x.size))
by_addr = []
for entry in entries:
if (len(by_addr) == 0
or by_addr[-1].addr != entry.addr):
by_addr.append(entry)
self._by_addr = by_addr
# find entry by range
i = bisect.bisect(self._by_addr, k.addr,
key=lambda x: x.addr)
# check that we're actually in this entry's size
if (i > 0
and k.addr
< self._by_addr[i-1].addr
+ self._by_addr[i-1].size):
return self._by_addr[i-1]
else:
# fallback to lookup by name
return self.get(k.name, d)
else:
# organize entries by name
if not hasattr(self, '_by_name'):
@@ -502,6 +387,9 @@ class DwarfInfo:
def __contains__(self, k):
# k counts as present only when self.get(k) returns something other
# than None
return self.get(k) is not None
def __bool__(self):
# truthiness mirrors the truthiness of the underlying self.entries
return bool(self.entries)
def __len__(self):
# length is simply the length of the underlying self.entries
return len(self.entries)
@@ -557,22 +445,9 @@ def collect_dwarf_info(obj_path, tags=None, *,
if proc.returncode != 0:
raise sp.CalledProcessError(proc.returncode, proc.args)
# resolve abstract origins
for entry in info.values():
if 'DW_AT_abstract_origin' in entry:
off = int(entry['DW_AT_abstract_origin'].strip('<>'), 0)
origin = info[off]
assert 'DW_AT_abstract_origin' not in origin, (
"Recursive abstract origin?")
for k, v in origin.ats.items():
if k not in entry.ats:
entry.ats[k] = v
return DwarfInfo(info)
def collect(obj_paths, *,
sources=None,
everything=False,
**args):
results = []
@@ -582,42 +457,38 @@ def collect(obj_paths, *,
sections=SECTIONS,
**args)
# find source paths
files = collect_dwarf_files(obj_path, **args)
# find dwarf info
info = collect_dwarf_info(obj_path,
tags={'DW_TAG_subprogram', 'DW_TAG_variable'},
tags={'DW_TAG_compile_unit'},
**args)
# map function sizes to debug symbols
# find source file from dwarf info
for entry in info:
if (entry.tag == 'DW_TAG_compile_unit'
and 'DW_AT_name' in entry
and 'DW_AT_comp_dir' in entry):
file = os.path.join(
entry['DW_AT_comp_dir'].split(':')[-1].strip(),
entry['DW_AT_name'].split(':')[-1].strip())
break
else:
# guess from obj path
file = re.sub('(\.o)?$', '.c', obj_path, 1)
# simplify path
if os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
file = os.path.relpath(file)
else:
file = os.path.abspath(file)
# find function sizes
for sym in syms:
# discard internal functions
if not everything and sym.name.startswith('__'):
continue
# find best matching dwarf entry, this may be slightly different
# due to optimizations
entry = info.get(sym)
# if we have no file guess from obj path
if entry is not None and 'DW_AT_decl_file' in entry:
file = files.get(int(entry['DW_AT_decl_file']), '?')
else:
file = re.sub('(\.o)?$', '.c', obj_path, 1)
# ignore filtered sources
if sources is not None:
if not any(os.path.abspath(file) == os.path.abspath(s)
for s in sources):
continue
else:
# default to only cwd
if not everything and not os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd():
continue
results.append(CodeResult(file, sym.name, sym.size))
return results
@@ -1170,12 +1041,6 @@ if __name__ == "__main__":
nargs='?',
action=AppendSort,
help="Sort by this field, but backwards.")
parser.add_argument(
'-F', '--source',
dest='sources',
action='append',
help="Only consider definitions in this file. Defaults to "
"anything in the current directory.")
parser.add_argument(
'--everything',
action='store_true',