scripts: Reworked structs.py to include field info

This reworks structs.py's internal dwarf-info parser to be a bit more
flexible. The eventual plan is to adopt this parser in other scripts.

The main difference is we now parse the dwarf-info into a full tree,
with optional filtering, before extracting the fields we care about.
This is both more flexible and gives us more confidence the parser is
not misparsing something.

(Unrelated but apparently misparsing is a real word.)

This also extends structs.py to include field info for structs and
unions. This is quite useful for understanding the size of things:

  $ ./scripts/structs.py thumb/lfs.o -Dstruct=lfsr_bptr_t -z
  struct                      size
  lfsr_bptr_t                   20
  |-> cksize                     4
  |-> cksum                      4
  '-> data                      12
      |-> size                   4
      '-> u                      8
          |-> buffer             4
          '-> disk               8
              |-> block          4
              '-> off            4
  TOTAL                         20

The field info uses the same -z/--depth flag as stack.py/perf.py/
perfbd.py; however, the cycle detector needed a bit of tweaking. Detecting
cycles purely by name doesn't quite work with structs:

  file->o.o.flags
        ^ |
        '-' not a cycle!

Unfortunately, we do lose the field order in structs. But this info is
still useful.

Oh, we also prefer typedef names over struct/union names now. These are
a bit easier to read since they are more common in the codebase.
This commit is contained in:
Christopher Haster
2024-11-26 14:34:44 -06:00
parent 51b8cdb1f0
commit 35f68a733c

View File

@@ -128,20 +128,23 @@ class RInt(co.namedtuple('RInt', 'x')):
# struct size results
# NOTE(review): this span is rendered from a diff whose +/- markers and
# indentation were stripped, so pre-change lines sit directly next to the
# lines that replaced them. The notes below mark which is which as far as
# can be told from the text.
#
# StructResult is a namedtuple-derived record with a file, a struct name, a
# size (wrapped in RInt by __new__) and, after this change, a list of
# children.
class StructResult(co.namedtuple('StructResult', [
'file', 'struct',
# appears to be the pre-change end of the field list
'size'])):
# appears to be the post-change end of the field list, adding 'children'
'size',
'children'])):
# NOTE(review): these class attributes look like hints consumed by the
# generic fold/table code (Result._sort and Result._fields are read there);
# note that _fields also shadows the attribute namedtuple itself defines
_by = ['file', 'struct']
_fields = ['size']
_sort = ['size']
_types = {'size': RInt}
__slots__ = ()
# appears to be the pre-change constructor signature
def __new__(cls, file='', struct='', size=0):
# appears to be the post-change constructor signature
def __new__(cls, file='', struct='', size=0, children=[]):
return super().__new__(cls, file, struct,
# pre-change final argument
RInt(size))
# post-change: `children or []` substitutes a fresh list whenever children
# is empty or None, so the shared default list is never stored
RInt(size),
children or [])
# adding two results keeps self's file/struct, adds the sizes and (after
# this change) concatenates the children lists
def __add__(self, other):
return StructResult(self.file, self.struct,
# pre-change final argument
self.size + other.size)
# post-change final arguments
self.size + other.size,
self.children + other.children)
def openio(path, mode='r', buffering=-1):
@@ -154,136 +157,239 @@ def openio(path, mode='r', buffering=-1):
else:
return open(path, mode, buffering)
def collect(obj_paths, *,
def collect_dwarf_files(obj_path, *,
        objdump_path=OBJDUMP_PATH,
        **args):
    """Map DWARF file numbers to source paths for one object file.

    Runs objdump with --dwarf=rawline on obj_path and parses the
    directory and file tables out of its output. File entries that
    reference a known directory entry are joined onto that directory.

    Args:
        obj_path: Path of the object file to inspect.
        objdump_path: The objdump command as a list of arguments; it may
            carry extra arguments.
        **args: Extra options. Only 'verbose' is read here; when truthy
            the command line is printed and objdump's stderr is left
            attached to ours instead of being discarded.

    Returns:
        A dict mapping file number (int) to a path. Paths under the
        current working directory are made relative, all others are made
        absolute.

    Raises:
        subprocess.CalledProcessError: If objdump exits with a non-zero
            status.
    """
    # raw strings keep the regex escapes away from Python's own string
    # escape handling
    line_pattern = re.compile(
            r'^\s*(?P<no>[0-9]+)'
            r'(?:\s+(?P<dir>[0-9]+))?'
            r'.*\s+(?P<path>[^\s]+)\s*$')
    verbose = args.get('verbose')

    # find source paths
    dirs = {}
    files = {}
    # note objdump-path may contain extra args
    cmd = objdump_path + ['--dwarf=rawline', obj_path]
    if verbose:
        print(' '.join(shlex.quote(c) for c in cmd))
    # the context manager closes the stdout pipe and waits for the child,
    # even if parsing raises
    with sp.Popen(cmd,
            stdout=sp.PIPE,
            stderr=None if verbose else sp.DEVNULL,
            universal_newlines=True,
            errors='replace',
            close_fds=False) as proc:
        for line in proc.stdout:
            # note that files contain references to dirs, which we
            # dereference as soon as we see them as each file table
            # follows a dir table
            m = line_pattern.match(line)
            if not m:
                continue

            no = int(m.group('no'))
            if not m.group('dir'):
                # found a directory entry
                dirs[no] = m.group('path')
            else:
                # found a file entry
                dir_no = int(m.group('dir'))
                if dir_no in dirs:
                    files[no] = os.path.join(dirs[dir_no], m.group('path'))
                else:
                    files[no] = m.group('path')

    # stderr is either inherited (verbose) or sent to DEVNULL, so
    # proc.stderr is always None here and there is nothing to replay;
    # iterating it would raise TypeError and hide the real failure
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    # simplify paths
    cwd = os.getcwd()
    files_ = {}
    for no, file in files.items():
        abspath = os.path.abspath(file)
        if os.path.commonpath([cwd, abspath]) == cwd:
            files_[no] = os.path.relpath(file)
        else:
            files_[no] = abspath

    return files_
def collect_dwarf_info(obj_path, filter=None, *,
        objdump_path=OBJDUMP_PATH,
        **args):
    """Parse objdump's --dwarf=info output into a tree of entries.

    Every tagged entry becomes a DwarfEntry. Attribute lines are stored
    on the most recently seen entry, and each entry is appended to the
    children of the most recently seen entry one level above it.

    Args:
        obj_path: Path of the object file to inspect.
        filter: Optional container of tag names. When given, only level-1
            entries whose tag is in it are returned at the top level;
            entries are still attached to their parents regardless.
        objdump_path: The objdump command as a list of arguments; it may
            carry extra arguments.
        **args: Extra options. Only 'verbose' is read here; when truthy
            the command line is printed and objdump's stderr is left
            attached to ours instead of being discarded.

    Returns:
        An OrderedDict mapping entry offset (int, parsed as hex) to the
        DwarfEntry for each kept level-1 entry, in output order.

    Raises:
        subprocess.CalledProcessError: If objdump exits with a non-zero
            status.
    """
    # the parameter shadows the builtin of the same name; nothing below
    # needs the builtin, so a plain alias is enough and avoids poking at
    # __builtins__, which is a dict rather than a module once this file
    # is imported instead of run as a script
    filter_ = filter

    # each dwarf entry can have attrs and children entries
    class DwarfEntry:
        """One DWARF entry: level, offset, tag, attributes, children."""
        def __init__(self, level, off, tag, ats=None, children=None):
            self.level = level
            self.off = off
            self.tag = tag
            self.ats = ats or {}
            self.children = children or []

        # entry['DW_AT_x'] reads an attribute, raising KeyError if missing
        def __getitem__(self, k):
            return self.ats[k]

        def __contains__(self, k):
            return k in self.ats

    # first alternative matches "<level><off>: ... (tag)" lines, second
    # matches "<off> attr : value" lines
    info_pattern = re.compile(
            r'^\s*(?:<(?P<level>[^>]*)>'
                r'\s*<(?P<off>[^>]*)>'
                r'.*\(\s*(?P<tag>[^)]*?)\s*\)'
            r'|\s*<(?P<off_>[^>]*)>'
                r'\s*(?P<at>[^>:]*?)'
                r'\s*:(?P<v>.*))\s*$')
    verbose = args.get('verbose')

    # collect dwarf entries
    entries = co.OrderedDict()
    entry = None
    levels = {}
    # note objdump-path may contain extra args
    cmd = objdump_path + ['--dwarf=info', obj_path]
    if verbose:
        print(' '.join(shlex.quote(c) for c in cmd))
    # the context manager closes the stdout pipe and waits for the child,
    # even if parsing raises
    with sp.Popen(cmd,
            stdout=sp.PIPE,
            stderr=None if verbose else sp.DEVNULL,
            universal_newlines=True,
            errors='replace',
            close_fds=False) as proc:
        for line in proc.stdout:
            # state machine here to find dwarf entries
            m = info_pattern.match(line)
            if not m:
                continue

            if m.group('tag'):
                entry = DwarfEntry(
                    level=int(m.group('level'), 0),
                    off=int(m.group('off'), 16),
                    tag=m.group('tag').strip(),
                )
                # keep track of top-level entries
                if (entry.level == 1 and (
                        # unless this entry is filtered
                        filter_ is None or entry.tag in filter_)):
                    entries[entry.off] = entry
                # store entry in parent
                levels[entry.level] = entry
                if entry.level-1 in levels:
                    levels[entry.level-1].children.append(entry)
            elif m.group('at'):
                # attribute lines before any tagged entry are dropped
                if entry:
                    entry.ats[m.group('at').strip()] = (
                            m.group('v').strip())

    # stderr is either inherited (verbose) or sent to DEVNULL, so
    # proc.stderr is always None here and there is nothing to replay;
    # iterating it would raise TypeError and hide the real failure
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

    return entries
# Collect a StructResult for every named typedef/struct/union/enum entry
# that passes the source filters, across all of obj_paths, and return them
# as one list.
#
# NOTE(review): this span is rendered from a diff with its +/- markers and
# indentation stripped, so what looks like the pre-change body (inline
# objdump parsing into results_) is interleaved with the post-change body
# (collect_dwarf_files / collect_dwarf_info). The notes below mark which is
# which as far as can be told from the text; confirm against the real file.
def collect(obj_paths, *,
sources=None,
everything=False,
internal=False,
**args):
# NOTE(review): the two patterns below and the `for path in obj_paths`
# loop that follows look like the pre-change inline parser
line_pattern = re.compile(
'^\s+(?P<no>[0-9]+)'
'(?:\s+(?P<dir>[0-9]+))?'
'\s+.*'
'\s+(?P<path>[^\s]+)$')
info_pattern = re.compile(
'^(?:.*(?P<tag>DW_TAG_[a-z_]+).*'
'|.*DW_AT_name.*:\s*(?P<name>[^:\s]+)\s*'
'|.*DW_AT_decl_file.*:\s*(?P<file>[0-9]+)\s*'
'|.*DW_AT_byte_size.*:\s*(?P<size>[0-9]+)\s*)$')
results = []
for path in obj_paths:
# find files, we want to filter by structs in .h files
dirs = {}
files = {}
# note objdump-path may contain extra args
cmd = objdump_path + ['--dwarf=rawline', path]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=None if args.get('verbose') else sp.DEVNULL,
universal_newlines=True,
errors='replace',
close_fds=False)
for line in proc.stdout:
# note that files contain references to dirs, which we
# dereference as soon as we see them as each file table follows a
# dir table
m = line_pattern.match(line)
if m:
if not m.group('dir'):
# found a directory entry
dirs[int(m.group('no'))] = m.group('path')
else:
# found a file entry
dir = int(m.group('dir'))
if dir in dirs:
files[int(m.group('no'))] = os.path.join(
dirs[dir],
m.group('path'))
else:
files[int(m.group('no'))] = m.group('path')
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stderr.write(line)
sys.exit(-1)
# NOTE(review): the post-change loop appears to start here; the file table
# now comes from collect_dwarf_files
for obj_path in obj_paths:
# find source paths
files = collect_dwarf_files(obj_path, **args)
# NOTE(review): from here to the second sys.exit(-1) looks like the
# pre-change struct state machine (results_, append, info_pattern)
# collect structs as we parse dwarf info
results_ = []
is_struct = False
s_name = None
s_file = None
s_size = None
def append():
# ignore non-structs and unnamed files
if is_struct and s_name:
file = files.get(s_file, '?')
results_.append(StructResult(file, s_name, s_size))
# note objdump-path may contain extra args
cmd = objdump_path + ['--dwarf=info', path]
if args.get('verbose'):
print(' '.join(shlex.quote(c) for c in cmd))
proc = sp.Popen(cmd,
stdout=sp.PIPE,
stderr=None if args.get('verbose') else sp.DEVNULL,
universal_newlines=True,
errors='replace',
close_fds=False)
for i, line in enumerate(proc.stdout):
# state machine here to find structs
m = info_pattern.match(line)
if m:
if m.group('tag'):
append()
is_struct = (m.group('tag') == 'DW_TAG_structure_type'
or m.group('tag') == 'DW_TAG_union_type')
s_name = None
s_file = None
s_size = None
elif m.group('name'):
s_name = m.group('name')
elif m.group('file'):
s_file = int(m.group('file'))
elif m.group('size'):
s_size = int(m.group('size'))
# don't forget the last struct
append()
proc.wait()
if proc.returncode != 0:
if not args.get('verbose'):
for line in proc.stderr:
sys.stderr.write(line)
sys.exit(-1)
# NOTE(review): post-change body resumes here; collect_dwarf_info is called
# without a filter argument
# find dwarf info
info = collect_dwarf_info(obj_path, **args)
# collect structs and other types
# typedefs and types are both keyed by the entry's key in info; typedefed
# records the DW_AT_type targets of typedefs so those targets can be
# dropped from the final results
typedefs = {}
typedefed = set()
types = {}
for no, entry in info.items():
# skip non-types
if entry.tag not in {
'DW_TAG_typedef',
'DW_TAG_structure_type',
'DW_TAG_union_type',
'DW_TAG_enumeration_type'}:
continue
# NOTE(review): `for r in results_` and the r.file lines below look like
# pre-change lines; each is followed by its post-change counterpart using
# the local `file`
for r in results_:
# ignore filtered sources
# entry['DW_AT_decl_file'] goes through DwarfEntry.__getitem__, so an entry
# without that attribute raises KeyError here
file = files.get(int(entry['DW_AT_decl_file']), '?')
if sources is not None:
if not any(os.path.abspath(r.file) == os.path.abspath(s)
if not any(os.path.abspath(file) == os.path.abspath(s)
for s in sources):
continue
else:
# default to only cwd
if not everything and not os.path.commonpath([
os.getcwd(),
os.path.abspath(r.file)]) == os.getcwd():
if (not everything and not os.path.commonpath([
os.getcwd(),
os.path.abspath(file)]) == os.getcwd()):
continue
# limit to .h files unless --internal
if not internal and not r.file.endswith('.h'):
if not internal and not file.endswith('.h'):
continue
# NOTE(review): this "simplify path" group looks like pre-change code
# simplify path
if os.path.commonpath([
os.getcwd(),
os.path.abspath(r.file)]) == os.getcwd():
file = os.path.relpath(r.file)
else:
file = os.path.abspath(r.file)
# skip types with no names
if 'DW_AT_name' not in entry:
continue
# keep only the text after the last ':' of the attribute value
name = entry['DW_AT_name'].split(':')[-1].strip()
# NOTE(review): looks like the pre-change append
results.append(r._replace(file=file))
# find the size of a type, recursing if necessary
# returns DW_AT_byte_size when present; otherwise follows DW_AT_type into
# info and, for array entries, multiplies by (DW_AT_upper_bound + 1) for
# each subrange child; an entry with neither attribute hits `assert False`
def sizeof(entry):
# explicit size?
if 'DW_AT_byte_size' in entry:
return int(entry['DW_AT_byte_size'])
# indirect type?
elif 'DW_AT_type' in entry:
# the attribute value is wrapped in <>, strip that before parsing
type = int(entry['DW_AT_type'].strip('<>'), 0)
size = sizeof(info[type])
# wait are we an array?
if entry.tag == 'DW_TAG_array_type':
for child in entry.children:
if child.tag == 'DW_TAG_subrange_type':
size *= int(child['DW_AT_upper_bound']) + 1
return size
else:
assert False
size = sizeof(entry)
# find children, recursing if necessary
# pointers report no children; entries with DW_AT_type defer to the
# referenced entry; anything else yields one StructResult per child entry,
# reading each child's DW_AT_name (KeyError if a child has none)
def childrenof(entry):
# pointer? these end up recursive but the underlying
# type doesn't really matter here
if entry.tag == 'DW_TAG_pointer_type':
return []
# indirect type?
elif 'DW_AT_type' in entry:
type = int(entry['DW_AT_type'].strip('<>'), 0)
return childrenof(info[type])
else:
children = []
for child in entry.children:
name = child['DW_AT_name'].split(':')[-1].strip()
size = sizeof(child)
children.append(StructResult(file, name, size,
childrenof(child)))
return children
children = childrenof(entry)
# typedefs exist in a separate namespace, so we need to track
# these separately
if entry.tag == 'DW_TAG_typedef':
typedefs[no] = StructResult(file, name, size, children)
typedefed.add(int(entry['DW_AT_type'].strip('<>'), 0))
else:
types[no] = StructResult(file, name, size, children)
# let typedefs take priority
# non-typedef types are only reported when no typedef points at them
results.extend(typedefs.values())
results.extend(type
for no, type in types.items()
if no not in typedefed)
return results
@@ -330,6 +436,8 @@ def table(Result, results, diff_results=None, *,
all=False,
compare=None,
summary=False,
depth=None,
hot=None,
**_):
all_, all = all, __builtins__.all
@@ -344,6 +452,34 @@ def table(Result, results, diff_results=None, *,
if diff_results is not None:
diff_results = fold(Result, diff_results, by=by)
# reduce children to hot paths?
if hot:
# Reduce a list of child results to a single chain: pick the maximum
# result under the hot keys, strip its children, then repeat on that
# result's original children. Returns [] for an empty list or when the
# chosen result has already been visited on this path.
#
# seen holds id()s of results already on the current path; the default set
# is only read and unioned into new sets (seen | {...}), never mutated.
#
# NOTE(review): this is a closure over Result, fields, hot and it from the
# enclosing function, which is not fully visible here; indentation was
# stripped by the diff rendering.
def rec_hot(results_, seen=set()):
if not results_:
return []
# the sort key is one tuple per hot key followed by a trailing None; a
# falsy key expands to the Result._sort entries present in fields, and
# attributes that are None contribute an empty tuple
r = max(results_,
key=lambda r: tuple(
tuple((getattr(r, k),)
if getattr(r, k, None) is not None
else ()
for k in (
[k] if k else [
k for k in Result._sort
if k in fields])
if k in fields)
for k in it.chain(hot, [None])))
# found a cycle?
# cycles are detected by object identity rather than by name
if id(r) in seen:
return []
return [r._replace(children=[])] + rec_hot(
r.children,
seen | {id(r)})
results = [r._replace(children=rec_hot(r.children)) for r in results]
# organize by name
table = {
','.join(str(getattr(r, k) or '') for k in by): r
@@ -485,6 +621,59 @@ def table(Result, results, diff_results=None, *,
getattr(diff_r, k, None)))))
return entry
# recursive entry helper
# recursive entry helper
# Appends one table line per child result to lines, then recurses into
# each child's own children while depth_ > 1.
#
# results_: the children to render at this layer
# depth_: remaining depth; recursion continues only while it is > 1
# seen: id()s of results already on the current path; the default set is
#   only read and unioned into new sets, never mutated
# prefixes: four strings indexed as [0+is_last] for the line being emitted
#   and [2+is_last] as the base of the prefixes passed down a level
#
# NOTE(review): this is a closure over Result, by, sort, fields, lines,
# fold and table_entry from the enclosing function, which is not fully
# visible here; indentation was stripped by the diff rendering.
def recurse(results_, depth_, seen=set(),
prefixes=('', '', '', '')):
# build the children table at each layer
results_ = fold(Result, results_, by=by)
# key each folded result by its comma-joined `by` attributes
table_ = {
','.join(str(getattr(r, k) or '') for k in by): r
for r in results_}
names_ = list(table_.keys())
# sort the children layer
# sort by name first, then apply each requested sort key in reverse order
names_.sort()
if sort:
for k, reverse in reversed(sort):
names_.sort(
key=lambda n: tuple(
(getattr(table_[n], k),)
if getattr(table_.get(n), k, None)
is not None
else ()
for k in (
[k] if k else [
k for k in Result._sort
if k in fields])),
reverse=reverse ^ (not k or k in Result._fields))
for i, name in enumerate(names_):
r = table_[name]
is_last = (i == len(names_)-1)
line = table_entry(name, r)
# normalize every cell to a (text, notes) tuple
line = [x if isinstance(x, tuple) else (x, []) for x in line]
# add prefixes
line[0] = (prefixes[0+is_last] + line[0][0], line[0][1])
# add cycle detection
# cycles are detected by object identity rather than by name
if id(r) in seen:
line[-1] = (line[-1][0], line[-1][1] + ['cycle detected'])
lines.append(line)
# found a cycle?
if id(r) in seen:
continue
# recurse?
if depth_ > 1:
recurse(r.children,
depth_-1,
seen | {id(r)},
(prefixes[2+is_last] + "|-> ",
prefixes[2+is_last] + "'-> ",
prefixes[2+is_last] + "| ",
prefixes[2+is_last] + " "))
# entries
if (not summary) or compare:
for name in names:
@@ -495,6 +684,16 @@ def table(Result, results, diff_results=None, *,
diff_r = diff_table.get(name)
lines.append(table_entry(name, r, diff_r))
# recursive entries
if name in table and depth > 1:
recurse(table[name].children,
depth-1,
{id(r)},
("|-> ",
"'-> ",
"| ",
" "))
# total, unless we're comparing
if not (compare and not percent and not diff):
r = next(iter(fold(Result, results, by=[])), None)
@@ -534,6 +733,12 @@ def main(obj_paths, *,
defines=[],
sort=None,
**args):
# figure out depth
if args.get('depth') is None:
args['depth'] = mt.inf if args.get('hot') else 1
elif args.get('depth') == 0:
args['depth'] = mt.inf
# find sizes
if not args.get('use', None):
results = collect(obj_paths, **args)
@@ -721,6 +926,18 @@ if __name__ == "__main__":
'--internal',
action='store_true',
help="Also show structs in .c files.")
parser.add_argument(
'-z', '--depth',
nargs='?',
type=lambda x: int(x, 0),
const=0,
help="Depth of function calls to show. 0 shows all calls unless "
"we find a cycle. Defaults to 0.")
parser.add_argument(
'-t', '--hot',
nargs='?',
action='append',
help="Show only the hot path for each function call.")
parser.add_argument(
'--objdump-path',
type=lambda x: x.split(),