scripts: Adopted new dwarf-info parser in code.py/data.py

This breaks the collect function down into collect_dwarf_files, collect_dwarf_info, and collect_sizes, which makes the dwarf-info parser a bit easier to share with structs.py, etc. Sharing easily copy-pastable chunks of code between scripts like this allows better code reuse without tightly coupling the scripts to each other; being able to run each of these scripts standalone is a goal.
2025-12-09 09:02:53 +00:00 · 2024-11-27 03:05:22 -06:00
parent e00db216c1
commit 26ba7bdebc
2 changed files with 422 additions and 280 deletions
--- a/scripts/code.py
+++ b/scripts/code.py
@@ -159,181 +159,252 @@ def openio(path, mode='r', buffering=-1):
    else:
        return open(path, mode, buffering)

-def collect(obj_paths, *,
+def collect_dwarf_files(obj_path, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    """Map DWARF file numbers to source paths for one object file.
+
+    Runs objdump --dwarf=rawline on obj_path and parses the numbered
+    directory and file entries it prints. File entries that reference a
+    known directory number are joined onto that directory.
+
+    objdump_path is the objdump command as a list and may contain extra
+    args. If args contains a truthy 'verbose', the command is printed and
+    objdump's stderr is left attached to ours, otherwise it is discarded.
+
+    Returns a dict of file number -> path, where paths under the current
+    working directory are made relative and all others absolute.
+
+    Raises sp.CalledProcessError if objdump exits with a non-zero code.
+    """
+    # raw strings so \s is a regex escape, not an invalid string escape
+    line_pattern = re.compile(
+            r'^\s*(?P<no>[0-9]+)'
+                r'(?:\s+(?P<dir>[0-9]+))?'
+                r'.*\s+(?P<path>[^\s]+)\s*$')
+
+    # find source paths
+    dirs = {}
+    files = {}
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=rawline', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    # the with block closes the stdout pipe and waits for objdump to exit
+    with sp.Popen(cmd,
+            stdout=sp.PIPE,
+            stderr=None if args.get('verbose') else sp.DEVNULL,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False) as proc:
+        for line in proc.stdout:
+            # note that files contain references to dirs, which we
+            # dereference as soon as we see them as each file table
+            # follows a dir table
+            m = line_pattern.match(line)
+            if not m:
+                continue
+            no = int(m.group('no'))
+            if not m.group('dir'):
+                # found a directory entry
+                dirs[no] = m.group('path')
+            else:
+                # found a file entry
+                dir_no = int(m.group('dir'))
+                if dir_no in dirs:
+                    files[no] = os.path.join(
+                            dirs[dir_no],
+                            m.group('path'))
+                else:
+                    files[no] = m.group('path')
+    if proc.returncode != 0:
+        # stderr is either inherited (verbose) or sent to DEVNULL, so
+        # proc.stderr is always None and there is nothing to forward,
+        # iterating it here would raise TypeError instead of the
+        # CalledProcessError callers expect
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    # simplify paths, relative if under the cwd, absolute otherwise
+    cwd = os.getcwd()
+    files_ = {}
+    for no, path in files.items():
+        if os.path.commonpath([cwd, os.path.abspath(path)]) == cwd:
+            files_[no] = os.path.relpath(path)
+        else:
+            files_[no] = os.path.abspath(path)
+
+    return files_
+
+def collect_dwarf_info(obj_path, filter=None, *,
+        objdump_path=OBJDUMP_PATH,
+        **args):
+    """Parse the DWARF debug-info entries of one object file.
+
+    Runs objdump --dwarf=info on obj_path and builds a DwarfEntry for
+    every line carrying a level, an offset and a parenthesized tag. The
+    attribute lines that follow are stored as stripped strings in the
+    most recent entry's ats dict, and each entry is appended to the
+    children of the last entry seen one level up.
+
+    filter, if not None, is a container of tag names, only level-1
+    entries whose tag is in it are returned. objdump_path is the objdump
+    command as a list and may contain extra args. If args contains a
+    truthy 'verbose', the command is printed and objdump's stderr is left
+    attached to ours, otherwise it is discarded.
+
+    Returns an OrderedDict of offset -> DwarfEntry for the level-1
+    entries, in the order objdump printed them.
+
+    Raises sp.CalledProcessError if objdump exits with a non-zero code.
+    """
+    # the parameter shadows the builtin filter, just alias it, reaching
+    # for __builtins__.filter fails when this script is imported since
+    # __builtins__ is then a dict, and the builtin isn't needed here
+    filter_ = filter
+
+    # each dwarf entry can have attrs and children entries
+    class DwarfEntry:
+        def __init__(self, level, off, tag, ats=None, children=None):
+            self.level = level
+            self.off = off
+            self.tag = tag
+            # never share containers between entries
+            self.ats = ats or {}
+            self.children = children or []
+
+        def __getitem__(self, k):
+            return self.ats[k]
+
+        def __contains__(self, k):
+            return k in self.ats
+
+        def __repr__(self):
+            return '%s(%d, 0x%x, %r, %r)' % (
+                    self.__class__.__name__,
+                    self.level,
+                    self.off,
+                    self.tag,
+                    self.ats)
+
+    # raw strings so \s and \( are regex escapes, not invalid string
+    # escapes, matches either an entry header or an attribute line
+    info_pattern = re.compile(
+            r'^\s*(?:<(?P<level>[^>]*)>'
+                    r'\s*<(?P<off>[^>]*)>'
+                    r'.*\(\s*(?P<tag>[^)]*?)\s*\)'
+                r'|\s*<(?P<off_>[^>]*)>'
+                    r'\s*(?P<at>[^>:]*?)'
+                    r'\s*:(?P<v>.*))\s*$')
+
+    # collect dwarf entries
+    entries = co.OrderedDict()
+    entry = None
+    levels = {}
+    # note objdump-path may contain extra args
+    cmd = objdump_path + ['--dwarf=info', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    # the with block closes the stdout pipe and waits for objdump to exit
+    with sp.Popen(cmd,
+            stdout=sp.PIPE,
+            stderr=None if args.get('verbose') else sp.DEVNULL,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False) as proc:
+        for line in proc.stdout:
+            # state machine here to find dwarf entries
+            m = info_pattern.match(line)
+            if not m:
+                continue
+            if m.group('tag'):
+                entry = DwarfEntry(
+                    level=int(m.group('level'), 0),
+                    off=int(m.group('off'), 16),
+                    tag=m.group('tag').strip(),
+                )
+                # keep track of top-level entries
+                if (entry.level == 1 and (
+                        # unless this entry is filtered
+                        filter_ is None or entry.tag in filter_)):
+                    entries[entry.off] = entry
+                # store entry in parent
+                levels[entry.level] = entry
+                if entry.level-1 in levels:
+                    levels[entry.level-1].children.append(entry)
+            elif m.group('at'):
+                # attributes belong to the most recent entry, if any
+                if entry:
+                    entry.ats[m.group('at').strip()] = (
+                            m.group('v').strip())
+    if proc.returncode != 0:
+        # stderr is either inherited (verbose) or sent to DEVNULL, so
+        # proc.stderr is always None and there is nothing to forward,
+        # iterating it here would raise TypeError instead of the
+        # CalledProcessError callers expect
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    return entries
+
+def collect_sizes(obj_path, *,
        nm_path=NM_PATH,
        nm_types=NM_TYPES,
-        objdump_path=OBJDUMP_PATH,
-        sources=None,
        everything=False,
        **args):
+    """Collect symbol sizes from one object file.
+
+    Runs nm --size-sort on obj_path and keeps every symbol whose type
+    character is in nm_types. nm_path is the nm command as a list and may
+    contain extra args. If args contains a truthy 'verbose', the command
+    is printed and nm's stderr is left attached to ours, otherwise it is
+    discarded. everything is accepted for call compatibility but not
+    used here, filtering of internal symbols is left to the caller.
+
+    Returns a dict of symbol name -> size, sizes are parsed as hex.
+
+    Raises sp.CalledProcessError if nm exits with a non-zero code.
+    """
    size_pattern = re.compile(
            '^(?P<size>[0-9a-fA-F]+)'
-                + ' (?P<type>[%s])' % re.escape(nm_types)
-                + ' (?P<func>.+?)$')
-    line_pattern = re.compile(
-            '^\s+(?P<no>[0-9]+)'
-                '(?:\s+(?P<dir>[0-9]+))?'
-                '\s+.*'
-                '\s+(?P<path>[^\s]+)$')
-    info_pattern = re.compile(
-            '^(?:.*(?P<tag>DW_TAG_[a-z_]+).*'
-                '|.*DW_AT_name.*:\s*(?P<name>[^:\s]+)\s*'
-                '|.*DW_AT_decl_file.*:\s*(?P<file>[0-9]+)\s*)$')
+                + r'\s+(?P<type>[%s])' % re.escape(nm_types)
+                + r'\s+(?P<func>[^\s]+)\s*$')

+    # find sizes
+    sizes = {}
+    # note nm-path may contain extra args
+    cmd = nm_path + ['--size-sort', obj_path]
+    if args.get('verbose'):
+        print(' '.join(shlex.quote(c) for c in cmd))
+    # the with block closes the stdout pipe and waits for nm to exit
+    with sp.Popen(cmd,
+            stdout=sp.PIPE,
+            stderr=None if args.get('verbose') else sp.DEVNULL,
+            universal_newlines=True,
+            errors='replace',
+            close_fds=False) as proc:
+        for line in proc.stdout:
+            m = size_pattern.match(line)
+            if m:
+                func = m.group('func')
+                size = int(m.group('size'), 16)
+                sizes[func] = size
+    if proc.returncode != 0:
+        # stderr is either inherited (verbose) or sent to DEVNULL, so
+        # proc.stderr is always None and there is nothing to forward,
+        # iterating it here would raise TypeError instead of the
+        # CalledProcessError we want to report
+        raise sp.CalledProcessError(proc.returncode, proc.args)
+
+    return sizes
+
+def collect(obj_paths, *,
+        sources=None,
+        everything=False,
+        **args):
+    """Collect code-size results for a list of object files.
+
+    For each object file, symbol sizes come from collect_sizes, and the
+    defining source file of each symbol is looked up in the object's
+    debug-info via collect_dwarf_files and collect_dwarf_info. If the
+    debug-info tools fail, or yield no definitions, the source file is
+    guessed by swapping the object's .o suffix for .c.
+
+    sources, if not None, limits results to symbols whose file matches
+    one of these paths. Otherwise only files under the current working
+    directory are kept, unless everything is set. everything also keeps
+    symbols whose name starts with '__'. Remaining args are forwarded to
+    the collect_* helpers.
+
+    Returns a list of CodeResult(file, function, size).
+    """
    results = []
-    for path in obj_paths:
+    for obj_path in obj_paths:
        # guess the source, if we have debug-info we'll replace this later
-        file = re.sub('(\.o)?$', '.c', path, 1)
-
-        # find symbol sizes
-        results_ = []
-        # note nm-path may contain extra args
-        cmd = nm_path + ['--size-sort', path]
-        if args.get('verbose'):
-            print(' '.join(shlex.quote(c) for c in cmd))
-        proc = sp.Popen(cmd,
-                stdout=sp.PIPE,
-                stderr=None if args.get('verbose') else sp.DEVNULL,
-                universal_newlines=True,
-                errors='replace',
-                close_fds=False)
-        for line in proc.stdout:
-            m = size_pattern.match(line)
-            if m:
-                func = m.group('func')
-                # discard internal functions
-                if not everything and func.startswith('__'):
-                    continue
-                results_.append(CodeResult(
-                        file, func,
-                        int(m.group('size'), 16)))
-        proc.wait()
-        if proc.returncode != 0:
-            if not args.get('verbose'):
-                for line in proc.stderr:
-                    sys.stderr.write(line)
-            sys.exit(-1)
+        # raw string keeps \. a regex escape, count by keyword as the
+        # positional form is deprecated
+        file = re.sub(r'(\.o)?$', '.c', obj_path, count=1)

+        # find sizes
+        sizes = collect_sizes(obj_path, everything=everything, **args)

        # try to figure out the source file if we have debug-info
-        dirs = {}
-        files = {}
-        # note objdump-path may contain extra args
-        cmd = objdump_path + ['--dwarf=rawline', path]
-        if args.get('verbose'):
-            print(' '.join(shlex.quote(c) for c in cmd))
-        proc = sp.Popen(cmd,
-                stdout=sp.PIPE,
-                stderr=None if args.get('verbose') else sp.DEVNULL,
-                universal_newlines=True,
-                errors='replace',
-                close_fds=False)
-        for line in proc.stdout:
-            # note that files contain references to dirs, which we
-            # dereference as soon as we see them as each file table follows a
-            # dir table
-            m = line_pattern.match(line)
-            if m:
-                if not m.group('dir'):
-                    # found a directory entry
-                    dirs[int(m.group('no'))] = m.group('path')
-                else:
-                    # found a file entry
-                    dir = int(m.group('dir'))
-                    if dir in dirs:
-                        files[int(m.group('no'))] = os.path.join(
-                                dirs[dir],
-                                m.group('path'))
-                    else:
-                        files[int(m.group('no'))] = m.group('path')
-        proc.wait()
-        if proc.returncode != 0:
-            if not args.get('verbose'):
-                for line in proc.stderr:
-                    sys.stderr.write(line)
-            # do nothing on error, we don't need objdump to work, source files
-            # may just be inaccurate
-            pass
-
        defs = {}
-        is_func = False
-        f_name = None
-        f_file = None
-        def append():
-            # ignore non-functions and unnamed files
-            if is_func and f_name:
-                defs[f_name] = files.get(f_file, '?')
-        # note objdump-path may contain extra args
-        cmd = objdump_path + ['--dwarf=info', path]
-        if args.get('verbose'):
-            print(' '.join(shlex.quote(c) for c in cmd))
-        proc = sp.Popen(cmd,
-                stdout=sp.PIPE,
-                stderr=None if args.get('verbose') else sp.DEVNULL,
-                universal_newlines=True,
-                errors='replace',
-                close_fds=False)
-        for line in proc.stdout:
-            # state machine here to find definitions
-            m = info_pattern.match(line)
-            if m:
-                if m.group('tag'):
-                    append()
-                    is_func = (m.group('tag') == 'DW_TAG_subprogram')
-                    f_name = None
-                    f_file = None
-                elif m.group('name'):
-                    f_name = m.group('name')
-                elif m.group('file'):
-                    f_file = int(m.group('file'))
-        # don't forget the last function
-        append()
-        proc.wait()
-        if proc.returncode != 0:
-            if not args.get('verbose'):
-                for line in proc.stderr:
-                    sys.stderr.write(line)
-            # do nothing on error, we don't need objdump to work, source files
-            # may just be inaccurate
+        try:
+            files = collect_dwarf_files(obj_path, **args)
+            info = collect_dwarf_info(obj_path,
+                    filter={'DW_TAG_subprogram', 'DW_TAG_variable'},
+                    **args)
+
+            # only the entries matter here, not their offsets
+            for entry in info.values():
+                # skip funcs with no name or no file
+                if ('DW_AT_name' not in entry
+                        or 'DW_AT_decl_file' not in entry):
+                    continue
+                # the name is whatever follows the last ':' in the attr
+                name_ = entry['DW_AT_name'].split(':')[-1].strip()
+                file_ = files.get(int(entry['DW_AT_decl_file']), '?')
+                defs[name_] = file_
+
+        except sp.CalledProcessError:
+            # do nothing on error, we don't need objdump to work, source
+            # files may just be inaccurate
            pass

-        for r in results_:
+        # map function sizes to debug symbols
+        for func, size in sizes.items():
+            # discard internal functions
+            if not everything and func.startswith('__'):
+                continue
+
            # find best matching debug symbol, this may be slightly different
            # due to optimizations
            if defs:
                # exact match? avoid difflib if we can for speed
-                if r.function in defs:
-                    file = defs[r.function]
+                if func in defs:
+                    file_ = defs[func]
                else:
-                    _, file = max(
+                    _, file_ = max(
                            defs.items(),
                            key=lambda d: difflib.SequenceMatcher(None,
                                d[0],
-                                r.function, False).ratio())
+                                func, False).ratio())
            else:
-                file = r.file
+                file_ = file

            # ignore filtered sources
            if sources is not None:
-                if not any(os.path.abspath(file) == os.path.abspath(s)
+                if not any(os.path.abspath(file_) == os.path.abspath(s)
                        for s in sources):
                    continue
            else:
                # default to only cwd
                if not everything and not os.path.commonpath([
                        os.getcwd(),
-                        os.path.abspath(file)]) == os.getcwd():
+                        os.path.abspath(file_)]) == os.getcwd():
                    continue

-            # simplify path
-            if os.path.commonpath([
-                    os.getcwd(),
-                    os.path.abspath(file)]) == os.getcwd():
-                file = os.path.relpath(file)
-            else:
-                file = os.path.abspath(file)
-
-            results.append(r._replace(file=file))
+            results.append(CodeResult(file_, func, size))

    return results