scripts: Simplified result->file mapping, dropped collect_dwarf_files

This reverts per-result source file mapping, and tears out a bunch of messy dwarf parsing code. Results from the same .o file are now mapped to the same source file. Per-result mapping was just way too much complexity for a slightly better result->file mapping, and it risked losing results that were accidentally mapped to the wrong file. --- I was originally going to revert all the way back to relying strictly on the .o name and --build-dir (490e1c4) (this is the simplest solution), but after poking around in dwarf-info a bit, I realized we do have access to the original source file in DW_TAG_compile_unit's DW_AT_comp_dir + DW_AT_name. This is much simpler/more robust than parsing objdump --dwarf=rawline, and avoids needing --build-dir in a bunch of scripts. --- This also reverts stack.py to rely only on the .ci files. These seem as reliable as DW_TAG_compile_unit while simplifying things significantly. Symbol mapping used to be a problem, but this was fixed by using the symbol in the title field instead of the label field (which strips some optimization suffixes?).
2025-12-07 16:12:47 +00:00 · 2024-12-11 18:45:46 -06:00
parent dad3367e9e
commit 4c87d59c7b
9 changed files with 190 additions and 1222 deletions
--- a/scripts/code.py
+++ b/scripts/code.py
@@ -229,6 +229,9 @@ class SymInfo:
    def __contains__(self, k):
        return self.get(k) is not None

+    def __bool__(self):
+        return bool(self.syms)
+
    def __len__(self):
        return len(self.syms)

@@ -297,61 +300,6 @@ def collect_syms(obj_path, global_=False, sections=None, *,

    return SymInfo(syms)

-def collect_dwarf_files(obj_path, *,
-        objdump_path=OBJDUMP_PATH,
-        **args):
-    line_pattern = re.compile(
-            '^\s*(?P<no>[0-9]+)'
-                '(?:\s+(?P<dir>[0-9]+))?'
-                '.*\s+(?P<path>[^\s]+)\s*$')
-
-    # find source paths
-    dirs = co.OrderedDict()
-    files = co.OrderedDict()
-    # note objdump-path may contain extra args
-    cmd = objdump_path + ['--dwarf=rawline', obj_path]
-    if args.get('verbose'):
-        print(' '.join(shlex.quote(c) for c in cmd))
-    proc = sp.Popen(cmd,
-            stdout=sp.PIPE,
-            universal_newlines=True,
-            errors='replace',
-            close_fds=False)
-    for line in proc.stdout:
-        # note that files contain references to dirs, which we
-        # dereference as soon as we see them as each file table
-        # follows a dir table
-        m = line_pattern.match(line)
-        if m:
-            if not m.group('dir'):
-                # found a directory entry
-                dirs[int(m.group('no'))] = m.group('path')
-            else:
-                # found a file entry
-                dir = int(m.group('dir'))
-                if dir in dirs:
-                    files[int(m.group('no'))] = os.path.join(
-                            dirs[dir],
-                            m.group('path'))
-                else:
-                    files[int(m.group('no'))] = m.group('path')
-    proc.wait()
-    if proc.returncode != 0:
-        raise sp.CalledProcessError(proc.returncode, proc.args)
-
-    # simplify paths
-    files_ = co.OrderedDict()
-    for no, file in files.items():
-        if os.path.commonpath([
-                    os.getcwd(),
-                    os.path.abspath(file)]) == os.getcwd():
-            files_[no] = os.path.relpath(file)
-        else:
-            files_[no] = os.path.abspath(file)
-    files = files_
-
-    return files
-
 # each dwarf entry can have attrs and children entries
 class DwarfEntry:
    def __init__(self, level, off, tag, ats={}, children=[]):
@@ -393,79 +341,16 @@ class DwarfEntry:
        else:
            return None

-    @ft.cached_property
-    def addr(self):
-        if (self.tag == 'DW_TAG_subprogram'
-                and 'DW_AT_low_pc' in self):
-            return int(self['DW_AT_low_pc'], 0)
-        else:
-            return None
-
-    @ft.cached_property
-    def size(self):
-        if (self.tag == 'DW_TAG_subprogram'
-                and 'DW_AT_high_pc' in self):
-            # this looks wrong, but high_pc does store the size,
-            # for whatever reason
-            return int(self['DW_AT_high_pc'], 0)
-        else:
-            return None
-
-    def info(self, tags=None):
-        # recursively flatten children
-        def flatten(entry):
-            for child in entry.children:
-                # filter if requested
-                if tags is None or child.tag in tags:
-                    yield child
-
-                yield from flatten(child)
-
-        return DwarfInfo(co.OrderedDict(
-                (child.off, child) for child in flatten(self)))
-
 # a collection of dwarf entries
 class DwarfInfo:
    def __init__(self, entries):
        self.entries = entries

    def get(self, k, d=None):
-        # allow lookup by offset, symbol, or dwarf name
-        if not isinstance(k, str) and not hasattr(k, 'addr'):
+        # allow lookup by offset or dwarf name
+        if not isinstance(k, str):
            return self.entries.get(k, d)

-        elif hasattr(k, 'addr'):
-            import bisect
-
-            # organize by address
-            if not hasattr(self, '_by_addr'):
-                # sort and keep largest/first when duplicates
-                entries = [entry
-                        for entry in self.entries.values()
-                        if entry.addr is not None
-                            and entry.size is not None]
-                entries.sort(key=lambda x: (x.addr, -x.size))
-
-                by_addr = []
-                for entry in entries:
-                    if (len(by_addr) == 0
-                            or by_addr[-1].addr != entry.addr):
-                        by_addr.append(entry)
-                self._by_addr = by_addr
-
-            # find entry by range
-            i = bisect.bisect(self._by_addr, k.addr,
-                    key=lambda x: x.addr)
-            # check that we're actually in this entry's size
-            if (i > 0
-                    and k.addr
-                        < self._by_addr[i-1].addr
-                            + self._by_addr[i-1].size):
-                return self._by_addr[i-1]
-            else:
-                # fallback to lookup by name
-                return self.get(k.name, d)
-
        else:
            # organize entries by name
            if not hasattr(self, '_by_name'):
@@ -502,6 +387,9 @@ class DwarfInfo:
    def __contains__(self, k):
        return self.get(k) is not None

+    def __bool__(self):
+        return bool(self.entries)
+
    def __len__(self):
        return len(self.entries)

@@ -557,22 +445,9 @@ def collect_dwarf_info(obj_path, tags=None, *,
    if proc.returncode != 0:
        raise sp.CalledProcessError(proc.returncode, proc.args)

-    # resolve abstract origins
-    for entry in info.values():
-        if 'DW_AT_abstract_origin' in entry:
-            off = int(entry['DW_AT_abstract_origin'].strip('<>'), 0)
-            origin = info[off]
-            assert 'DW_AT_abstract_origin' not in origin, (
-                    "Recursive abstract origin?")
-
-            for k, v in origin.ats.items():
-                if k not in entry.ats:
-                    entry.ats[k] = v
-
    return DwarfInfo(info)

 def collect(obj_paths, *,
-        sources=None,
        everything=False,
        **args):
    results = []
@@ -582,42 +457,38 @@ def collect(obj_paths, *,
                sections=SECTIONS,
                **args)

-        # find source paths
-        files = collect_dwarf_files(obj_path, **args)
-
        # find dwarf info
        info = collect_dwarf_info(obj_path,
-                tags={'DW_TAG_subprogram', 'DW_TAG_variable'},
+                tags={'DW_TAG_compile_unit'},
                **args)

-        # map function sizes to debug symbols
+        # find source file from dwarf info
+        for entry in info:
+            if (entry.tag == 'DW_TAG_compile_unit'
+                    and 'DW_AT_name' in entry
+                    and 'DW_AT_comp_dir' in entry):
+                file = os.path.join(
+                        entry['DW_AT_comp_dir'].split(':')[-1].strip(),
+                        entry['DW_AT_name'].split(':')[-1].strip())
+                break
+        else:
+            # guess from obj path
+            file = re.sub('(\.o)?$', '.c', obj_path, 1)
+
+        # simplify path
+        if os.path.commonpath([
+                os.getcwd(),
+                os.path.abspath(file)]) == os.getcwd():
+            file = os.path.relpath(file)
+        else:
+            file = os.path.abspath(file)
+
+        # find function sizes
        for sym in syms:
            # discard internal functions
            if not everything and sym.name.startswith('__'):
                continue

-            # find best matching dwarf entry, this may be slightly different
-            # due to optimizations
-            entry = info.get(sym)
-
-            # if we have no file guess from obj path
-            if entry is not None and 'DW_AT_decl_file' in entry:
-                file = files.get(int(entry['DW_AT_decl_file']), '?')
-            else:
-                file = re.sub('(\.o)?$', '.c', obj_path, 1)
-
-            # ignore filtered sources
-            if sources is not None:
-                if not any(os.path.abspath(file) == os.path.abspath(s)
-                        for s in sources):
-                    continue
-            else:
-                # default to only cwd
-                if not everything and not os.path.commonpath([
-                        os.getcwd(),
-                        os.path.abspath(file)]) == os.getcwd():
-                    continue
-
            results.append(CodeResult(file, sym.name, sym.size))

    return results
@@ -1170,12 +1041,6 @@ if __name__ == "__main__":
            nargs='?',
            action=AppendSort,
            help="Sort by this field, but backwards.")
-    parser.add_argument(
-            '-F', '--source',
-            dest='sources',
-            action='append',
-            help="Only consider definitions in this file. Defaults to "
-                "anything in the current directory.")
    parser.add_argument(
            '--everything',
            action='store_true',