scripts: plot[mpl].py: Added --x/ylim-stddev for data-dependent limits

This adds --xlim-stddev and --ylim-stddev as alternatives to -X/--xlim and -Y/--ylim that define the plot limits in terms of standard deviations from the mean, instead of in absolute values.

So want to only plot data within +-1 standard deviation? Use:

  $ ./scripts/plot.py --ylim-stddev=-1,+1

Want to ignore outliers >3 standard deviations? Use (a single value sets only the upper limit):

  $ ./scripts/plot.py --ylim-stddev=3

This is very useful for plotting the amortized/per-byte benchmarks, which have a tendency to run off towards infinity near zero.

Before, we could truncate data explicitly with -Y/--ylim, but this was getting very tedious, and it doesn't work well when you don't know what the data is going to look like beforehand.
2025-12-08 00:22:44 +00:00 · 2025-05-15 18:23:09 -05:00
parent d5432ca0df
commit aebe5b1d1b
2 changed files with 112 additions and 37 deletions
--- a/scripts/plot.py
+++ b/scripts/plot.py
@@ -132,6 +132,18 @@ def si2(x, w=5):
        s = s.rstrip('.')
    return '%s%s%s' % ('-' if x < 0 else '', s, SI2_PREFIXES[p])

+# find x/y limit as mean + lim*stddev of xs, returns 0 if xs is empty
+def stddevlim(lim, xs):
+    # make a list, xs may be a one-shot iterator and we need two passes
+    xs = [float(x) for x in xs]
+    if len(xs) == 0:
+        return 0
+    # calculate mean and population stddev (divides by N, not N-1)
+    mean = sum(xs) / len(xs)
+    stddev = mt.sqrt(sum((x - mean)**2 for x in xs) / len(xs))
+    # lim is a signed number of stddevs, negative means below the mean
+    return mean + float(lim)*stddev
+
 # open with '-' for stdin/stdout
 def openio(path, mode='r', buffering=-1):
    import os
@@ -1212,6 +1224,8 @@ def main_(ring, csv_paths, *,
        height=None,
        xlim=(None,None),
        ylim=(None,None),
+        xlim_stddev=(None,None),
+        ylim_stddev=(None,None),
        xlog=False,
        ylog=False,
        x2=False,
@@ -1567,14 +1581,20 @@ def main_(ring, csv_paths, *,
        define_ = define + s.args.get('define', [])
        xlim_ = s.args.get('xlim', xlim)
        ylim_ = s.args.get('ylim', ylim)
+        xlim_stddev_ = s.args.get('xlim_stddev', xlim_stddev)
+        ylim_stddev_ = s.args.get('ylim_stddev', ylim_stddev)
        xlog_ = s.args.get('xlog', False) or xlog
        ylog_ = s.args.get('ylog', False) or ylog

        # allow shortened ranges
        if len(xlim_) == 1:
-            xlim_ = (0, xlim_[0])
+            xlim_ = (None, xlim_[0])
        if len(ylim_) == 1:
-            ylim_ = (0, ylim_[0])
+            ylim_ = (None, ylim_[0])
+        if len(xlim_stddev_) == 1:
+            xlim_stddev_ = (None, xlim_stddev_[0])
+        if len(ylim_stddev_) == 1:
+            ylim_stddev_ = (None, ylim_stddev_[0])

        # data can be constrained by subplot-specific defines,
        # so re-extract for each plot
@@ -1605,29 +1625,32 @@ def main_(ring, csv_paths, *,
                for k, v in dataattr.items()}

        # find actual xlim/ylim
+        x__ = (lambda: it.chain([0], (x
+                for dataset in subdatasets.values()
+                for x, y in dataset
+                if y is not None)))
+        y__ = (lambda: it.chain([0], (y
+                for dataset in subdatasets.values()
+                for _, y in dataset
+                if y is not None)))
        xlim_ = (
                xlim_[0] if xlim_[0] is not None
-                    else min(it.chain([0], (x
-                        for dataset in subdatasets.values()
-                        for x, y in dataset
-                        if y is not None))),
+                    else stddevlim(xlim_stddev_[0], x__())
+                    if xlim_stddev_[0] is not None
+                    else min(x__()),
                xlim_[1] if xlim_[1] is not None
-                    else max(it.chain([0], (x
-                        for dataset in subdatasets.values()
-                        for x, y in dataset
-                        if y is not None))))
-
+                    else stddevlim(xlim_stddev_[1], x__())
+                    if xlim_stddev_[1] is not None
+                    else max(x__()))
        ylim_ = (
                ylim_[0] if ylim_[0] is not None
-                    else min(it.chain([0], (y
-                        for dataset in subdatasets.values()
-                        for _, y in dataset
-                        if y is not None))),
+                    else stddevlim(ylim_stddev_[0], y__())
+                    if ylim_stddev_[0] is not None
+                    else min(y__()),
                ylim_[1] if ylim_[1] is not None
-                    else max(it.chain([0], (y
-                        for dataset in subdatasets.values()
-                        for _, y in dataset
-                        if y is not None))))
+                    else stddevlim(ylim_stddev_[1], y__())
+                    if ylim_stddev_[1] is not None
+                    else max(y__()))

        # figure out labels/titles now that we have our data
        subtitle = [punescape(l, submergedattrs) for l in s.title]
@@ -2053,6 +2076,20 @@ if __name__ == "__main__":
                dat(x) if x.strip() else None
                    for x in x.split(',')),
            help="Range for the y-axis.")
+    parser.add_argument(
+            '--xlim-stddev',
+            type=lambda x: tuple(
+                dat(x) if x.strip() else None
+                    for x in x.split(',')),
+            help="Range for the x-axis specified as a number of standard "
+                "deviations from the mean.")
+    parser.add_argument(
+            '--ylim-stddev',
+            type=lambda x: tuple(
+                dat(x) if x.strip() else None
+                    for x in x.split(',')),
+            help="Range for the y-axis specified as a number of standard "
+                "deviations from the mean.")
    parser.add_argument(
            '--xlog',
            action='store_true',