Changed scripts to not infer field purposes from CSV values

Note there's a bit of subtlety here, field _types_ are still infered, but the intention of the fields, i.e. if the field contains data vs row name/other properties, must be unambiguous in the scripts. There is still a _tiny_ bit of inference. For most scripts only one of --by or --fields is strictly needed, since this makes the purpose of the other fields unambiguous. The reason for this change is so the scripts are a bit more reliable, but also because this simplifies the data parsing/inference a bit. Oh, and this also changes field inference to use the csv.DictReader's fieldnames field instead of only inspecting the returned dicts. This should also save a bit of O(n) overhead when parsing CSV files.
2025-12-06 23:52:44 +00:00 · 2023-11-04 15:24:18 -05:00
parent 2be3ff57c5
commit d0a6ef0c89
12 changed files with 187 additions and 200 deletions
--- a/scripts/plot.py
+++ b/scripts/plot.py
@@ -445,11 +445,15 @@ class Plot:

 def collect(csv_paths, renames=[], defines=[]):
    # collect results from CSV files
+    fields = []
    results = []
    for path in csv_paths:
        try:
            with openio(path) as f:
                reader = csv.DictReader(f, restval='')
+                fields.extend(
+                    k for k in reader.fieldnames
+                    if k not in fields)
                for r in reader:
                    # apply any renames
                    if renames:
@@ -468,7 +472,7 @@ def collect(csv_paths, renames=[], defines=[]):
        except FileNotFoundError:
            pass

-    return results
+    return fields, results

 def fold(results, by=None, x=None, y=None, defines=[]):
    # filter by matching defines
@@ -479,29 +483,16 @@ def fold(results, by=None, x=None, y=None, defines=[]):
                results_.append(r)
        results = results_

-    # if y not specified, try to guess from data
-    if not y:
-        y = co.OrderedDict()
-        for r in results:
-            for k, v in r.items():
-                if (not by or k not in by) and v.strip():
-                    try:
-                        dat(v)
-                        y[k] = True
-                    except ValueError:
-                        y[k] = False
-        y = list(k for k,v in y.items() if v)
-
    if by:
        # find all 'by' values
-        ks = set()
+        keys = set()
        for r in results:
-            ks.add(tuple(r.get(k, '') for k in by))
-        ks = sorted(ks)
+            keys.add(tuple(r.get(k, '') for k in by))
+        keys = sorted(keys)

    # collect all datasets
    datasets = co.OrderedDict()
-    for ks_ in (ks if by else [()]):
+    for key in (keys if by else [()]):
        for x_ in (x if x else [None]):
            for y_ in y:
                # organize by 'by', x, and y
@@ -511,7 +502,7 @@ def fold(results, by=None, x=None, y=None, defines=[]):
                    # filter by 'by'
                    if by and not all(
                            k in r and r[k] == v
-                            for k, v in zip(by, ks_)):
+                            for k, v in zip(by, key)):
                        continue

                    # find xs
@@ -542,8 +533,8 @@ def fold(results, by=None, x=None, y=None, defines=[]):

                # hide x/y if there is only one field
                k_x = x_ if len(x or []) > 1 else ''
-                k_y = y_ if len(y or []) > 1 or (not ks_ and not k_x) else ''
-                datasets[ks_ + (k_x, k_y)] = dataset
+                k_y = y_ if len(y or []) > 1 or (not key and not k_x) else ''
+                datasets[key + (k_x, k_y)] = dataset

    return datasets

@@ -904,13 +895,17 @@ def main(csv_paths, *,
    all_defines = sorted(all_defines.items())

    # separate out renames
-    renames = list(it.chain.from_iterable(
+    all_renames = list(it.chain.from_iterable(
        ((k, v) for v in vs)
        for k, vs in it.chain(all_by, all_x, all_y)))
    all_by = [k for k, _ in all_by]
    all_x = [k for k, _ in all_x]
    all_y = [k for k, _ in all_y]

+    if not all_by and not all_y:
+        print("error: needs --by or -y to figure out fields")
+        sys.exit(-1)
+
    # create a grid of subplots
    grid = Grid.fromargs(**subplot, subplots=subplots)

@@ -994,10 +989,19 @@ def main(csv_paths, *,
        f.writeln = writeln

        # first collect results from CSV files
-        results = collect(csv_paths, renames, all_defines)
+        fields_, results = collect(csv_paths, all_renames, all_defines)
+
+        # if y not specified, guess it's anything not in by/defines/x/renames
+        all_y_ = all_y
+        if not all_y:
+            all_y_ = [
+                k for k in fields_
+                    if k not in all_by
+                        and not any(k == k_ for k_, _ in all_defines)
+                        and not any(k == old_k for _, old_k in all_renames)]

        # then extract the requested datasets
-        datasets_ = fold(results, all_by, all_x, all_y)
+        datasets_ = fold(results, all_by, all_x, all_y_)

        # figure out colors/chars here so that subplot defines
        # don't change them later, that'd be bad
@@ -1143,7 +1147,7 @@ def main(csv_paths, *,

            # data can be constrained by subplot-specific defines,
            # so re-extract for each plot
-            subdatasets = fold(results, all_by, all_x, all_y, define_)
+            subdatasets = fold(results, all_by, all_x, all_y_, define_)

            # filter by subplot x/y
            subdatasets = co.OrderedDict([(name, dataset)