Changed scripts to not infer field purposes from CSV values

Note there's a bit of subtlety here: field _types_ are still inferred, but the intention of the fields, i.e. whether a field contains data vs row name/other properties, must be unambiguous in the scripts. There is still a _tiny_ bit of inference: for most scripts only one of --by or --fields is strictly needed, since this makes the purpose of the other fields unambiguous. The reason for this change is to make the scripts a bit more reliable, but also because this simplifies the data parsing/inference a bit. Oh, and this also changes field inference to use csv.DictReader's fieldnames attribute instead of only inspecting the returned dicts. This should also save a bit of O(n) overhead when parsing CSV files.
2023-11-04 15:24:18 -05:00
parent 2be3ff57c5
commit d0a6ef0c89
12 changed files with 187 additions and 200 deletions
--- a/scripts/avg.py
+++ b/scripts/avg.py
@@ -46,11 +46,15 @@ def dat(x):

 def collect(csv_paths, renames=[], defines=[]):
    # collect results from CSV files
+    fields = []
    results = []
    for path in csv_paths:
        try:
            with openio(path) as f:
                reader = csv.DictReader(f, restval='')
+                fields.extend(
+                    k for k in reader.fieldnames
+                    if k not in fields)
                for r in reader:
                    # apply any renames
                    if renames:
@@ -69,7 +73,7 @@ def collect(csv_paths, renames=[], defines=[]):
        except FileNotFoundError:
            pass

-    return results
+    return fields, results

 def main(csv_paths, output, *,
        sum=False,
@@ -81,8 +85,8 @@ def main(csv_paths, output, *,
        stddev=False,
        gmean=False,
        gstddev=False,
-        meas=None,
        by=None,
+        meas=None,
        seeds=None,
        fields=None,
        defines=[]):
@@ -113,40 +117,41 @@ def main(csv_paths, output, *,
    if fields is not None:
        fields = [k for k, _ in fields]

+    if by is None and fields is None:
+        print("error: needs --by or --fields to figure out fields")
+        sys.exit(-1)
+
    # collect results from csv files
-    results = collect(csv_paths, renames, defines)
+    fields_, results = collect(csv_paths, renames, defines)

-    # if fields not specified, try to guess from data
-    if fields is None:
-        fields = co.OrderedDict()
-        for r in results:
-            for k, v in r.items():
-                if k not in (by or []) and k not in (seeds or []) and v.strip():
-                    try:
-                        dat(v)
-                        fields[k] = True
-                    except ValueError:
-                        fields[k] = False
-        fields = list(k for k,v in fields.items() if v)
-
-    # if by not specified, guess it's anything not in seeds/fields and not a
-    # source of a rename
+    # if by not specified, guess it's anything not in
+    # seeds/fields/renames/defines
    if by is None:
-        by = co.OrderedDict()
-        for r in results:
-            # also ignore None keys, these are introduced by csv.DictReader
-            # when header + row mismatch
-            by.update((k, True) for k in r.keys()
-                if k is not None
-                    and k not in (seeds or [])
-                    and k not in fields
-                    and not any(k == old_k for _, old_k in renames))
-        by = list(by.keys())
+        by = [
+            k for k in fields_
+            if k not in (seeds or [])
+                and k not in (fields or [])
+                and not any(k == old_k for _, old_k in renames)
+                and not any(k == k_ for k_, _ in defines)]
+
+    # if fields not specified, guess it's anything not in
+    # by/seeds/renames/defines
+    if fields is None:
+        fields = [
+            k for k in fields_
+            if k not in (by or [])
+                and k not in (seeds or [])
+                and not any(k == old_k for _, old_k in renames)
+                and not any(k == k_ for k_, _ in defines)]
+
+    # add meas to by if it isn't already present
+    if meas is not None and meas not in by:
+        by.append(meas)

    # convert fields to ints/floats
    for r in results:
        for k in fields:
-            if k in r:
+            if k in r and isinstance(r[k], str):
                r[k] = dat(r[k]) if r[k].strip() else 0

    # organize by 'by' values
@@ -162,11 +167,10 @@ def main(csv_paths, output, *,
        vs = {f: [] for f in fields}
        meas__ = None
        for r in rs:
-            if all(k in r and r[k] == v for k, v in zip(by, key)):
-                for f in fields:
-                    vs[f].append(r.get(f, 0))
-                if meas is not None and meas in r:
-                    meas__ = r[meas]
+            for f in fields:
+                vs[f].append(r.get(f, 0))
+            if meas is not None and meas in r:
+                meas__ = r[meas]

        def append(meas_, f_):
            avgs.append(
@@ -197,8 +201,7 @@ def main(csv_paths, output, *,

    # write results to CSVS
    with openio(output, 'w') as f:
-        writer = csv.DictWriter(f,
-            by + ([meas] if meas not in by else []) + fields)
+        writer = csv.DictWriter(f, by + fields)
        writer.writeheader()
        for r in avgs:
            writer.writerow(r)
@@ -254,10 +257,6 @@ if __name__ == "__main__":
        '--gstddev',
        action='store_true',
        help="Compute the geometric standard deviation.")
-    parser.add_argument(
-        '-m', '--meas',
-        help="Optional name of measurement name field. If provided, the name "
-            "will be modified with +amor or +per.")
    parser.add_argument(
        '-b', '--by',
        action='append',
@@ -268,6 +267,10 @@ if __name__ == "__main__":
                    if vs is not None else ())
            )(*x.split('=', 1)),
        help="Group by this field. Can rename fields with new_name=old_name.")
+    parser.add_argument(
+        '-m', '--meas',
+        help="Optional name of measurement name field. If provided, the name "
+            "will be modified with +amor or +per.")
    parser.add_argument(
        '-s', '--seed',
        dest='seeds',