Changed scripts to not infer field purposes from CSV values

Note there's a bit of subtlety here, field _types_ are still inferred,
but the intention of the fields, i.e. if the field contains data vs
row name/other properties, must be unambiguous in the scripts.

There is still a _tiny_ bit of inference. For most scripts only one
of --by or --fields is strictly needed, since this makes the purpose of
the other fields unambiguous.

The reason for this change is so the scripts are a bit more reliable,
but also because this simplifies the data parsing/inference a bit.

Oh, and this also changes field inference to use the csv.DictReader's
fieldnames field instead of only inspecting the returned dicts. This
should also save a bit of O(n) overhead when parsing CSV files.
This commit is contained in:
Christopher Haster
2023-11-04 15:24:18 -05:00
parent 2be3ff57c5
commit d0a6ef0c89
12 changed files with 187 additions and 200 deletions

View File

@@ -191,11 +191,15 @@ def dat(x):
def collect(csv_paths, renames=[], defines=[]):
# collect results from CSV files
fields = []
results = []
for path in csv_paths:
try:
with openio(path) as f:
reader = csv.DictReader(f, restval='')
fields.extend(
k for k in reader.fieldnames
if k not in fields)
for r in reader:
# apply any renames
if renames:
@@ -214,7 +218,7 @@ def collect(csv_paths, renames=[], defines=[]):
except FileNotFoundError:
pass
return results
return fields, results
def fold(results, by=None, x=None, y=None, defines=[]):
# filter by matching defines
@@ -225,29 +229,16 @@ def fold(results, by=None, x=None, y=None, defines=[]):
results_.append(r)
results = results_
# if y not specified, try to guess from data
if not y:
y = co.OrderedDict()
for r in results:
for k, v in r.items():
if (not by or k not in by) and v.strip():
try:
dat(v)
y[k] = True
except ValueError:
y[k] = False
y = list(k for k,v in y.items() if v)
if by:
# find all 'by' values
ks = set()
keys = set()
for r in results:
ks.add(tuple(r.get(k, '') for k in by))
ks = sorted(ks)
keys.add(tuple(r.get(k, '') for k in by))
keys = sorted(keys)
# collect all datasets
datasets = co.OrderedDict()
for ks_ in (ks if by else [()]):
for key in (keys if by else [()]):
for x_ in (x if x else [None]):
for y_ in y:
# organize by 'by', x, and y
@@ -257,7 +248,7 @@ def fold(results, by=None, x=None, y=None, defines=[]):
# filter by 'by'
if by and not all(
k in r and r[k] == v
for k, v in zip(by, ks_)):
for k, v in zip(by, key)):
continue
# find xs
@@ -288,8 +279,8 @@ def fold(results, by=None, x=None, y=None, defines=[]):
# hide x/y if there is only one field
k_x = x_ if len(x or []) > 1 else ''
k_y = y_ if len(y or []) > 1 or (not ks_ and not k_x) else ''
datasets[ks_ + (k_x, k_y)] = dataset
k_y = y_ if len(y or []) > 1 or (not key and not k_x) else ''
datasets[key + (k_x, k_y)] = dataset
return datasets
@@ -746,15 +737,27 @@ def main(csv_paths, output, *,
all_defines = sorted(all_defines.items())
# separate out renames
renames = list(it.chain.from_iterable(
all_renames = list(it.chain.from_iterable(
((k, v) for v in vs)
for k, vs in it.chain(all_by, all_x, all_y)))
all_by = [k for k, _ in all_by]
all_x = [k for k, _ in all_x]
all_y = [k for k, _ in all_y]
if not all_by and not all_y:
print("error: needs --by or -y to figure out fields")
sys.exit(-1)
# first collect results from CSV files
results = collect(csv_paths, renames, all_defines)
fields_, results = collect(csv_paths, all_renames, all_defines)
# if y not specified, guess it's anything not in by/defines/x/renames
if not all_y:
all_y = [
k for k in fields_
if k not in all_by
and not any(k == k_ for k_, _ in all_defines)
and not any(k == old_k for _, old_k in all_renames)]
# then extract the requested datasets
datasets_ = fold(results, all_by, all_x, all_y)