Tweaked scripts that consume .csv files to filter defines early

With the quantity of data being output by bench.py now, filtering ASAP
while parsing CSV files is a valuable optimization. And thanks to how
CSV files are structured, we can even avoid ever loading the full
contents into RAM.

This does end up with us filtering for defines redundantly in a few
places, but this is well worth the saved overhead from early filtering.

Also tried to clean up plot.py/plotmpl.py's data folding path,
though that may have been wasted effort.
This commit is contained in:
Christopher Haster
2023-11-03 14:30:22 -05:00
parent fb9277feac
commit 616b4e1c9e
10 changed files with 274 additions and 212 deletions

View File

@@ -443,7 +443,7 @@ class Plot:
return ''.join(row_)
def collect(csv_paths, renames=[]):
def collect(csv_paths, renames=[], defines=[]):
# collect results from CSV files
results = []
for path in csv_paths:
@@ -451,64 +451,33 @@ def collect(csv_paths, renames=[]):
with openio(path) as f:
reader = csv.DictReader(f, restval='')
for r in reader:
# apply any renames
if renames:
# make a copy so renames can overlap
r_ = {}
for new_k, old_k in renames:
if old_k in r:
r_[new_k] = r[old_k]
r.update(r_)
# filter by matching defines
if not all(k in r and r[k] in vs for k, vs in defines):
continue
results.append(r)
except FileNotFoundError:
pass
if renames:
for r in results:
# make a copy so renames can overlap
r_ = {}
for new_k, old_k in renames:
if old_k in r:
r_[new_k] = r[old_k]
r.update(r_)
return results
def dataset(results, x=None, y=None, define=[]):
# organize by 'by', x, and y
dataset = []
i = 0
for r in results:
# filter results by matching defines
if not all(k in r and r[k] in vs for k, vs in define):
continue
# find xs
if x is not None:
if x not in r:
continue
try:
x_ = dat(r[x])
except ValueError:
continue
else:
x_ = i
i += 1
# find ys
if y is not None:
if y not in r:
continue
try:
y_ = dat(r[y])
except ValueError:
continue
else:
y_ = None
dataset.append((x_, y_))
return dataset
def datasets(results, by=None, x=None, y=None, define=[]):
# filter results by matching defines
results_ = []
for r in results:
if all(k in r and r[k] in vs for k, vs in define):
results_.append(r)
results = results_
def fold(results, by=None, x=None, y=None, defines=[]):
# filter by matching defines
if defines:
results_ = []
for r in results:
if all(k in r and r[k] in vs for k, vs in defines):
results_.append(r)
results = results_
# if y not specified, try to guess from data
if not y:
@@ -535,16 +504,46 @@ def datasets(results, by=None, x=None, y=None, define=[]):
for ks_ in (ks if by else [()]):
for x_ in (x if x else [None]):
for y_ in y:
# organize by 'by', x, and y
dataset = []
i = 0
for r in results:
# filter by 'by'
if by and not all(
k in r and r[k] == v
for k, v in zip(by, ks_)):
continue
# find xs
if x_ is not None:
if x_ not in r:
continue
try:
x__ = dat(r[x_])
except ValueError:
continue
else:
# fallback to enumeration
x__ = i
i += 1
# find ys
if y_ is not None:
if y_ not in r:
continue
try:
y__ = dat(r[y_])
except ValueError:
continue
else:
y__ = None
dataset.append((x__, y__))
# hide x/y if there is only one field
k_x = x_ if len(x or []) > 1 else ''
k_y = y_ if len(y or []) > 1 or (not ks_ and not k_x) else ''
datasets[ks_ + (k_x, k_y)] = dataset(
results,
x_,
y_,
[(by_, {k_}) for by_, k_ in zip(by, ks_)]
if by else [])
datasets[ks_ + (k_x, k_y)] = dataset
return datasets
@@ -898,6 +897,11 @@ def main(csv_paths, *,
all_by = (by or []) + subplots_get('by', **subplot, subplots=subplots)
all_x = (x or []) + subplots_get('x', **subplot, subplots=subplots)
all_y = (y or []) + subplots_get('y', **subplot, subplots=subplots)
all_defines = co.defaultdict(lambda: set())
for k, vs in it.chain(define or [],
subplots_get('define', **subplot, subplots=subplots)):
all_defines[k] |= vs
all_defines = sorted(all_defines.items())
# separate out renames
renames = list(it.chain.from_iterable(
@@ -990,10 +994,10 @@ def main(csv_paths, *,
f.writeln = writeln
# first collect results from CSV files
results = collect(csv_paths, renames)
results = collect(csv_paths, renames, all_defines)
# then extract the requested datasets
datasets_ = datasets(results, all_by, all_x, all_y, define)
datasets_ = fold(results, all_by, all_x, all_y)
# figure out colors/chars here so that subplot defines
# don't change them later, that'd be bad
@@ -1139,7 +1143,7 @@ def main(csv_paths, *,
# data can be constrained by subplot-specific defines,
# so re-extract for each plot
subdatasets = datasets(results, all_by, all_x, all_y, define_)
subdatasets = fold(results, all_by, all_x, all_y, define_)
# filter by subplot x/y
subdatasets = co.OrderedDict([(name, dataset)