From a3ac512cc18bc4b9aa7a677f50ba15b8a1608557 Mon Sep 17 00:00:00 2001 From: Christopher Haster Date: Fri, 13 Dec 2024 17:49:38 -0600 Subject: [PATCH] scripts: Adopted Parser class in prettyasserts.py This ended up being a pretty in-depth rework of prettyasserts.py to adopt the shared Parser class. But now prettyasserts.py should be both more robust and faster. The tricky parts: - The Parser class eagerly munches whitespace by default. This is usually a good thing, but for prettyasserts.py we need to keep track of the whitespace somehow in order to write it to the output file. The solution here is a little bit hacky. Instead of complicating the Parser class, we implicitly add a regex group for whitespace when compiling our lexer. Unfortunately this does make last-minute patching of the lexer a bit messy (for things like -p/--prefix, etc), thanks to Python's re.Pattern class not being extendable. To work around this, the Lexer class keeps track of the original patterns to allow recompilation. - Since we no longer tokenize in a separate pass, we can't use the None token to match any unmatched tokens. Fortunately this can be worked around with sufficiently ugly regex. See the 'STUFF' rule. It's a good thing Python has negative lookaheads. On the flip side, this means we no longer need to explicitly specify all possible tokens when multiple tokens overlap. - Unlike stack.py/csv.py, prettyasserts.py needs multi-token lookahead. Fortunately this has a pretty straightforward solution with the addition of an optional stack to the Parser class. We can even have a bit of fun with Python's with statements (though I do wish with statements could have else clauses, so we wouldn't need double nesting to catch parser exceptions). --- In addition to adopting the new Parser class, I also made sure to eliminate intermediate string allocation through heavy use of Python's io.StringIO class. 
This, plus Parser's cheap shallow chomp/slice operations, gives prettyasserts.py a much needed speed boost. (Honestly, the original prettyasserts.py was pretty naive, with the assumption that it wouldn't be the bottleneck during compilation. This turned out to be wrong.) These changes cut total compile time in ~half: real user sys before (time make test-runner -j): 0m56.202s 2m31.853s 0m2.827s after (time make test-runner -j): 0m26.836s 1m51.213s 0m2.338s Keep in mind this includes both prettyasserts.py and gcc -Os (and other Makefile stuff). --- scripts/csv.py | 42 ++- scripts/prettyasserts.py | 564 ++++++++++++++++++++++++--------------- scripts/stack.py | 42 ++- 3 files changed, 406 insertions(+), 242 deletions(-) diff --git a/scripts/csv.py b/scripts/csv.py index cddd48d1..a540ec3a 100755 --- a/scripts/csv.py +++ b/scripts/csv.py @@ -370,7 +370,7 @@ class RGStddev: # basically just because memoryview doesn't support strs class Parser: def __init__(self, data, ws='\s*', ws_flags=0): - self.data = data.lstrip() + self.data = data self.i = 0 self.m = None # also consume whitespace @@ -378,9 +378,10 @@ class Parser: self.i = self.ws.match(self.data, self.i).end() def __repr__(self): - return '%s(%r...)' % ( - self.__class__.__name__, - self.data[self.i:self.i+32]) + if len(self.data) - self.i <= 32: + return repr(self.data[self.i:]) + else: + return "%s..." % repr(self.data[self.i:self.i+32])[:32] def __str__(self): return self.data[self.i:] @@ -411,15 +412,36 @@ class Parser: def chompmatch(self, pattern, flags=0, *groups): if not self.match(pattern, flags): - raise Parser.Error( - "expected %r, found %r..." % ( - pattern, self.data[self.i:self.i+32])) + raise Parser.Error("expected %r, found %r" % (pattern, self)) return self.chomp(*groups) def unexpected(self): - raise Parser.Error( - "unexpected %r..." 
% ( - self.data[self.i:self.i+32])) + raise Parser.Error("unexpected %r" % self) + + def lookahead(self): + # push state on the stack + if not hasattr(self, 'stack'): + self.stack = [] + self.stack.append((self.i, self.m)) + return self + + def consume(self): + # pop and use new state + self.stack.pop() + + def discard(self): + # pop and discard new state + self.i, self.m = self.stack.pop() + + def __enter__(self): + return self + + def __exit__(self, et, ev, tb): + # keep new state if no exception occured + if et is None: + self.consume() + else: + self.discard() # a lazily-evaluated field expression class RExpr: diff --git a/scripts/prettyasserts.py b/scripts/prettyasserts.py index 70ddc27d..dca5c2ef 100755 --- a/scripts/prettyasserts.py +++ b/scripts/prettyasserts.py @@ -13,12 +13,15 @@ # prevent local imports __import__('sys').path.pop(0) +import io import re import sys +# default prettyassert limit LIMIT = 16 +# comparison ops CMP = { '==': 'eq', '!=': 'ne', @@ -28,22 +31,50 @@ CMP = { '==': 'eq', '!=': 'ne', '>': 'gt', } -LEXEMES = { - 'ws': [r'(?:\s|\n|#.*?(?<=\n)|//.*?(?<=\n)|/\*.*?\*/)'], - 'assert': ['assert'], - 'unreachable': ['unreachable'], - 'memcmp': ['memcmp'], - 'strcmp': ['strcmp'], - 'arrow': ['=>'], - 'string': [r'"(?:\\.|[^"])*"', r"'(?:\\.|[^'])\'"], - 'paren': ['\(', '\)'], - 'cmp': list(CMP.keys()), - 'logic': ['\&\&', '\|\|'], - 'sep': ['\?', ':', ','], - 'term': [';', '\{', '\}'], - # specifically ops that conflict with cmp - 'op': ['->', '>>', '<<'], -} +# helper class for lexical regexes +class Lexer: + def __init__(self): + self.patterns = {} + + def lex(self, k, *patterns): + # compile with whitespace + l = re.compile( + '(?P<token>%s)(?P<ws>(?:%s)*)' % ( + '|'.join(patterns) if patterns + # force a failure if we have no patterns + else '(?!)', + '|'.join(self.patterns.get('WS', ''))), + re.DOTALL) + # add to class members + setattr(self, k, l) + # keep track of patterns + self.patterns[k] = patterns + + def extend(self, k, *patterns): + self.lex(k, *self.patterns.get(k, []), *patterns) + +L = Lexer() +L.lex('WS', r'(?:\s|\n|#.*?(?<=\n)|//.*?(?<=\n)|/\*.*?\*/)') +L.lex('ASSERT', r'\bassert\b') +L.lex('UNREACHABLE', r'\bunreachable\b') +L.lex('MEMCMP', r'\bmemcmp\b') +L.lex('STRCMP', r'\bstrcmp\b') +L.lex('ARROW', '=>') +L.lex('STR', r'"(?:\\.|[^"])*"', r"'(?:\\.|[^'])\'") +L.lex('LPAREN', '\(') 
+L.lex('RPAREN', '\)') +L.lex('ZERO', '\\b0\\b') +L.lex('CMP', *CMP.keys()) +L.lex('LOGIC', '\&\&', '\|\|') +L.lex('TERN', '\?', ':') +L.lex('COMMA', ',') +L.lex('TERM', ';', '\{', '\}') +L.lex('STUFF', '[^;{}?:,()"\'=!<>\-&|/#]+', + # these need special handling because we're only + # using regex + '->', '>>', '<<', '-(?!>)', + '=(?![=>])', '!(?!=)', '&(?!&)', '\|(?!\|)', + '/(?![/*])') def openio(path, mode='r', buffering=-1): @@ -56,7 +87,7 @@ def openio(path, mode='r', buffering=-1): else: return open(path, mode, buffering) -def write_header(f, limit=LIMIT): +def mkheader(f, limit=LIMIT): f.writeln("// Generated by %s:" % sys.argv[0]) f.writeln("//") f.writeln("// %s" % ' '.join(sys.argv)) @@ -204,225 +235,306 @@ def write_header(f, limit=LIMIT): f.writeln() f.writeln() -def mkassert(type, cmp, lh, rh, size=None): +def mkassert(f, type, cmp, lh, rh, size=None): if size is not None: - return ("__PRETTY_ASSERT_%s_%s(%s, %s, %s)" % ( + f.write("__PRETTY_ASSERT_%s_%s(%s, %s, %s)" % ( type.upper(), cmp.upper(), lh, rh, size)) else: - return ("__PRETTY_ASSERT_%s_%s(%s, %s)" % ( + f.write("__PRETTY_ASSERT_%s_%s(%s, %s)" % ( type.upper(), cmp.upper(), lh, rh)) -def mkunreachable(): - return "__PRETTY_ASSERT_UNREACHABLE()" +def mkunreachable(f): + f.write("__PRETTY_ASSERT_UNREACHABLE()") -# simple recursive descent parser -class ParseFailure(Exception): - def __init__(self, expected, found): - self.expected = expected - self.found = found +# a simple general-purpose parser class +# +# basically just because memoryview doesn't support strs +class Parser: + def __init__(self, data, ws='\s*', ws_flags=0): + self.data = data + self.i = 0 + self.m = None + # also consume whitespace + self.ws = re.compile(ws, ws_flags) + self.i = self.ws.match(self.data, self.i).end() + + def __repr__(self): + if len(self.data) - self.i <= 32: + return repr(self.data[self.i:]) + else: + return "%s..." 
% repr(self.data[self.i:self.i+32])[:32] def __str__(self): - return "expected %r, found %s..." % ( - self.expected, repr(self.found)[:70]) + return self.data[self.i:] -class Parser: - def __init__(self, in_f, lexemes=LEXEMES): - p = '|'.join('(?P<%s>%s)' % (n, '|'.join(l)) - for n, l in lexemes.items()) - p = re.compile(p, re.DOTALL) - data = in_f.read() - tokens = [] - line = 1 - col = 0 - while True: - m = p.search(data) - if m: - if m.start() > 0: - tokens.append((None, data[:m.start()], line, col)) - tokens.append((m.lastgroup, m.group(), line, col)) - data = data[m.end():] - else: - tokens.append((None, data, line, col)) - break - self.tokens = tokens - self.off = 0 + def __len__(self): + return len(self.data) - self.i - def lookahead(self, *pattern): - if self.off < len(self.tokens): - token = self.tokens[self.off] - if token[0] in pattern or token[1] in pattern: - self.m = token[1] - return self.m - self.m = None + def __bool__(self): + return self.i != len(self.data) + + def match(self, pattern, flags=0): + # compile so we can use the pos arg, this is still cached + self.m = re.compile(pattern, flags).match(self.data, self.i) return self.m - def accept(self, *patterns): - m = self.lookahead(*patterns) - if m is not None: - self.off += 1 - return m + def group(self, *groups): + return self.m.group(*groups) - def expect(self, *patterns): - m = self.accept(*patterns) - if not m: - raise ParseFailure(patterns, self.tokens[self.off:]) - return m + def chomp(self, *groups): + g = self.group(*groups) + self.i = self.m.end() + # also consume whitespace + self.i = self.ws.match(self.data, self.i).end() + return g - def push(self): - return self.off + class Error(Exception): + pass - def pop(self, state): - self.off = state + def chompmatch(self, pattern, flags=0, *groups): + if not self.match(pattern, flags): + raise Parser.Error("expected %r, found %r" % (pattern, self)) + return self.chomp(*groups) -def p_assert(p): - state = p.push() + def unexpected(self): + 
raise Parser.Error("unexpected %r" % self) + def lookahead(self): + # push state on the stack + if not hasattr(self, 'stack'): + self.stack = [] + self.stack.append((self.i, self.m)) + return self + + def consume(self): + # pop and use new state + self.stack.pop() + + def discard(self): + # pop and discard new state + self.i, self.m = self.stack.pop() + + def __enter__(self): + return self + + def __exit__(self, et, ev, tb): + # keep new state if no exception occured + if et is None: + self.consume() + else: + self.discard() + + +# parse rules + +def p_assert(p, f): # assert(memcmp(a,b,size) cmp 0)? try: - p.expect('assert') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - p.expect('memcmp') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - lh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - rh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - size = p_expr(p) ; p.accept('ws') - p.expect(')') ; p.accept('ws') - cmp = p.expect('cmp') ; p.accept('ws') - p.expect('0') ; p.accept('ws') - p.expect(')') - return mkassert('mem', CMP[cmp], lh, rh, size) - except ParseFailure: - p.pop(state) + with p.lookahead(): + p.chompmatch(L.ASSERT) + p.chompmatch(L.LPAREN) + p.chompmatch(L.MEMCMP) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_expr(p, lh) + lh = lh.getvalue() + p.chompmatch(L.COMMA) + rh = io.StringIO() + p_expr(p, rh) + rh = rh.getvalue() + p.chompmatch(L.COMMA) + size = io.StringIO() + p_expr(p, size) + size = size.getvalue() + p.chompmatch(L.RPAREN) + cmp = p.chompmatch(L.CMP, 0, 'token') + p.chompmatch(L.ZERO) + ws = p.chompmatch(L.RPAREN, 0, 'ws') + mkassert(f, 'mem', CMP[cmp], lh, rh, size) + f.write(ws) + return + except Parser.Error: + pass # assert(strcmp(a,b) cmp 0)? 
try: - p.expect('assert') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - p.expect('strcmp') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - lh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - rh = p_expr(p) ; p.accept('ws') - p.expect(')') ; p.accept('ws') - cmp = p.expect('cmp') ; p.accept('ws') - p.expect('0') ; p.accept('ws') - p.expect(')') - return mkassert('str', CMP[cmp], lh, rh) - except ParseFailure: - p.pop(state) + with p.lookahead(): + p.chompmatch(L.ASSERT) + p.chompmatch(L.LPAREN) + p.chompmatch(L.STRCMP) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_expr(p, lh) + lh = lh.getvalue() + p.chompmatch(L.COMMA) + rh = io.StringIO() + p_expr(p, rh) + rh = rh.getvalue() + p.chompmatch(L.RPAREN) + cmp = p.chompmatch(L.CMP, 0, 'token') + p.chompmatch(L.ZERO) + ws = p.chompmatch(L.RPAREN, 0, 'ws') + mkassert(f, 'str', CMP[cmp], lh, rh) + f.write(ws) + return + except Parser.Error: + pass # assert(a cmp b)? try: - p.expect('assert') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - lh = p_expr(p) ; p.accept('ws') - cmp = p.expect('cmp') ; p.accept('ws') - rh = p_expr(p) ; p.accept('ws') - p.expect(')') - return mkassert('int', CMP[cmp], lh, rh) - except ParseFailure: - p.pop(state) + with p.lookahead(): + p.chompmatch(L.ASSERT) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_simpleexpr(p, lh) + lh = lh.getvalue() + cmp = p.chompmatch(L.CMP, 0, 'token') + rh = io.StringIO() + p_simpleexpr(p, rh) + rh = rh.getvalue() + ws = p.chompmatch(L.RPAREN, 0, 'ws') + mkassert(f, 'int', CMP[cmp], lh, rh) + f.write(ws) + return + except Parser.Error: + pass # assert(a)? 
- p.expect('assert') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - lh = p_exprs(p) ; p.accept('ws') - p.expect(')') - return mkassert('bool', 'eq', lh, 'true') + p.chompmatch(L.ASSERT) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_exprs(p, lh) + lh = lh.getvalue() + ws = p.chompmatch(L.RPAREN, 0, 'ws') + mkassert(f, 'bool', 'eq', lh, 'true') + f.write(ws) -def p_unreachable(p): +def p_unreachable(p, f): # unreachable()? - p.expect('unreachable') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - p.expect(')') - return mkunreachable() + p.chompmatch(L.UNREACHABLE) + p.chompmatch(L.LPAREN) + ws = p.chompmatch(L.RPAREN, 0, 'ws') + mkunreachable(f) + f.write(ws) -def p_expr(p): - res = [] +def p_simpleexpr(p, f): while True: - if p.accept('('): - res.append(p.m) + # parens + if p.match(L.LPAREN): + f.write(p.chomp()) + # allow terms in parens while True: - res.append(p_exprs(p)) - if p.accept('sep', 'term'): - res.append(p.m) + p_exprs(p, f) + if p.match(L.TERM): + f.write(p.chomp()) else: break - res.append(p.expect(')')) - elif p.lookahead('assert'): - state = p.push() + f.write(p.chompmatch(L.RPAREN)) + # asserts + elif p.match(L.ASSERT): try: - res.append(p_assert(p)) - except ParseFailure: - p.pop(state) - res.append(p.expect('assert')) - elif p.lookahead('unreachable'): - state = p.push() + with p.lookahead(): + p_assert(p, f) + except Parser.Error: + f.write(p.chomp()) + # unreachables + elif p.match(L.UNREACHABLE): try: - res.append(p_unreachable(p)) - except ParseFailure: - p.pop(state) - res.append(p.expect('unreachable')) - elif p.accept('memcmp', 'strcmp', 'string', 'op', 'ws', None): - res.append(p.m) + with p.lookahead(): + p_unreachable(p, f) + except Parser.Error: + f.write(p.chomp()) + # anything else + elif p.match(L.STR) or p.match(L.STUFF): + f.write(p.chomp()) else: - return ''.join(res) + break -def p_exprs(p): - res = [] +def p_expr(p, f): while True: - res.append(p_expr(p)) - if p.accept('cmp', 'logic', 'sep'): - res.append(p.m) + 
p_simpleexpr(p, f) + # continue if we hit a complex expr + if p.match(L.CMP) or p.match(L.LOGIC) or p.match(L.TERN): + f.write(p.chomp()) else: - return ''.join(res) + break -def p_stmt(p): - ws = p.accept('ws') or '' +def p_exprs(p, f): + while True: + p_expr(p, f) + # continue if we hit a comma + if p.match(L.COMMA): + f.write(p.chomp()) + else: + break + +def p_stmt(p, f): + # leading whitespace? + if p.match(L.WS): + f.write(p.chomp()) # memcmp(lh,rh,size) => 0? - if p.lookahead('memcmp'): - state = p.push() + if p.match(L.MEMCMP): try: - p.expect('memcmp') ; p.accept('ws') - p.expect('(') ; p.accept('ws') - lh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - rh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - size = p_expr(p) ; p.accept('ws') - p.expect(')') ; p.accept('ws') - p.expect('=>') ; p.accept('ws') - p.expect('0') ; p.accept('ws') - return ws + mkassert('mem', 'eq', lh, rh, size) - except ParseFailure: - p.pop(state) + with p.lookahead(): + p.chompmatch(L.MEMCMP) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_expr(p, lh) + lh = lh.getvalue() + p.chompmatch(L.COMMA) + rh = io.StringIO() + p_expr(p, rh) + rh = rh.getvalue() + p.chompmatch(L.COMMA) + size = io.StringIO() + p_expr(p, size) + size = size.getvalue() + p.chompmatch(L.RPAREN) + p.chompmatch(L.ARROW) + ws = p.chompmatch(L.ZERO, 0, 'ws') + mkassert(f, 'mem', 'eq', lh, rh, size) + f.write(ws) + return + except Parser.Error: + pass # strcmp(lh,rh) => 0? 
- if p.lookahead('strcmp'): - state = p.push() + if p.match(L.STRCMP): try: - p.expect('strcmp') ; p.accept('ws') ; p.expect('(') ; p.accept('ws') - lh = p_expr(p) ; p.accept('ws') - p.expect(',') ; p.accept('ws') - rh = p_expr(p) ; p.accept('ws') - p.expect(')') ; p.accept('ws') - p.expect('=>') ; p.accept('ws') - p.expect('0') ; p.accept('ws') - return ws + mkassert('str', 'eq', lh, rh) - except ParseFailure: - p.pop(state) + with p.lookahead(): + p.chompmatch(L.STRCMP) + p.chompmatch(L.LPAREN) + lh = io.StringIO() + p_expr(p, lh) + lh = lh.getvalue() + p.chompmatch(L.COMMA) + rh = io.StringIO() + p_expr(p, rh) + rh = rh.getvalue() + p.chompmatch(L.RPAREN) + p.chompmatch(L.ARROW) + ws = p.chompmatch(L.ZERO, 0, 'ws') + mkassert(f, 'str', 'eq', lh, rh) + f.write(ws) + return + except Parser.Error: + pass # lh => rh? - lh = p_exprs(p) - if p.accept('=>'): - rh = p_exprs(p) - return ws + mkassert('int', 'eq', lh, rh) + lh = io.StringIO() + p_exprs(p, lh) + lh = lh.getvalue() + if p.match(L.ARROW): + p.chomp() + rh = io.StringIO() + p_exprs(p, rh) + rh = rh.getvalue() + mkassert(f, 'int', 'eq', lh, rh) else: - return ws + lh + f.write(lh) + def main(input=None, output=None, *, prefix=[], @@ -435,35 +547,38 @@ def main(input=None, output=None, *, no_upper=False, no_arrows=False, limit=LIMIT): + # modify lexer rules? 
+ if no_defaults: + L.lex('ASSERT', []) + L.lex('UNREACHABLE', []) + L.lex('MEMCMP', []) + L.lex('STRCMP', []) + for p in prefix + prefix_insensitive: + L.extend('ASSERT', r'\b%sassert\b' % p) + L.extend('UNREACHABLE', r'\b%sunreachable\b' % p) + L.extend('MEMCMP', r'\b%smemcmp\b' % p) + L.extend('STRCMP', r'\b%sstrcmp\b' % p) + for p in prefix_insensitive: + L.extend('ASSERT', r'\b%sassert\b' % p.lower()) + L.extend('UNREACHABLE', r'\b%sunreachable\b' % p.lower()) + L.extend('MEMCMP', r'\b%smemcmp\b' % p.lower()) + L.extend('STRCMP', r'\b%sstrcmp\b' % p.lower()) + L.extend('ASSERT', r'\b%sASSERT\b' % p.upper()) + L.extend('UNREACHABLE', r'\b%sUNREACHABLE\b' % p.upper()) + L.extend('MEMCMP', r'\b%sMEMCMP\b' % p.upper()) + L.extend('STRCMP', r'\b%sSTRCMP\b' % p.upper()) + if assert_: + L.extend('ASSERT', *[r'\b%s\b' % r for r in assert_]) + if unreachable: + L.extend('UNREACHABLE', *[r'\b%s\b' % r for r in unreachable]) + if memcmp: + L.extend('MEMCMP', *[r'\b%s\b' % r for r in memcmp]) + if strcmp: + L.extend('STRCMP', *[r'\b%s\b' % r for r in strcmp]) + + # start parsing with openio(input or '-', 'r') as in_f: - # create parser - lexemes = {n: l.copy() for n, l in LEXEMES.items()} - if no_defaults: - lexemes['assert'] = [] - lexemes['unreachable'] = [] - lexemes['memcmp'] = [] - lexemes['strcmp'] = [] - if no_arrows: - lexemes['arrow'] = [] - for p in prefix + prefix_insensitive: - lexemes['assert'].append(r'\b%sassert\b' % p) - lexemes['unreachable'].append(r'\b%sunreachable\b' % p) - lexemes['memcmp'].append(r'\b%smemcmp\b' % p) - lexemes['strcmp'].append(r'\b%sstrcmp\b' % p) - for p in prefix_insensitive: - lexemes['assert'].append(r'\b%sassert\b' % p.lower()) - lexemes['unreachable'].append(r'\b%sunreachable\b' % p.lower()) - lexemes['memcmp'].append(r'\b%smemcmp\b' % p.lower()) - lexemes['strcmp'].append(r'\b%sstrcmp\b' % p.lower()) - lexemes['assert'].append(r'\b%sASSERT\b' % p.upper()) - lexemes['unreachable'].append(r'\b%sUNREACHABLE\b' % p.upper()) - 
lexemes['memcmp'].append(r'\b%sMEMCMP\b' % p.upper()) - lexemes['strcmp'].append(r'\b%sSTRCMP\b' % p.upper()) - lexemes['assert'].extend(r'\b%s\b' % r for r in assert_) - lexemes['unreachable'].extend(r'\b%s\b' % r for r in unreachable) - lexemes['memcmp'].extend(r'\b%s\b' % r for r in memcmp) - lexemes['strcmp'].extend(r'\b%s\b' % r for r in strcmp) - p = Parser(in_f, lexemes) + p = Parser(in_f.read(), '') with openio(output or '-', 'w') as f: def writeln(s=''): @@ -472,24 +587,29 @@ def main(input=None, output=None, *, f.writeln = writeln # write extra verbose asserts - write_header(f, limit=limit) + mkheader(f, limit=limit) if input is not None: f.writeln("#line %d \"%s\"" % (1, input)) # parse and write out stmt at a time try: while True: - f.write(p_stmt(p)) - if p.accept('term'): - f.write(p.m) + p_stmt(p, f) + if p.match(L.TERM): + f.write(p.chomp()) else: break - except ParseFailure as e: - print('warning: %s' % e) - pass - for i in range(p.off, len(p.tokens)): - f.write(p.tokens[i][1]) + # trailing junk? + if p: + p.unexpected() + + except Parser.Error as e: + # warn on error + print('warning: %s' % e) + # still write out the rest of the file so compiler + # errors can be reported, these are usually more useful + f.write(str(p)) if __name__ == "__main__": diff --git a/scripts/stack.py b/scripts/stack.py index bc44ab96..518e391f 100755 --- a/scripts/stack.py +++ b/scripts/stack.py @@ -171,7 +171,7 @@ def openio(path, mode='r', buffering=-1): # basically just because memoryview doesn't support strs class Parser: def __init__(self, data, ws='\s*', ws_flags=0): - self.data = data.lstrip() + self.data = data self.i = 0 self.m = None # also consume whitespace @@ -179,9 +179,10 @@ class Parser: self.i = self.ws.match(self.data, self.i).end() def __repr__(self): - return '%s(%r...)' % ( - self.__class__.__name__, - self.data[self.i:self.i+32]) + if len(self.data) - self.i <= 32: + return repr(self.data[self.i:]) + else: + return "%s..." 
% repr(self.data[self.i:self.i+32])[:32] def __str__(self): return self.data[self.i:] @@ -212,15 +213,36 @@ class Parser: def chompmatch(self, pattern, flags=0, *groups): if not self.match(pattern, flags): - raise Parser.Error( - "expected %r, found %r..." % ( - pattern, self.data[self.i:self.i+32])) + raise Parser.Error("expected %r, found %r" % (pattern, self)) return self.chomp(*groups) def unexpected(self): - raise Parser.Error( - "unexpected %r..." % ( - self.data[self.i:self.i+32])) + raise Parser.Error("unexpected %r" % self) + + def lookahead(self): + # push state on the stack + if not hasattr(self, 'stack'): + self.stack = [] + self.stack.append((self.i, self.m)) + return self + + def consume(self): + # pop and use new state + self.stack.pop() + + def discard(self): + # pop and discard new state + self.i, self.m = self.stack.pop() + + def __enter__(self): + return self + + def __exit__(self, et, ev, tb): + # keep new state if no exception occured + if et is None: + self.consume() + else: + self.discard() class CGNode(co.namedtuple('CGNode', [ 'name', 'file', 'size', 'qualifiers', 'calls'])):