From a4c52c5a3d19b5527023fedfaae96cb717d03802 Mon Sep 17 00:00:00 2001
From: Damien George <damien.p.george@gmail.com>
Date: Fri, 5 Dec 2014 19:35:18 +0000
Subject: [PATCH] py: Optimise lexer by exposing lexer type.

mp_lexer_t type is exposed, mp_token_t type is removed, and simple lexer
functions (like checking current token kind) are now inlined.

This saves 784 bytes ROM on 32-bit unix, 348 bytes on stmhal, and 460
bytes on bare-arm.  It also saves a tiny bit of RAM since mp_lexer_t
is a bit smaller.  The code will also run a bit more efficiently.
---
 bare-arm/main.c    |   2 +-
 py/builtinimport.c |   2 +-
 py/lexer.c         | 155 +++++++++++++++++----------------------------
 py/lexer.h         |  42 +++++++-----
 py/lexerstr.c      |   2 +
 py/lexerunix.c     |   1 +
 py/parse.c         |  58 ++++++++---------
 py/parsehelper.c   |   4 +-
 py/runtime.c       |   2 +-
 stmhal/pyexec.c    |   2 +-
 unix/main.c        |   6 +-
 11 files changed, 123 insertions(+), 153 deletions(-)

diff --git a/bare-arm/main.c b/bare-arm/main.c
index 476a08ba0..3c187e5fb 100644
--- a/bare-arm/main.c
+++ b/bare-arm/main.c
@@ -32,7 +32,7 @@ void do_str(const char *src) {
     }
 
     // parse okay
-    qstr source_name = mp_lexer_source_name(lex);
+    qstr source_name = lex->source_name;
     mp_lexer_free(lex);
     mp_obj_t module_fun = mp_compile(pn, source_name, MP_EMIT_OPT_NONE, true);
 
diff --git a/py/builtinimport.c b/py/builtinimport.c
index c96a7d4ae..2910f8d97 100644
--- a/py/builtinimport.c
+++ b/py/builtinimport.c
@@ -127,7 +127,7 @@ STATIC void do_load(mp_obj_t module_obj, vstr_t *file) {
     }
 
     #if MICROPY_PY___FILE__
-    qstr source_name = mp_lexer_source_name(lex);
+    qstr source_name = lex->source_name;
     mp_store_attr(module_obj, MP_QSTR___file__, MP_OBJ_NEW_QSTR(source_name));
     #endif
 
diff --git a/py/lexer.c b/py/lexer.c
index a9444645a..a93d8ad0d 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -42,32 +42,10 @@
 // TODO seems that CPython allows NULL byte in the input stream
 // don't know if that's intentional or not, but we don't allow it
 
-struct _mp_lexer_t {
-    qstr source_name;           // name of source
-    void *stream_data;          // data for stream
-    mp_lexer_stream_next_byte_t stream_next_byte;   // stream callback to get next byte
-    mp_lexer_stream_close_t stream_close;           // stream callback to free
-
-    unichar chr0, chr1, chr2;   // current cached characters from source
-
-    mp_uint_t line;             // source line
-    mp_uint_t column;           // source column
-
-    mp_int_t emit_dent;             // non-zero when there are INDENT/DEDENT tokens to emit
-    mp_int_t nested_bracket_level;  // >0 when there are nested brackets over multiple lines
-
-    mp_uint_t alloc_indent_level;
-    mp_uint_t num_indent_level;
-    uint16_t *indent_level;
-
-    vstr_t vstr;
-    mp_token_t tok_cur;
-};
-
 mp_uint_t mp_optimise_value;
 
 // TODO replace with a call to a standard function
-bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
+STATIC bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
     mp_uint_t i = 0;
 
     while (i < len && *str == *strn) {
@@ -79,27 +57,6 @@ bool str_strn_equal(const char *str, const char *strn, mp_uint_t len) {
     return i == len && *str == 0;
 }
 
-#ifdef MICROPY_DEBUG_PRINTERS
-void mp_token_show(const mp_token_t *tok) {
-    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:" UINT_FMT, tok->src_line, tok->src_column, tok->kind, tok->str, tok->len);
-    if (tok->str != NULL && tok->len > 0) {
-        const byte *i = (const byte *)tok->str;
-        const byte *j = (const byte *)i + tok->len;
-        printf(" ");
-        while (i < j) {
-            unichar c = utf8_get_char(i);
-            i = utf8_next_char(i);
-            if (unichar_isprint(c)) {
-                printf("%c", c);
-            } else {
-                printf("?");
-            }
-        }
-    }
-    printf("\n");
-}
-#endif
-
 #define CUR_CHAR(lex) ((lex)->chr0)
 
 STATIC bool is_end(mp_lexer_t *lex) {
@@ -210,7 +167,7 @@ STATIC void next_char(mp_lexer_t *lex) {
     }
 }
 
-void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
+STATIC void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
     if (lex->num_indent_level >= lex->alloc_indent_level) {
         // TODO use m_renew_maybe and somehow indicate an error if it fails... probably by using MP_TOKEN_MEMORY_ERROR
         lex->indent_level = m_renew(uint16_t, lex->indent_level, lex->alloc_indent_level, lex->alloc_indent_level + MICROPY_ALLOC_LEXEL_INDENT_INC);
@@ -219,11 +176,11 @@ void indent_push(mp_lexer_t *lex, mp_uint_t indent) {
     lex->indent_level[lex->num_indent_level++] = indent;
 }
 
-mp_uint_t indent_top(mp_lexer_t *lex) {
+STATIC mp_uint_t indent_top(mp_lexer_t *lex) {
     return lex->indent_level[lex->num_indent_level - 1];
 }
 
-void indent_pop(mp_lexer_t *lex) {
+STATIC void indent_pop(mp_lexer_t *lex) {
     lex->num_indent_level -= 1;
 }
 
@@ -335,7 +292,10 @@ STATIC bool get_hex(mp_lexer_t *lex, mp_uint_t num_digits, mp_uint_t *result) {
     return true;
 }
 
-STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool first_token) {
+STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
+    // start new token text
+    vstr_reset(&lex->vstr);
+
     // skip white space and comments
     bool had_physical_newline = false;
     while (!is_end(lex)) {
@@ -355,12 +315,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
             next_char(lex);
             if (!is_physical_newline(lex)) {
                 // SyntaxError: unexpected character after line continuation character
-                tok->src_line = lex->line;
-                tok->src_column = lex->column;
-                tok->kind = MP_TOKEN_BAD_LINE_CONTINUATION;
-                vstr_reset(&lex->vstr);
-                tok->str = vstr_str(&lex->vstr);
-                tok->len = 0;
+                lex->tok_line = lex->line;
+                lex->tok_column = lex->column;
+                lex->tok_kind = MP_TOKEN_BAD_LINE_CONTINUATION;
                 return;
             } else {
                 next_char(lex);
@@ -371,29 +328,26 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
     }
 
     // set token source information
-    tok->src_line = lex->line;
-    tok->src_column = lex->column;
-
-    // start new token text
-    vstr_reset(&lex->vstr);
+    lex->tok_line = lex->line;
+    lex->tok_column = lex->column;
 
     if (first_token && lex->line == 1 && lex->column != 1) {
         // check that the first token is in the first column
         // if first token is not on first line, we get a physical newline and
         // this check is done as part of normal indent/dedent checking below
         // (done to get equivalence with CPython)
-        tok->kind = MP_TOKEN_INDENT;
+        lex->tok_kind = MP_TOKEN_INDENT;
 
     } else if (lex->emit_dent < 0) {
-        tok->kind = MP_TOKEN_DEDENT;
+        lex->tok_kind = MP_TOKEN_DEDENT;
         lex->emit_dent += 1;
 
     } else if (lex->emit_dent > 0) {
-        tok->kind = MP_TOKEN_INDENT;
+        lex->tok_kind = MP_TOKEN_INDENT;
         lex->emit_dent -= 1;
 
     } else if (had_physical_newline && lex->nested_bracket_level == 0) {
-        tok->kind = MP_TOKEN_NEWLINE;
+        lex->tok_kind = MP_TOKEN_NEWLINE;
 
         mp_uint_t num_spaces = lex->column - 1;
         lex->emit_dent = 0;
@@ -407,20 +361,20 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                 lex->emit_dent -= 1;
             }
             if (num_spaces != indent_top(lex)) {
-                tok->kind = MP_TOKEN_DEDENT_MISMATCH;
+                lex->tok_kind = MP_TOKEN_DEDENT_MISMATCH;
             }
         }
 
     } else if (is_end(lex)) {
         if (indent_top(lex) > 0) {
-            tok->kind = MP_TOKEN_NEWLINE;
+            lex->tok_kind = MP_TOKEN_NEWLINE;
             lex->emit_dent = 0;
             while (indent_top(lex) > 0) {
                 indent_pop(lex);
                 lex->emit_dent -= 1;
             }
         } else {
-            tok->kind = MP_TOKEN_END;
+            lex->tok_kind = MP_TOKEN_END;
         }
 
     } else if (is_char_or(lex, '\'', '\"')
@@ -451,9 +405,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
 
         // set token kind
         if (is_bytes) {
-            tok->kind = MP_TOKEN_BYTES;
+            lex->tok_kind = MP_TOKEN_BYTES;
         } else {
-            tok->kind = MP_TOKEN_STRING;
+            lex->tok_kind = MP_TOKEN_STRING;
         }
 
         // get first quoting character
@@ -566,14 +520,14 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
 
         // check we got the required end quotes
         if (n_closing < num_quotes) {
-            tok->kind = MP_TOKEN_LONELY_STRING_OPEN;
+            lex->tok_kind = MP_TOKEN_LONELY_STRING_OPEN;
         }
 
         // cut off the end quotes from the token text
         vstr_cut_tail_bytes(&lex->vstr, n_closing);
 
     } else if (is_head_of_identifier(lex)) {
-        tok->kind = MP_TOKEN_NAME;
+        lex->tok_kind = MP_TOKEN_NAME;
 
         // get first char
         vstr_add_char(&lex->vstr, CUR_CHAR(lex));
@@ -586,7 +540,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
         }
 
     } else if (is_digit(lex) || (is_char(lex, '.') && is_following_digit(lex))) {
-        tok->kind = MP_TOKEN_NUMBER;
+        lex->tok_kind = MP_TOKEN_NUMBER;
 
         // get first char
         vstr_add_char(&lex->vstr, CUR_CHAR(lex));
@@ -621,9 +575,9 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
             vstr_add_char(&lex->vstr, '.');
             next_char(lex);
             next_char(lex);
-            tok->kind = MP_TOKEN_ELLIPSIS;
+            lex->tok_kind = MP_TOKEN_ELLIPSIS;
         } else {
-            tok->kind = MP_TOKEN_DEL_PERIOD;
+            lex->tok_kind = MP_TOKEN_DEL_PERIOD;
         }
 
     } else {
@@ -645,7 +599,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
 
         if (*t == 0) {
             // didn't match any delimiter or operator characters
-            tok->kind = MP_TOKEN_INVALID;
+            lex->tok_kind = MP_TOKEN_INVALID;
 
         } else {
             // matched a delimiter or operator character
@@ -670,7 +624,7 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
                         next_char(lex);
                         tok_enc_index = t_index;
                     } else {
-                        tok->kind = MP_TOKEN_INVALID;
+                        lex->tok_kind = MP_TOKEN_INVALID;
                         goto tok_enc_no_match;
                     }
                     break;
@@ -692,37 +646,33 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, mp_token_t *tok, bool firs
             }
 
             // set token kind
-            tok->kind = tok_enc_kind[tok_enc_index];
+            lex->tok_kind = tok_enc_kind[tok_enc_index];
 
             tok_enc_no_match:
 
             // compute bracket level for implicit line joining
-            if (tok->kind == MP_TOKEN_DEL_PAREN_OPEN || tok->kind == MP_TOKEN_DEL_BRACKET_OPEN || tok->kind == MP_TOKEN_DEL_BRACE_OPEN) {
+            if (lex->tok_kind == MP_TOKEN_DEL_PAREN_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACKET_OPEN || lex->tok_kind == MP_TOKEN_DEL_BRACE_OPEN) {
                 lex->nested_bracket_level += 1;
-            } else if (tok->kind == MP_TOKEN_DEL_PAREN_CLOSE || tok->kind == MP_TOKEN_DEL_BRACKET_CLOSE || tok->kind == MP_TOKEN_DEL_BRACE_CLOSE) {
+            } else if (lex->tok_kind == MP_TOKEN_DEL_PAREN_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACKET_CLOSE || lex->tok_kind == MP_TOKEN_DEL_BRACE_CLOSE) {
                 lex->nested_bracket_level -= 1;
             }
         }
     }
 
-    // point token text to vstr buffer
-    tok->str = vstr_str(&lex->vstr);
-    tok->len = vstr_len(&lex->vstr);
-
     // check for keywords
-    if (tok->kind == MP_TOKEN_NAME) {
+    if (lex->tok_kind == MP_TOKEN_NAME) {
         // We check for __debug__ here and convert it to its value.  This is so
         // the parser gives a syntax error on, eg, x.__debug__.  Otherwise, we
         // need to check for this special token in many places in the compiler.
         // TODO improve speed of these string comparisons
         //for (mp_int_t i = 0; tok_kw[i] != NULL; i++) {
         for (mp_int_t i = 0; i < MP_ARRAY_SIZE(tok_kw); i++) {
-            if (str_strn_equal(tok_kw[i], tok->str, tok->len)) {
+            if (str_strn_equal(tok_kw[i], lex->vstr.buf, lex->vstr.len)) {
                 if (i == MP_ARRAY_SIZE(tok_kw) - 1) {
                     // tok_kw[MP_ARRAY_SIZE(tok_kw) - 1] == "__debug__"
-                    tok->kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
+                    lex->tok_kind = (mp_optimise_value == 0 ? MP_TOKEN_KW_TRUE : MP_TOKEN_KW_FALSE);
                 } else {
-                    tok->kind = MP_TOKEN_KW_FALSE + i;
+                    lex->tok_kind = MP_TOKEN_KW_FALSE + i;
                 }
                 break;
             }
@@ -782,7 +732,7 @@ mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_
     }
 
     // preload first token
-    mp_lexer_next_token_into(lex, &lex->tok_cur, true);
+    mp_lexer_next_token_into(lex, true);
 
     return lex;
 }
@@ -798,18 +748,27 @@ void mp_lexer_free(mp_lexer_t *lex) {
     }
 }
 
-qstr mp_lexer_source_name(mp_lexer_t *lex) {
-    return lex->source_name;
-}
-
 void mp_lexer_to_next(mp_lexer_t *lex) {
-    mp_lexer_next_token_into(lex, &lex->tok_cur, false);
+    mp_lexer_next_token_into(lex, false);
 }
 
-const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex) {
-    return &lex->tok_cur;
-}
-
-bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind) {
-    return lex->tok_cur.kind == kind;
+#if MICROPY_DEBUG_PRINTERS
+void mp_lexer_show_token(const mp_lexer_t *lex) {
+    printf("(" UINT_FMT ":" UINT_FMT ") kind:%u str:%p len:%u", lex->tok_line, lex->tok_column, lex->tok_kind, lex->vstr.buf, lex->vstr.len);
+    if (lex->vstr.len > 0) {
+        const byte *i = (const byte *)lex->vstr.buf;
+        const byte *j = (const byte *)i + lex->vstr.len;
+        printf(" ");
+        while (i < j) {
+            unichar c = utf8_get_char(i);
+            i = utf8_next_char(i);
+            if (unichar_isprint(c)) {
+                printf("%c", c);
+            } else {
+                printf("?");
+            }
+        }
+    }
+    printf("\n");
 }
+#endif
diff --git a/py/lexer.h b/py/lexer.h
index d70735f6d..c2f621d4c 100644
--- a/py/lexer.h
+++ b/py/lexer.h
@@ -130,15 +130,6 @@ typedef enum _mp_token_kind_t {
     MP_TOKEN_DEL_MINUS_MORE,
 } mp_token_kind_t;
 
-typedef struct _mp_token_t {
-    mp_uint_t src_line;         // source line
-    mp_uint_t src_column;       // source column
-
-    mp_token_kind_t kind;       // kind of token
-    const char *str;            // string of token (valid only while this token is current token)
-    mp_uint_t len;              // (byte) length of string of token
-} mp_token_t;
-
 // the next-byte function must return the next byte in the stream
 // it must return MP_LEXER_EOF if end of stream
 // it can be called again after returning MP_LEXER_EOF, and in that case must return MP_LEXER_EOF
@@ -146,21 +137,38 @@ typedef struct _mp_token_t {
 typedef mp_uint_t (*mp_lexer_stream_next_byte_t)(void*);
 typedef void (*mp_lexer_stream_close_t)(void*);
 
-typedef struct _mp_lexer_t mp_lexer_t;
+// this data structure is exposed for efficiency
+// public members are: source_name, tok_line, tok_column, tok_kind, vstr
+typedef struct _mp_lexer_t {
+    qstr source_name;           // name of source
+    void *stream_data;          // data for stream
+    mp_lexer_stream_next_byte_t stream_next_byte;   // stream callback to get next byte
+    mp_lexer_stream_close_t stream_close;           // stream callback to free
 
-void mp_token_show(const mp_token_t *tok);
+    unichar chr0, chr1, chr2;   // current cached characters from source
+
+    mp_uint_t line;             // current source line
+    mp_uint_t column;           // current source column
+
+    mp_int_t emit_dent;             // non-zero when there are INDENT/DEDENT tokens to emit
+    mp_int_t nested_bracket_level;  // >0 when there are nested brackets over multiple lines
+
+    mp_uint_t alloc_indent_level;
+    mp_uint_t num_indent_level;
+    uint16_t *indent_level;
+
+    mp_uint_t tok_line;         // token source line
+    mp_uint_t tok_column;       // token source column
+    mp_token_kind_t tok_kind;   // token kind
+    vstr_t vstr;                // token data
+} mp_lexer_t;
 
 mp_lexer_t *mp_lexer_new(qstr src_name, void *stream_data, mp_lexer_stream_next_byte_t stream_next_byte, mp_lexer_stream_close_t stream_close);
 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
 
 void mp_lexer_free(mp_lexer_t *lex);
-qstr mp_lexer_source_name(mp_lexer_t *lex);
 void mp_lexer_to_next(mp_lexer_t *lex);
-const mp_token_t *mp_lexer_cur(const mp_lexer_t *lex);
-bool mp_lexer_is_kind(mp_lexer_t *lex, mp_token_kind_t kind);
-
-bool mp_lexer_show_error_pythonic_prefix(mp_lexer_t *lex);
-bool mp_lexer_show_error_pythonic(mp_lexer_t *lex, const char *msg);
+void mp_lexer_show_token(const mp_lexer_t *lex);
 
 /******************************************************************/
 // platform specific import function; must be implemented for a specific port
diff --git a/py/lexerstr.c b/py/lexerstr.c
index a1f7ce41d..c3456b9bd 100644
--- a/py/lexerstr.c
+++ b/py/lexerstr.c
@@ -24,6 +24,8 @@
  * THE SOFTWARE.
  */
 
+#include <stdint.h>
+
 #include "mpconfig.h"
 #include "misc.h"
 #include "qstr.h"
diff --git a/py/lexerunix.c b/py/lexerunix.c
index 52eac9eda..8e3241ad0 100644
--- a/py/lexerunix.c
+++ b/py/lexerunix.c
@@ -30,6 +30,7 @@
 #if MICROPY_HELPER_LEXER_UNIX
 
 #include <stdio.h>
+#include <stdint.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/stat.h>
diff --git a/py/parse.c b/py/parse.c
index 092f6e65f..6912920e3 100644
--- a/py/parse.c
+++ b/py/parse.c
@@ -155,7 +155,7 @@ STATIC void push_rule_from_arg(parser_t *parser, mp_uint_t arg) {
     assert((arg & RULE_ARG_KIND_MASK) == RULE_ARG_RULE || (arg & RULE_ARG_KIND_MASK) == RULE_ARG_OPT_RULE);
     mp_uint_t rule_id = arg & RULE_ARG_ARG_MASK;
     assert(rule_id < RULE_maximum_number_of);
-    push_rule(parser, mp_lexer_cur(parser->lexer)->src_line, rules[rule_id], 0);
+    push_rule(parser, parser->lexer->tok_line, rules[rule_id], 0);
 }
 
 STATIC void pop_rule(parser_t *parser, const rule_t **rule, mp_uint_t *arg_i, mp_uint_t *src_line) {
@@ -298,17 +298,17 @@ STATIC void push_result_string(parser_t *parser, mp_uint_t src_line, const char
     push_result_node(parser, (mp_parse_node_t)pn);
 }
 
-STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
-    const mp_token_t *tok = mp_lexer_cur(lex);
+STATIC void push_result_token(parser_t *parser) {
     mp_parse_node_t pn;
-    if (tok->kind == MP_TOKEN_NAME) {
-        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_ID, qstr_from_strn(tok->str, tok->len));
-    } else if (tok->kind == MP_TOKEN_NUMBER) {
+    mp_lexer_t *lex = parser->lexer;
+    if (lex->tok_kind == MP_TOKEN_NAME) {
+        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_ID, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
+    } else if (lex->tok_kind == MP_TOKEN_NUMBER) {
         bool dec = false;
         bool small_int = true;
         mp_int_t int_val = 0;
-        mp_uint_t len = tok->len;
-        const char *str = tok->str;
+        mp_uint_t len = lex->vstr.len;
+        const char *str = lex->vstr.buf;
         mp_uint_t base = 0;
         mp_uint_t i = mp_parse_num_base(str, len, &base);
         bool overflow = false;
@@ -343,29 +343,29 @@ STATIC void push_result_token(parser_t *parser, const mp_lexer_t *lex) {
         } else {
             pn = mp_parse_node_new_leaf(MP_PARSE_NODE_INTEGER, qstr_from_strn(str, len));
         }
-    } else if (tok->kind == MP_TOKEN_STRING) {
+    } else if (lex->tok_kind == MP_TOKEN_STRING) {
         // Don't automatically intern all strings.  doc strings (which are usually large)
         // will be discarded by the compiler, and so we shouldn't intern them.
         qstr qst = MP_QSTR_NULL;
-        if (tok->len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
+        if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
             // intern short strings
-            qst = qstr_from_strn(tok->str, tok->len);
+            qst = qstr_from_strn(lex->vstr.buf, lex->vstr.len);
         } else {
             // check if this string is already interned
-            qst = qstr_find_strn(tok->str, tok->len);
+            qst = qstr_find_strn(lex->vstr.buf, lex->vstr.len);
         }
         if (qst != MP_QSTR_NULL) {
             // qstr exists, make a leaf node
             pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
         } else {
             // not interned, make a node holding a pointer to the string data
-            push_result_string(parser, mp_lexer_cur(lex)->src_line, tok->str, tok->len);
+            push_result_string(parser, lex->tok_line, lex->vstr.buf, lex->vstr.len);
             return;
         }
-    } else if (tok->kind == MP_TOKEN_BYTES) {
-        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(tok->str, tok->len));
+    } else if (lex->tok_kind == MP_TOKEN_BYTES) {
+        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_BYTES, qstr_from_strn(lex->vstr.buf, lex->vstr.len));
     } else {
-        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, tok->kind);
+        pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
     }
     push_result_node(parser, pn);
 }
@@ -414,7 +414,7 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
         case MP_PARSE_EVAL_INPUT: top_level_rule = RULE_eval_input; break;
         default: top_level_rule = RULE_file_input;
     }
-    push_rule(&parser, mp_lexer_cur(lex)->src_line, rules[top_level_rule], 0);
+    push_rule(&parser, lex->tok_line, rules[top_level_rule], 0);
 
     // parse!
 
@@ -454,8 +454,8 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
                 for (; i < n - 1; ++i) {
                     switch (rule->arg[i] & RULE_ARG_KIND_MASK) {
                         case RULE_ARG_TOK:
-                            if (mp_lexer_is_kind(lex, rule->arg[i] & RULE_ARG_ARG_MASK)) {
-                                push_result_token(&parser, lex);
+                            if (lex->tok_kind == (rule->arg[i] & RULE_ARG_ARG_MASK)) {
+                                push_result_token(&parser);
                                 mp_lexer_to_next(lex);
                                 goto next_rule;
                             }
@@ -469,8 +469,8 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
                     }
                 }
                 if ((rule->arg[i] & RULE_ARG_KIND_MASK) == RULE_ARG_TOK) {
-                    if (mp_lexer_is_kind(lex, rule->arg[i] & RULE_ARG_ARG_MASK)) {
-                        push_result_token(&parser, lex);
+                    if (lex->tok_kind == (rule->arg[i] & RULE_ARG_ARG_MASK)) {
+                        push_result_token(&parser);
                         mp_lexer_to_next(lex);
                     } else {
                         backtrack = true;
@@ -507,10 +507,10 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
                         case RULE_ARG_TOK:
                             // need to match a token
                             tok_kind = rule->arg[i] & RULE_ARG_ARG_MASK;
-                            if (mp_lexer_is_kind(lex, tok_kind)) {
+                            if (lex->tok_kind == tok_kind) {
                                 // matched token
                                 if (tok_kind == MP_TOKEN_NAME) {
-                                    push_result_token(&parser, lex);
+                                    push_result_token(&parser);
                                 }
                                 mp_lexer_to_next(lex);
                             } else {
@@ -657,11 +657,11 @@ mp_parse_node_t mp_parse(mp_lexer_t *lex, mp_parse_input_kind_t input_kind, mp_p
                         mp_uint_t arg = rule->arg[i & 1 & n];
                         switch (arg & RULE_ARG_KIND_MASK) {
                             case RULE_ARG_TOK:
-                                if (mp_lexer_is_kind(lex, arg & RULE_ARG_ARG_MASK)) {
+                                if (lex->tok_kind == (arg & RULE_ARG_ARG_MASK)) {
                                     if (i & 1 & n) {
                                         // separators which are tokens are not pushed to result stack
                                     } else {
-                                        push_result_token(&parser, lex);
+                                        push_result_token(&parser);
                                     }
                                     mp_lexer_to_next(lex);
                                     // got element of list, so continue parsing list
@@ -722,7 +722,7 @@ memory_error:
     }
 
     // check we are at the end of the token stream
-    if (!mp_lexer_is_kind(lex, MP_TOKEN_END)) {
+    if (lex->tok_kind != MP_TOKEN_END) {
         goto syntax_error;
     }
 
@@ -745,9 +745,9 @@ finished:
     return result;
 
 syntax_error:
-    if (mp_lexer_is_kind(lex, MP_TOKEN_INDENT)) {
+    if (lex->tok_kind == MP_TOKEN_INDENT) {
         *parse_error_kind_out = MP_PARSE_ERROR_UNEXPECTED_INDENT;
-    } else if (mp_lexer_is_kind(lex, MP_TOKEN_DEDENT_MISMATCH)) {
+    } else if (lex->tok_kind == MP_TOKEN_DEDENT_MISMATCH) {
         *parse_error_kind_out = MP_PARSE_ERROR_UNMATCHED_UNINDENT;
     } else {
         *parse_error_kind_out = MP_PARSE_ERROR_INVALID_SYNTAX;
@@ -755,7 +755,7 @@ syntax_error:
         // debugging: print the rule name that failed and the token
         printf("rule: %s\n", rule->rule_name);
 #if MICROPY_DEBUG_PRINTERS
-        mp_token_show(mp_lexer_cur(lex));
+        mp_lexer_show_token(lex);
 #endif
 #endif
     }
diff --git a/py/parsehelper.c b/py/parsehelper.c
index a6c54e8fc..f30471067 100644
--- a/py/parsehelper.c
+++ b/py/parsehelper.c
@@ -43,7 +43,7 @@
 #define STR_INVALID_SYNTAX "invalid syntax"
 
 void mp_parse_show_exception(mp_lexer_t *lex, mp_parse_error_kind_t parse_error_kind) {
-    printf("  File \"%s\", line " UINT_FMT ", column " UINT_FMT "\n", qstr_str(mp_lexer_source_name(lex)), mp_lexer_cur(lex)->src_line, mp_lexer_cur(lex)->src_column);
+    printf("  File \"%s\", line " UINT_FMT ", column " UINT_FMT "\n", qstr_str(lex->source_name), lex->tok_line, lex->tok_column);
     switch (parse_error_kind) {
         case MP_PARSE_ERROR_MEMORY:
             printf("MemoryError: %s\n", STR_MEMORY);
@@ -88,7 +88,7 @@ mp_obj_t mp_parse_make_exception(mp_lexer_t *lex, mp_parse_error_kind_t parse_er
 
     // add traceback to give info about file name and location
     // we don't have a 'block' name, so just pass the NULL qstr to indicate this
-    mp_obj_exception_add_traceback(exc, mp_lexer_source_name(lex), mp_lexer_cur(lex)->src_line, MP_QSTR_NULL);
+    mp_obj_exception_add_traceback(exc, lex->source_name, lex->tok_line, MP_QSTR_NULL);
 
     return exc;
 }
diff --git a/py/runtime.c b/py/runtime.c
index c0ae4726f..463e325d2 100644
--- a/py/runtime.c
+++ b/py/runtime.c
@@ -1258,7 +1258,7 @@ mp_obj_t mp_parse_compile_execute(mp_lexer_t *lex, mp_parse_input_kind_t parse_i
         nlr_raise(exc);
     }
 
-    qstr source_name = mp_lexer_source_name(lex);
+    qstr source_name = lex->source_name;
     mp_lexer_free(lex);
 
     // save context and set new context
diff --git a/stmhal/pyexec.c b/stmhal/pyexec.c
index 6bf8009a1..36a496aa1 100644
--- a/stmhal/pyexec.c
+++ b/stmhal/pyexec.c
@@ -66,7 +66,7 @@ STATIC int parse_compile_execute(mp_lexer_t *lex, mp_parse_input_kind_t input_ki
 
     mp_parse_error_kind_t parse_error_kind;
     mp_parse_node_t pn = mp_parse(lex, input_kind, &parse_error_kind);
-    qstr source_name = mp_lexer_source_name(lex);
+    qstr source_name = lex->source_name;
 
     // check for parse error
     if (pn == MP_PARSE_NODE_NULL) {
diff --git a/unix/main.c b/unix/main.c
index 2ade40b18..6733bbdda 100644
--- a/unix/main.c
+++ b/unix/main.c
@@ -114,8 +114,8 @@ STATIC int execute_from_lexer(mp_lexer_t *lex, mp_parse_input_kind_t input_kind,
 
     if (0) {
         // just tokenise
-        while (!mp_lexer_is_kind(lex, MP_TOKEN_END)) {
-            mp_token_show(mp_lexer_cur(lex));
+        while (lex->tok_kind != MP_TOKEN_END) {
+            mp_lexer_show_token(lex);
             mp_lexer_to_next(lex);
         }
         mp_lexer_free(lex);
@@ -132,7 +132,7 @@ STATIC int execute_from_lexer(mp_lexer_t *lex, mp_parse_input_kind_t input_kind,
         return 1;
     }
 
-    qstr source_name = mp_lexer_source_name(lex);
+    qstr source_name = lex->source_name;
     #if MICROPY_PY___FILE__
     if (input_kind == MP_PARSE_FILE_INPUT) {
         mp_store_global(MP_QSTR___file__, MP_OBJ_NEW_QSTR(source_name));