py/lexer: Change token position for new lines.

Set the position of new line tokens as the end of the preceding line instead of the beginning of the next line. This is done by first moving the pointer to the end of the current line to skip any whitespace, recording the position for the token, then finally skipping any remaining newlines, comments and whitespace. The previous behavior was to skip every new line and whitespace, including the indent of the next line, before recording the token position. (Note that both lex->emit_dent and lex->nested_bracket_level equal 0 if had_physical_newline == true, which allows simplifying the if-logic for MP_TOKEN_NEWLINE.) And update the cmd_parsetree.py test expected output, because the position of the new-line token has changed. Fixes issue #12792. Signed-off-by: Mathieu Serandour <mathieu.serandour@numworks.fr>
2023-10-24 15:54:09 +02:00 · 2023-10-24 15:54:09 +02:00 · c85db05244
parent 9a4d4db3a1
commit c85db05244
2 changed files with 21 additions and 12 deletions
--- a/py/lexer.c
+++ b/py/lexer.c
@ -527,14 +527,14 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
    vstr_cut_tail_bytes(&lex->vstr, n_closing);
 }

+// Returns true only when it stops at a physical newline, which requires
+// stop_at_newline to be true and nested_bracket_level to be 0; else false.
 STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
-    bool had_physical_newline = false;
    while (!is_end(lex)) {
        if (is_physical_newline(lex)) {
            if (stop_at_newline && lex->nested_bracket_level == 0) {
-                break;
+                return true;
            }
-            had_physical_newline = true;
            next_char(lex);
        } else if (is_whitespace(lex)) {
            next_char(lex);
@ -543,16 +543,16 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
            while (!is_end(lex) && !is_physical_newline(lex)) {
                next_char(lex);
            }
-            // had_physical_newline will be set on next loop
+            // will return true on next loop
        } else if (is_char_and(lex, '\\', '\n')) {
-            // line-continuation, so don't set had_physical_newline
+            // line-continuation, so don't return true
            next_char(lex);
            next_char(lex);
        } else {
            break;
        }
    }
-    return had_physical_newline;
+    return false;
 }

 void mp_lexer_to_next(mp_lexer_t *lex) {
@ -577,7 +577,10 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
    vstr_reset(&lex->vstr);

    // skip white space and comments
-    bool had_physical_newline = skip_whitespace(lex, false);
+    // set the newline tokens at the line and column of the preceding line:
+    // only advance the pointer until a physical newline is reached, save
+    // the line and column, and then advance it again past the newline(s)
+    bool had_physical_newline = skip_whitespace(lex, true);

    // set token source information
    lex->tok_line = lex->line;
@ -591,7 +594,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
        lex->tok_kind = MP_TOKEN_INDENT;
        lex->emit_dent -= 1;

-    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
+    } else if (had_physical_newline) {
+        // The cursor is at the end of the previous line, pointing to a
+        // physical newline. Skip any remaining whitespace, comments, and
+        // newlines.
+        skip_whitespace(lex, false);
+
        lex->tok_kind = MP_TOKEN_NEWLINE;

        size_t num_spaces = lex->column - 1;
@ -862,9 +870,10 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
    // preload first token
    mp_lexer_to_next(lex);

-    // Check that the first token is in the first column.  If it's not then we
-    // convert the token kind to INDENT so that the parser gives a syntax error.
-    if (lex->tok_column != 1) {
+    // Check that the first token is in the first column unless it is a
+    // newline. Otherwise we convert the token kind to INDENT so that
+    // the parser gives a syntax error.
+    if (lex->tok_column != 1 && lex->tok_kind != MP_TOKEN_NEWLINE) {
        lex->tok_kind = MP_TOKEN_INDENT;
    }

--- a/tests/cmdline/cmd_parsetree.py.exp
+++ b/tests/cmdline/cmd_parsetree.py.exp
@ -1,5 +1,5 @@
 ----------------
-[   4] \(rule\|file_input_2\)(1) (n=10)
+[   1] file_input_2(1) (n=10)
         tok(6)
 [   4]   \(rule\|for_stmt\)(22) (n=4)
           id(i)