From c85db05244ef6185fbb3c218c508ddd179830942 Mon Sep 17 00:00:00 2001 From: Mathieu Serandour Date: Tue, 24 Oct 2023 15:54:09 +0200 Subject: [PATCH] py/lexer: Change token position for new lines. Set the position of new line tokens as the end of the preceding line instead of the beginning of the next line. This is done by first moving the pointer to the end of the current line to skip any whitespace, record the position for the token, then finaly skip any other line and whitespace. The previous behavior was to skip every new line and whitespace, including the indent of the next line, before recording the token position. (Note that both lex->emit_dent and lex->nested_bracket_level equal 0 if had_physical_newline == true, which allows simplifying the if-logic for MP_TOKEN_NEWLINE.) And update the cmd_parsetree.py test expected output, because the position of the new-line token has changed. Fixes issue #12792. Signed-off-by: Mathieu Serandour --- py/lexer.c | 31 +++++++++++++++++++----------- tests/cmdline/cmd_parsetree.py.exp | 2 +- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/py/lexer.c b/py/lexer.c index 9587f6b16..5e911a1a2 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -527,14 +527,14 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) vstr_cut_tail_bytes(&lex->vstr, n_closing); } +// This function returns whether it has crossed a newline or not. +// It therefore always return true if stop_at_newline is true STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { - bool had_physical_newline = false; while (!is_end(lex)) { if (is_physical_newline(lex)) { if (stop_at_newline && lex->nested_bracket_level == 0) { - break; + return true; } - had_physical_newline = true; next_char(lex); } else if (is_whitespace(lex)) { next_char(lex); @@ -543,16 +543,16 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { while (!is_end(lex) && !is_physical_newline(lex)) { next_char(lex); } - // had_physical_newline will be set on next loop + // will return true on next loop } else if (is_char_and(lex, '\\', '\n')) { - // line-continuation, so don't set had_physical_newline + // line-continuation, so don't return true next_char(lex); next_char(lex); } else { break; } } - return had_physical_newline; + return false; } void mp_lexer_to_next(mp_lexer_t *lex) { @@ -577,7 +577,10 @@ void mp_lexer_to_next(mp_lexer_t *lex) { vstr_reset(&lex->vstr); // skip white space and comments - bool had_physical_newline = skip_whitespace(lex, false); + // set the newline tokens at the line and column of the preceding line: + // only advance on the pointer until a new line is crossed, save the + // line and column, and then readvance it + bool had_physical_newline = skip_whitespace(lex, true); // set token source information lex->tok_line = lex->line; @@ -591,7 +594,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) { lex->tok_kind = MP_TOKEN_INDENT; lex->emit_dent -= 1; - } else if (had_physical_newline && lex->nested_bracket_level == 0) { + } else if (had_physical_newline) { + // The cursor is at the end of the previous line, pointing to a + // physical newline. Skip any remaining whitespace, comments, and + // newlines. + skip_whitespace(lex, false); + lex->tok_kind = MP_TOKEN_NEWLINE; size_t num_spaces = lex->column - 1; @@ -862,9 +870,10 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { // preload first token mp_lexer_to_next(lex); - // Check that the first token is in the first column. If it's not then we - // convert the token kind to INDENT so that the parser gives a syntax error. - if (lex->tok_column != 1) { + // Check that the first token is in the first column unless it is a + // newline. Otherwise we convert the token kind to INDENT so that + // the parser gives a syntax error. + if (lex->tok_column != 1 && lex->tok_kind != MP_TOKEN_NEWLINE) { lex->tok_kind = MP_TOKEN_INDENT; } diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp index 3049267c0..6ec553b8a 100644 --- a/tests/cmdline/cmd_parsetree.py.exp +++ b/tests/cmdline/cmd_parsetree.py.exp @@ -1,5 +1,5 @@ ---------------- -[ 4] \(rule\|file_input_2\)(1) (n=10) +[ 1] file_input_2(1) (n=10) tok(6) [ 4] \(rule\|for_stmt\)(22) (n=4) id(i)