diff --git a/py/lexer.c b/py/lexer.c index 9587f6b16..5e911a1a2 100644 --- a/py/lexer.c +++ b/py/lexer.c @@ -527,14 +527,14 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring) vstr_cut_tail_bytes(&lex->vstr, n_closing); } +// This function returns whether it has crossed a newline or not. +// It therefore always return true if stop_at_newline is true STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { - bool had_physical_newline = false; while (!is_end(lex)) { if (is_physical_newline(lex)) { if (stop_at_newline && lex->nested_bracket_level == 0) { - break; + return true; } - had_physical_newline = true; next_char(lex); } else if (is_whitespace(lex)) { next_char(lex); @@ -543,16 +543,16 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) { while (!is_end(lex) && !is_physical_newline(lex)) { next_char(lex); } - // had_physical_newline will be set on next loop + // will return true on next loop } else if (is_char_and(lex, '\\', '\n')) { - // line-continuation, so don't set had_physical_newline + // line-continuation, so don't return true next_char(lex); next_char(lex); } else { break; } } - return had_physical_newline; + return false; } void mp_lexer_to_next(mp_lexer_t *lex) { @@ -577,7 +577,10 @@ void mp_lexer_to_next(mp_lexer_t *lex) { vstr_reset(&lex->vstr); // skip white space and comments - bool had_physical_newline = skip_whitespace(lex, false); + // set the newline tokens at the line and column of the preceding line: + // only advance on the pointer until a new line is crossed, save the + // line and column, and then readvance it + bool had_physical_newline = skip_whitespace(lex, true); // set token source information lex->tok_line = lex->line; @@ -591,7 +594,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) { lex->tok_kind = MP_TOKEN_INDENT; lex->emit_dent -= 1; - } else if (had_physical_newline && lex->nested_bracket_level == 0) { + } else if (had_physical_newline) { + // The cursor is at the end of the previous line, pointing to a + // physical newline. Skip any remaining whitespace, comments, and + // newlines. + skip_whitespace(lex, false); + lex->tok_kind = MP_TOKEN_NEWLINE; size_t num_spaces = lex->column - 1; @@ -862,9 +870,10 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) { // preload first token mp_lexer_to_next(lex); - // Check that the first token is in the first column. If it's not then we - // convert the token kind to INDENT so that the parser gives a syntax error. - if (lex->tok_column != 1) { + // Check that the first token is in the first column unless it is a + // newline. Otherwise we convert the token kind to INDENT so that + // the parser gives a syntax error. + if (lex->tok_column != 1 && lex->tok_kind != MP_TOKEN_NEWLINE) { lex->tok_kind = MP_TOKEN_INDENT; } diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp index 3049267c0..6ec553b8a 100644 --- a/tests/cmdline/cmd_parsetree.py.exp +++ b/tests/cmdline/cmd_parsetree.py.exp @@ -1,5 +1,5 @@ ---------------- -[ 4] \(rule\|file_input_2\)(1) (n=10) +[ 1] file_input_2(1) (n=10) tok(6) [ 4] \(rule\|for_stmt\)(22) (n=4) id(i)