py/lexer: Change token position for new lines.

Set the position of new line tokens as the end of the preceding line
instead of the beginning of the next line.  This is done by first moving
the pointer to the end of the current line to skip any whitespace, recording
the position for the token, and then finally skipping any other lines and
whitespace.

The previous behavior was to skip every new line and whitespace, including
the indent of the next line, before recording the token position.

(Note that both lex->emit_dent and lex->nested_bracket_level equal 0 if
had_physical_newline == true, which allows simplifying the if-logic for
MP_TOKEN_NEWLINE.)

Also update the expected output of the cmd_parsetree.py test, because the
position of the new-line token has changed.

Fixes issue #12792.

Signed-off-by: Mathieu Serandour <mathieu.serandour@numworks.fr>
This commit is contained in:
Mathieu Serandour 2023-10-24 15:54:09 +02:00 committed by Damien George
parent 9a4d4db3a1
commit c85db05244
2 changed files with 21 additions and 12 deletions

View File

@ -527,14 +527,14 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
vstr_cut_tail_bytes(&lex->vstr, n_closing);
}
// This function returns whether it has crossed a newline or not.
// It therefore always returns true if stop_at_newline is true.
STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
bool had_physical_newline = false;
while (!is_end(lex)) {
if (is_physical_newline(lex)) {
if (stop_at_newline && lex->nested_bracket_level == 0) {
break;
return true;
}
had_physical_newline = true;
next_char(lex);
} else if (is_whitespace(lex)) {
next_char(lex);
@ -543,16 +543,16 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
while (!is_end(lex) && !is_physical_newline(lex)) {
next_char(lex);
}
// had_physical_newline will be set on next loop
// will return true on next loop
} else if (is_char_and(lex, '\\', '\n')) {
// line-continuation, so don't set had_physical_newline
// line-continuation, so don't return true
next_char(lex);
next_char(lex);
} else {
break;
}
}
return had_physical_newline;
return false;
}
void mp_lexer_to_next(mp_lexer_t *lex) {
@ -577,7 +577,10 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
vstr_reset(&lex->vstr);
// skip white space and comments
bool had_physical_newline = skip_whitespace(lex, false);
// set the newline tokens at the line and column of the preceding line:
// only advance on the pointer until a new line is crossed, save the
// line and column, and then readvance it
bool had_physical_newline = skip_whitespace(lex, true);
// set token source information
lex->tok_line = lex->line;
@ -591,7 +594,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
lex->tok_kind = MP_TOKEN_INDENT;
lex->emit_dent -= 1;
} else if (had_physical_newline && lex->nested_bracket_level == 0) {
} else if (had_physical_newline) {
// The cursor is at the end of the previous line, pointing to a
// physical newline. Skip any remaining whitespace, comments, and
// newlines.
skip_whitespace(lex, false);
lex->tok_kind = MP_TOKEN_NEWLINE;
size_t num_spaces = lex->column - 1;
@ -862,9 +870,10 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
// preload first token
mp_lexer_to_next(lex);
// Check that the first token is in the first column. If it's not then we
// convert the token kind to INDENT so that the parser gives a syntax error.
if (lex->tok_column != 1) {
// Check that the first token is in the first column unless it is a
// newline. Otherwise we convert the token kind to INDENT so that
// the parser gives a syntax error.
if (lex->tok_column != 1 && lex->tok_kind != MP_TOKEN_NEWLINE) {
lex->tok_kind = MP_TOKEN_INDENT;
}

View File

@ -1,5 +1,5 @@
----------------
[ 4] \(rule\|file_input_2\)(1) (n=10)
[ 1] file_input_2(1) (n=10)
tok(6)
[ 4] \(rule\|for_stmt\)(22) (n=4)
id(i)