From c85db05244ef6185fbb3c218c508ddd179830942 Mon Sep 17 00:00:00 2001
From: Mathieu Serandour <mathieu.serandour@numworks.fr>
Date: Tue, 24 Oct 2023 15:54:09 +0200
Subject: [PATCH] py/lexer: Change token position for new lines.

Set the position of new line tokens as the end of the preceding line
instead of the beginning of the next line.  This is done by first moving
the pointer to the end of the current line to skip any whitespace, then
recording the position for the token, and finally skipping any remaining
new lines and whitespace.

The previous behavior was to skip every new line and whitespace, including
the indent of the next line, before recording the token position.

(Note that both lex->emit_dent and lex->nested_bracket_level equal 0 if
had_physical_newline == true, which allows simplifying the if-logic for
MP_TOKEN_NEWLINE.)

And update the cmd_parsetree.py test expected output, because the position
of the new-line token has changed.

Fixes issue #12792.

Signed-off-by: Mathieu Serandour <mathieu.serandour@numworks.fr>
---
 py/lexer.c                         | 31 +++++++++++++++++++-----------
 tests/cmdline/cmd_parsetree.py.exp |  2 +-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/py/lexer.c b/py/lexer.c
index 9587f6b16..5e911a1a2 100644
--- a/py/lexer.c
+++ b/py/lexer.c
@@ -527,14 +527,14 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
     vstr_cut_tail_bytes(&lex->vstr, n_closing);
 }
 
+// Returns true only if it stopped at a physical newline (which requires
+// stop_at_newline to be true and no open brackets); otherwise returns false.
 STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
-    bool had_physical_newline = false;
     while (!is_end(lex)) {
         if (is_physical_newline(lex)) {
             if (stop_at_newline && lex->nested_bracket_level == 0) {
-                break;
+                return true;
             }
-            had_physical_newline = true;
             next_char(lex);
         } else if (is_whitespace(lex)) {
             next_char(lex);
@@ -543,16 +543,16 @@ STATIC bool skip_whitespace(mp_lexer_t *lex, bool stop_at_newline) {
             while (!is_end(lex) && !is_physical_newline(lex)) {
                 next_char(lex);
             }
-            // had_physical_newline will be set on next loop
+            // will return true on next loop
         } else if (is_char_and(lex, '\\', '\n')) {
-            // line-continuation, so don't set had_physical_newline
+            // line-continuation, so don't return true
             next_char(lex);
             next_char(lex);
         } else {
             break;
         }
     }
-    return had_physical_newline;
+    return false;
 }
 
 void mp_lexer_to_next(mp_lexer_t *lex) {
@@ -577,7 +577,10 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
     vstr_reset(&lex->vstr);
 
     // skip white space and comments
-    bool had_physical_newline = skip_whitespace(lex, false);
+    // set the newline tokens at the line and column of the preceding line:
+    // only advance the pointer until a new line is reached, save the line
+    // and column, and then advance it again
+    bool had_physical_newline = skip_whitespace(lex, true);
 
     // set token source information
     lex->tok_line = lex->line;
@@ -591,7 +594,12 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         lex->tok_kind = MP_TOKEN_INDENT;
         lex->emit_dent -= 1;
 
-    } else if (had_physical_newline && lex->nested_bracket_level == 0) {
+    } else if (had_physical_newline) {
+        // The cursor is at the end of the previous line, pointing to a
+        // physical newline. Skip any remaining whitespace, comments, and
+        // newlines.
+        skip_whitespace(lex, false);
+
         lex->tok_kind = MP_TOKEN_NEWLINE;
 
         size_t num_spaces = lex->column - 1;
@@ -862,9 +870,10 @@ mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader) {
     // preload first token
     mp_lexer_to_next(lex);
 
-    // Check that the first token is in the first column.  If it's not then we
-    // convert the token kind to INDENT so that the parser gives a syntax error.
-    if (lex->tok_column != 1) {
+    // Check that the first token is in the first column unless it is a
+    // newline. Otherwise we convert the token kind to INDENT so that
+    // the parser gives a syntax error.
+    if (lex->tok_column != 1 && lex->tok_kind != MP_TOKEN_NEWLINE) {
         lex->tok_kind = MP_TOKEN_INDENT;
     }
 
diff --git a/tests/cmdline/cmd_parsetree.py.exp b/tests/cmdline/cmd_parsetree.py.exp
index 3049267c0..6ec553b8a 100644
--- a/tests/cmdline/cmd_parsetree.py.exp
+++ b/tests/cmdline/cmd_parsetree.py.exp
@@ -1,5 +1,5 @@
 ----------------
-[   4] \(rule\|file_input_2\)(1) (n=10)
+[   1] file_input_2(1) (n=10)
          tok(6)
 [   4]   \(rule\|for_stmt\)(22) (n=4)
            id(i)