py: Support unicode (utf-8 encoded) identifiers in Python source.

Enabled simply by making the identifier lexing code 8-bit clean.
This commit is contained in:
Damien George 2015-06-09 10:58:07 +00:00
parent 6e56bb623c
commit 7ed58cb663
2 changed files with 32 additions and 6 deletions

View File

@ -112,12 +112,11 @@ STATIC bool is_following_odigit(mp_lexer_t *lex) {
return lex->chr1 >= '0' && lex->chr1 <= '7';
}
// TODO UNICODE include unicode characters in definition of identifiers
// to easily parse utf-8 identifiers we allow any raw byte with high bit set
//
// Reports whether the lexer's current character (lex->chr0) can begin an
// identifier.
// NOTE(review): this hunk is a diff shown without +/- markers, so two return
// statements appear. Going by the commit message, the first is presumably the
// removed pre-commit line and the second its replacement -- confirm against
// the repository. The TODO above likewise looks like a removed line.
STATIC bool is_head_of_identifier(mp_lexer_t *lex) {
// accepts whatever is_letter() accepts, plus underscore
return is_letter(lex) || lex->chr0 == '_';
// same test, additionally accepting any chr0 value >= 0x80; every byte of a
// multi-byte utf-8 sequence has its high bit set, so such bytes pass through
return is_letter(lex) || lex->chr0 == '_' || lex->chr0 >= 0x80;
}
// TODO UNICODE include unicode characters in definition of identifiers
// Reports whether the lexer's current character can continue an identifier:
// anything is_head_of_identifier() accepts, or whatever is_digit() accepts.
STATIC bool is_tail_of_identifier(mp_lexer_t *lex) {
return is_head_of_identifier(lex) || is_digit(lex);
}
@ -523,13 +522,13 @@ STATIC void mp_lexer_next_token_into(mp_lexer_t *lex, bool first_token) {
} else if (is_head_of_identifier(lex)) {
lex->tok_kind = MP_TOKEN_NAME;
// get first char
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
// get first char (add as byte to remain 8-bit clean and support utf-8)
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
// get tail chars
while (!is_end(lex) && is_tail_of_identifier(lex)) {
vstr_add_char(&lex->vstr, CUR_CHAR(lex));
vstr_add_byte(&lex->vstr, CUR_CHAR(lex));
next_char(lex);
}

View File

@ -0,0 +1,27 @@
# test unicode in identifiers

# comment
# αβγδϵφζ

# global identifiers
# (names made only of non-ASCII letters, and names mixing ASCII with non-ASCII
# in either order)
α = 1
αβγ = 2
bβ = 3
βb = 4
print(α, αβγ, bβ, βb)

# function, argument, local identifiers
# (note: this rebinds the global name α from the integer above to a function)
def α(β, γ):
    δ = β + γ
    print(β, γ, δ)
α(1, 2)

# class, method identifiers
class φ:
    def __init__(self):
        pass
    def δ(self, ϵ):
        print(ϵ)
zζzζz = φ()
# attribute lookup by a string containing a non-ASCII name, then a call that
# passes the argument by its non-ASCII keyword
if hasattr(zζzζz, "δ"):
    zζzζz.δ(ϵ=123)