diff --git a/py/mpconfig.h b/py/mpconfig.h
index dac8a903c..38cf4b560 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -691,6 +691,11 @@ typedef double mp_float_t;
 #define MICROPY_PY_BUILTINS_STR_UNICODE (0)
 #endif
 
+// Whether to check for valid UTF-8 when converting bytes to str
+#ifndef MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+#define MICROPY_PY_BUILTINS_STR_UNICODE_CHECK (MICROPY_PY_BUILTINS_STR_UNICODE)
+#endif
+
 // Whether str.center() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_CENTER
 #define MICROPY_PY_BUILTINS_STR_CENTER (0)
diff --git a/py/objstr.c b/py/objstr.c
index 4c287af04..f6214f80c 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -161,6 +161,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
                 if (str_hash == 0) {
                     str_hash = qstr_compute_hash(str_data, str_len);
                 }
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(str_data, str_len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len));
                 o->data = str_data;
                 o->hash = str_hash;
@@ -168,6 +173,11 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_
             } else {
                 mp_buffer_info_t bufinfo;
                 mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
+                #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
+                if (!utf8_check(bufinfo.buf, bufinfo.len)) {
+                    mp_raise_msg(&mp_type_UnicodeError, NULL);
+                }
+                #endif
                 return mp_obj_new_str(bufinfo.buf, bufinfo.len, false);
             }
     }
diff --git a/py/unicode.c b/py/unicode.c
index eddb007d5..140b7ba71 100644
--- a/py/unicode.c
+++ b/py/unicode.c
@@ -182,3 +182,38 @@ mp_uint_t unichar_xdigit_value(unichar c) {
     }
     return n;
 }
+
+// Check whether the len bytes at p are structurally well-formed UTF-8: every
+// lead byte (0xc0-0xf7) must be followed by the right number of continuation
+// bytes (0x80-0xbf) and the data must not end in the middle of a sequence.
+// Overlong forms, surrogates and code points above U+10FFFF are not rejected.
+bool utf8_check(const byte *p, size_t len) {
+    uint8_t need = 0;
+    const byte *end = p + len;
+    for (; p < end; p++) {
+        byte c = *p;
+        if (need) {
+            // a continuation byte must have the form 10xxxxxx
+            if ((c & 0xc0) == 0x80) {
+                need--;
+            } else {
+                // mismatch
+                return false;
+            }
+        } else {
+            if (c >= 0xc0) {
+                if (c >= 0xf8) {
+                    // mismatch
+                    return false;
+                }
+                // 0xe5 packs the continuation counts 1, 1, 2, 3 as 2-bit
+                // fields, selected by bits 4-5 of the lead byte
+                need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
+            } else if (c >= 0x80) {
+                // mismatch
+                return false;
+            }
+        }
+    }
+    return need == 0; // no pending fragments allowed
+}
diff --git a/py/unicode.h b/py/unicode.h
index 19487a65a..c1fb51789 100644
--- a/py/unicode.h
+++ b/py/unicode.h
@@ -30,5 +30,6 @@
 #include "py/misc.h"
 
 mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr);
+bool utf8_check(const byte *p, size_t len);
 
 #endif // MICROPY_INCLUDED_PY_UNICODE_H
diff --git a/tests/unicode/unicode.py b/tests/unicode/unicode.py
index 5f29bc1c9..3a35ce894 100644
--- a/tests/unicode/unicode.py
+++ b/tests/unicode/unicode.py
@@ -33,3 +33,22 @@ try:
     int('\u0200')
 except ValueError:
     print('ValueError')
+
+# test invalid UTF-8 string
+try:
+    str(b'ab\xa1', 'utf8')
+except UnicodeError:
+    print('UnicodeError')
+try:
+    str(b'ab\xf8', 'utf8')
+except UnicodeError:
+    print('UnicodeError')
+try:
+    str(bytearray(b'ab\xc0a'), 'utf8')
+except UnicodeError:
+    print('UnicodeError')
+# a lead byte where a continuation byte is required
+try:
+    str(b'ab\xc3\xc0', 'utf8')
+except UnicodeError:
+    print('UnicodeError')