diff --git a/py/misc.h b/py/misc.h index 5d557db6fe..a14bef7fe6 100644 --- a/py/misc.h +++ b/py/misc.h @@ -121,8 +121,15 @@ typedef uint32_t unichar; typedef uint unichar; #endif +#if MICROPY_PY_BUILTINS_STR_UNICODE unichar utf8_get_char(const byte *s); const byte *utf8_next_char(const byte *s); +size_t utf8_charlen(const byte *str, size_t len); +#else +static inline unichar utf8_get_char(const byte *s) { return *s; } +static inline const byte *utf8_next_char(const byte *s) { return s + 1; } +static inline size_t utf8_charlen(const byte *str, size_t len) { (void)str; return len; } +#endif bool unichar_isspace(unichar c); bool unichar_isalpha(unichar c); @@ -135,7 +142,6 @@ bool unichar_islower(unichar c); unichar unichar_tolower(unichar c); unichar unichar_toupper(unichar c); mp_uint_t unichar_xdigit_value(unichar c); -mp_uint_t unichar_charlen(const char *str, mp_uint_t len); #define UTF8_IS_NONASCII(ch) ((ch) & 0x80) #define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80) diff --git a/py/modbuiltins.c b/py/modbuiltins.c index e6f82df6f4..6b8886804b 100644 --- a/py/modbuiltins.c +++ b/py/modbuiltins.c @@ -346,7 +346,7 @@ STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) { const char *str = mp_obj_str_get_data(o_in, &len); #if MICROPY_PY_BUILTINS_STR_UNICODE if (MP_OBJ_IS_STR(o_in)) { - len = unichar_charlen(str, len); + len = utf8_charlen((const byte*)str, len); if (len == 1) { return mp_obj_new_int(utf8_get_char((const byte*)str)); } diff --git a/py/objstr.c b/py/objstr.c index 30153813da..ed9ab4e45b 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -1704,7 +1704,7 @@ STATIC mp_obj_t str_count(size_t n_args, const mp_obj_t *args) { // if needle_len is zero then we count each gap between characters as an occurrence if (needle_len == 0) { - return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char*)start, end - start) + 1); + return MP_OBJ_NEW_SMALL_INT(utf8_charlen(start, end - start) + 1); } // count the occurrences diff --git a/py/objstrunicode.c b/py/objstrunicode.c index a1f54b8a21..badb569d79 100644 --- a/py/objstrunicode.c +++ b/py/objstrunicode.c @@ -104,7 +104,7 @@ STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) { case MP_UNARY_OP_BOOL: return mp_obj_new_bool(str_len != 0); case MP_UNARY_OP_LEN: - return MP_OBJ_NEW_SMALL_INT(unichar_charlen((const char *)str_data, str_len)); + return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len)); default: return MP_OBJ_NULL; // op not supported } diff --git a/py/unicode.c b/py/unicode.c index 140b7ba712..935dc9012e 100644 --- a/py/unicode.c +++ b/py/unicode.c @@ -67,9 +67,9 @@ STATIC const uint8_t attr[] = { AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0 }; -// TODO: Rename to str_get_char -unichar utf8_get_char(const byte *s) { #if MICROPY_PY_BUILTINS_STR_UNICODE + +unichar utf8_get_char(const byte *s) { unichar ord = *s++; if (!UTF8_IS_NONASCII(ord)) return ord; ord &= 0x7F; @@ -80,22 +80,14 @@ unichar utf8_get_char(const byte *s) { ord = (ord << 6) | (*s++ & 0x3F); } return ord; -#else - return *s; -#endif } -// TODO: Rename to str_next_char const byte *utf8_next_char(const byte *s) { -#if MICROPY_PY_BUILTINS_STR_UNICODE ++s; while (UTF8_IS_CONT(*s)) { ++s; } return s; -#else - return s + 1; -#endif } mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) { @@ -109,21 +101,18 @@ mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) { return i; } -// TODO: Rename to str_charlen -mp_uint_t unichar_charlen(const char *str, mp_uint_t len) { -#if MICROPY_PY_BUILTINS_STR_UNICODE - mp_uint_t charlen = 0; - for (const char *top = str + len; str < top; ++str) { +size_t utf8_charlen(const byte *str, size_t len) { + size_t charlen = 0; + for (const byte *top = str + len; str < top; ++str) { if (!UTF8_IS_CONT(*str)) { ++charlen; } } return charlen; -#else - return len; -#endif } +#endif + // Be aware: These unichar_is* functions are actually ASCII-only! bool unichar_isspace(unichar c) { return c < 128 && (attr[c] & FL_SPACE) != 0; @@ -183,6 +172,8 @@ mp_uint_t unichar_xdigit_value(unichar c) { return n; } +#if MICROPY_PY_BUILTINS_STR_UNICODE + bool utf8_check(const byte *p, size_t len) { uint8_t need = 0; const byte *end = p + len; @@ -210,3 +201,5 @@ bool utf8_check(const byte *p, size_t len) { } return need == 0; // no pending fragments allowed } + +#endif