From 47c234584d3358dfa6b4003d5e7264105d17b8f7 Mon Sep 17 00:00:00 2001 From: Chris Angelico Date: Fri, 6 Jun 2014 13:15:32 +1000 Subject: [PATCH] objstr: Record character length separately from byte length CAUTION: Buggy, may crash stuff - qstr needs equivalent functionality too --- py/objstr.c | 27 +++++++++++++++++---------- py/objstr.h | 5 ++++- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 096315db6f..2ad4bf9a7e 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -52,6 +52,10 @@ const mp_obj_t mp_const_empty_bytes; // use this macro to extract the string data and length #define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) const byte *str_data; uint str_len; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; } +// use this macro to extract the string data, lengths, and flags +// NOTE: Currently buggy as regards qstr, which doesn't record a charlen +#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) const byte *str_data; uint str_len, str_charlen = -1; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; } + // don't use this macro, it's only for conversions #define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_data ## _flags); assert(str_data ## _flags == 1); @@ -355,7 +359,7 @@ uncomparable: STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { mp_obj_type_t *type = mp_obj_get_type(self_in); - GET_STR_DATA_LEN(self_in, self_data, self_len); + GET_STR_INFO(self_in, self_data, self_len, self_charlen, self_flags); if (value == MP_OBJ_SENTINEL) { // load #if MICROPY_PY_BUILTINS_SLICE @@ -368,7 +372,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start); } #endif - uint index_val = mp_get_index(type, self_len, index, false); + uint index_val = mp_get_index(type, self_charlen, index, false); if (type == &mp_type_bytes) { return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]); } else { @@ -377,8 +381,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { // end of the buffer if there aren't that many characters in it const char *s; for (s=(const char *)self_data; index_val; ++s) - if ((*s&0xC0) != 0x80) --index_val; - return mp_obj_new_str(s, 1, true); + if ((*s & 0xC0) != 0x80) --index_val; + int len = 1; + if (*s > 0x7f) + for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first) + return mp_obj_new_str(s, len, true); // This will create a one-character string } } else { return MP_OBJ_NULL; // op not supported @@ -1710,7 +1717,7 @@ const mp_obj_type_t mp_type_bytes = { }; // the zero-length bytes -STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 1, NULL}; +STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, 1, NULL}; const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj; mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) { @@ -1739,12 +1746,12 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin o->len = len; o->flags = 1; if (data) { - // Calculate the byte length used by 'len' characters (by counting non-continuation bytes) + // Count non-continuation bytes so we know how long the string is in characters. const byte *endptr, *top = data + len; - uint lenleft = len; - for (endptr = data; endptr < top && lenleft; ++endptr) - if ((*endptr & 0xC0) != 0x80) --lenleft; - len = endptr - data; // Work with the byte length now (the object's length is stored above) + uint charlen = 0; + for (endptr = data; endptr < top; ++endptr) + if ((*endptr & 0xC0) != 0x80) ++charlen; + o->charlen = charlen; o->hash = qstr_compute_hash(data, len); byte *p = m_new(byte, len + 1); o->data = p; diff --git a/py/objstr.h b/py/objstr.h index 6c9e446455..66199ea145 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -30,11 +30,14 @@ typedef struct _mp_obj_str_t { machine_uint_t hash : 16; // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte machine_uint_t len : 16; + // charlen == number of characters in the string - charlen <= len - 1, and is the value returned by len() in Python + machine_uint_t charlen : 16; char flags; //Currently unused, always 1. Will later get markers eg ASCII-only. const void *data; //Character data is encoded UTF-8 and should not be blindly indexed. } mp_obj_str_t; -#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, 1, (const byte*)str}; +// This is valid ONLY for pure-ASCII strings! +#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, 1, (const byte*)str}; mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args); mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);