From 47c234584d3358dfa6b4003d5e7264105d17b8f7 Mon Sep 17 00:00:00 2001
From: Chris Angelico <rosuav@gmail.com>
Date: Fri, 6 Jun 2014 13:15:32 +1000
Subject: [PATCH] objstr: Record character length separately from byte length

CAUTION: Buggy and may crash - qstr does not record a character length yet and needs equivalent functionality
---
 py/objstr.c | 27 +++++++++++++++++----------
 py/objstr.h |  5 ++++-
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/py/objstr.c b/py/objstr.c
index 096315db6f..2ad4bf9a7e 100644
--- a/py/objstr.c
+++ b/py/objstr.c
@@ -52,6 +52,10 @@ const mp_obj_t mp_const_empty_bytes;
 // use this macro to extract the string data and length
 #define GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_flags) const byte *str_data; uint str_len; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
 
+// use this macro to extract the string data, lengths, and flags
+// NOTE: Currently buggy for qstr, which doesn't record a charlen: str_charlen is left at its initial value of (uint)-1
+#define GET_STR_INFO(str_obj_in, str_data, str_len, str_charlen, str_flags) const byte *str_data; uint str_len, str_charlen = -1; char str_flags; if (MP_OBJ_IS_QSTR(str_obj_in)) { str_data = qstr_data(MP_OBJ_QSTR_VALUE(str_obj_in), &str_len, &str_flags); } else { str_len = ((mp_obj_str_t*)str_obj_in)->len; str_charlen = ((mp_obj_str_t*)str_obj_in)->charlen; str_data = ((mp_obj_str_t*)str_obj_in)->data; str_flags = ((mp_obj_str_t*)str_obj_in)->flags; }
+
 // don't use this macro, it's only for conversions
 #define GET_STR_DATA_LEN(str_obj_in, str_data, str_len) GET_STR_DATA_LEN_FLAGS(str_obj_in, str_data, str_len, str_data ## _flags); assert(str_data ## _flags == 1);
 
@@ -355,7 +359,7 @@ uncomparable:
 
 STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
     mp_obj_type_t *type = mp_obj_get_type(self_in);
-    GET_STR_DATA_LEN(self_in, self_data, self_len);
+    GET_STR_INFO(self_in, self_data, self_len, self_charlen, self_flags);
     if (value == MP_OBJ_SENTINEL) {
         // load
 #if MICROPY_PY_BUILTINS_SLICE
@@ -368,7 +372,7 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
             return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
         }
 #endif
-        uint index_val = mp_get_index(type, self_len, index, false);
+        uint index_val = mp_get_index(type, self_charlen, index, false);
         if (type == &mp_type_bytes) {
             return MP_OBJ_NEW_SMALL_INT((mp_small_int_t)self_data[index_val]);
         } else {
@@ -377,8 +381,11 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
             // end of the buffer if there aren't that many characters in it
             const char *s;
             for (s=(const char *)self_data; index_val; ++s)
-                if ((*s&0xC0) != 0x80) --index_val;
-            return mp_obj_new_str(s, 1, true);
+                if ((*s & 0xC0) != 0x80) --index_val;
+            int len = 1;
+            if (*s > 0x7f)
+                for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
+            return mp_obj_new_str(s, len, true); // This will create a one-character string
         }
     } else {
         return MP_OBJ_NULL; // op not supported
@@ -1710,7 +1717,7 @@ const mp_obj_type_t mp_type_bytes = {
 };
 
 // the zero-length bytes
-STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 1, NULL};
+STATIC const mp_obj_str_t empty_bytes_obj = {{&mp_type_bytes}, 0, 0, 0, 1, NULL};
 const mp_obj_t mp_const_empty_bytes = (mp_obj_t)&empty_bytes_obj;
 
 mp_obj_t mp_obj_str_builder_start(const mp_obj_type_t *type, uint len, byte **data) {
@@ -1739,12 +1746,12 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
     o->len = len;
     o->flags = 1;
     if (data) {
-        // Calculate the byte length used by 'len' characters (by counting non-continuation bytes)
+        // Count non-continuation bytes so we know how long the string is in characters.
         const byte *endptr, *top = data + len;
-        uint lenleft = len;
-        for (endptr = data; endptr < top && lenleft; ++endptr)
-            if ((*endptr & 0xC0) != 0x80) --lenleft;
-        len = endptr - data; // Work with the byte length now (the object's length is stored above)
+        uint charlen = 0;
+        for (endptr = data; endptr < top; ++endptr)
+            if ((*endptr & 0xC0) != 0x80) ++charlen;
+        o->charlen = charlen;
         o->hash = qstr_compute_hash(data, len);
         byte *p = m_new(byte, len + 1);
         o->data = p;
diff --git a/py/objstr.h b/py/objstr.h
index 6c9e446455..66199ea145 100644
--- a/py/objstr.h
+++ b/py/objstr.h
@@ -30,11 +30,14 @@ typedef struct _mp_obj_str_t {
     machine_uint_t hash : 16;
     // len == number of bytes used in data, alloc = len + 1 because (at the moment) we also append a null byte
     machine_uint_t len : 16;
+    // charlen == number of characters in the string - charlen <= len, and is the value returned by len() in Python
+    machine_uint_t charlen : 16;
     char flags; //Currently unused, always 1. Will later get markers eg ASCII-only.
     const void *data; //Character data is encoded UTF-8 and should not be blindly indexed.
 } mp_obj_str_t;
 
-#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, 1, (const byte*)str};
+// This is valid ONLY for pure-ASCII strings, because charlen is set equal to the byte length!
+#define MP_DEFINE_STR_OBJ(obj_name, str) mp_obj_str_t obj_name = {{&mp_type_str}, 0, sizeof(str) - 1, sizeof(str) - 1, 1, (const byte*)str};
 
 mp_obj_t mp_obj_str_format(uint n_args, const mp_obj_t *args);
 mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uint len);