Formatting/layout improvements - introduce macros for UTF-8 byte detection, add braces. No functional changes.

pull/671/head
Chris Angelico 2014-06-07 15:28:35 +10:00
rodzic f1911f53d5
commit 279de0c8eb
4 zmienionych plików z 35 dodań i 14 usunięć

Wyświetl plik

@@ -360,12 +360,12 @@ STATIC mp_obj_t mp_builtin_ord(mp_obj_t o_in) {
uint len, charlen;
const char *str = mp_obj_str_get_data_len(o_in, &len, &charlen);
if (charlen == 1) {
if (MP_OBJ_IS_STR(o_in) && (*str & 0x80)) {
if (MP_OBJ_IS_STR(o_in) && UTF8_IS_NONASCII(*str)) {
machine_int_t ord = *str++ & 0x7F;
for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
ord &= ~mask;
}
while ((*str & 0xC0) == 0x80) {
while (UTF8_IS_CONT(*str)) {
ord = (ord << 6) | (*str++ & 0x3F);
}
return mp_obj_new_int(ord);

Wyświetl plik

@@ -100,6 +100,8 @@ bool unichar_isupper(unichar c);
bool unichar_islower(unichar c);
unichar unichar_tolower(unichar c);
unichar unichar_toupper(unichar c);
// UTF-8 byte classification helpers.
// Non-zero when the high bit (0x80) of ch is set, i.e. ch is not a 7-bit ASCII byte.
#define UTF8_IS_NONASCII(ch) ((ch) & 0x80)
// True when the top two bits of ch are 10, i.e. ch is a UTF-8 continuation byte.
#define UTF8_IS_CONT(ch) (((ch) & 0xC0) == 0x80)
/** variable string *********************************************/

Wyświetl plik

@@ -109,7 +109,7 @@ void mp_str_print_quoted(void (*print)(void *env, const char *fmt, ...), void *e
for (machine_int_t mask = 0x40; ord & mask; mask >>= 1) {
ord &= ~mask;
}
while ((*s & 0xC0) == 0x80) {
while (UTF8_IS_CONT(*s)) {
ord = (ord << 6) | (*s++ & 0x3F);
}
--s; // s will be incremented by the main loop
@@ -398,12 +398,22 @@ STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
// Assumes that the string is correctly formed - will run past the
// end of the buffer if there aren't that many characters in it
const char *s;
for (s=(const char *)self_data; index_val; ++s)
if ((*s & 0xC0) != 0x80) --index_val;
while ((*s & 0xC0) == 0x80) ++s; // Skip continuation bytes after the last lead byte
for (s=(const char *)self_data; index_val; ++s) {
if (!UTF8_IS_CONT(*s)) {
--index_val;
}
}
// Skip continuation bytes after the last lead byte
while (UTF8_IS_CONT(*s)) {
++s;
}
int len = 1;
if (*s & 0x80)
for (char mask = 0x40; *s & mask; mask >>= 1) ++len; // Count the number of 1 bits (after the first)
if (UTF8_IS_NONASCII(*s)) {
// Count the number of 1 bits (after the first)
for (char mask = 0x40; *s & mask; mask >>= 1) {
++len;
}
}
return mp_obj_new_str(s, len, true); // This will create a one-character string
}
} else {
@@ -1769,8 +1779,11 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, uin
// Count non-continuation bytes so we know how long the string is in characters.
const byte *endptr, *top = data + len;
uint charlen = 0;
for (endptr = data; endptr < top; ++endptr)
if ((*endptr & 0xC0) != 0x80) ++charlen;
for (endptr = data; endptr < top; ++endptr) {
if (!UTF8_IS_CONT(*endptr)) {
++charlen;
}
}
o->charlen = charlen;
} else {
// For byte strings, the 'character' length (really the "exposed length" or "Python length") equals the byte length.

Wyświetl plik

@@ -162,8 +162,11 @@ qstr qstr_from_strn(const char *str, uint len) {
machine_uint_t hash = qstr_compute_hash((const byte*)str, len);
byte *q_ptr = m_new(byte, 7 + len + 1);
uint charlen = 0;
for (const char *s = str; s < str + len; ++s)
if ((*s & 0xC0) != 0x80) ++charlen;
for (const char *s = str; s < str + len; ++s) {
if (!UTF8_IS_CONT(*s)) {
++charlen;
}
}
q_ptr[0] = hash;
q_ptr[1] = hash >> 8;
q_ptr[2] = len;
@@ -195,8 +198,11 @@ qstr qstr_build_end(byte *q_ptr) {
q_ptr[0] = hash;
q_ptr[1] = hash >> 8;
uint charlen = 0;
for (const byte *s = str; s < str + len; ++s)
if ((*s & 0xC0) != 0x80) ++charlen;
for (const byte *s = str; s < str + len; ++s) {
if (!UTF8_IS_CONT(*s)) {
++charlen;
}
}
q_ptr[4] = charlen;
q_ptr[5] = charlen >> 8;
q_ptr[6] = 1;