From 1f1d5194d775ad996f1d341c1a44b56af7ea4d4c Mon Sep 17 00:00:00 2001 From: Damien George Date: Thu, 16 Nov 2017 13:53:04 +1100 Subject: [PATCH] py/objstr: Make mp_obj_new_str_of_type check for existing interned qstr. The function mp_obj_new_str_of_type is a general str object constructor used in many places in the code to create either a str or bytes object. When creating a str it should first check if the string data already exists as an interned qstr, and if so then return the qstr object. This patch makes the function have such behaviour, which helps to reduce heap usage by reusing existing interned data where possible. The old behaviour of mp_obj_new_str_of_type (which didn't check for existing interned data) is made available through the function mp_obj_new_str_copy, but should only be used in very special cases. One consequence of this patch is that the following expression is now True: 'abc' is ' abc '.split()[0] --- py/objstr.c | 26 +++++++++++++++++++------- py/objstr.h | 1 + py/parse.c | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/py/objstr.c b/py/objstr.c index 5c464ba7d6..bca2af801b 100644 --- a/py/objstr.c +++ b/py/objstr.c @@ -164,7 +164,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_ mp_raise_msg(&mp_type_UnicodeError, NULL); } #endif - mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(type, NULL, str_len)); + mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len)); o->data = str_data; o->hash = str_hash; return MP_OBJ_FROM_PTR(o); @@ -205,7 +205,7 @@ STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size if (str_hash == 0) { str_hash = qstr_compute_hash(str_data, str_len); } - mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_of_type(&mp_type_bytes, NULL, str_len)); + mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(&mp_type_bytes, NULL, str_len)); o->data = str_data; o->hash = str_hash; return MP_OBJ_FROM_PTR(o); @@ -226,7 +226,7 @@ STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size // check if argument has the buffer protocol mp_buffer_info_t bufinfo; if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) { - return mp_obj_new_str_of_type(&mp_type_bytes, bufinfo.buf, bufinfo.len); + return mp_obj_new_bytes(bufinfo.buf, bufinfo.len); } vstr_t vstr; @@ -1977,8 +1977,9 @@ const mp_obj_type_t mp_type_bytes = { const mp_obj_str_t mp_const_empty_bytes_obj = {{&mp_type_bytes}, 0, 0, (const byte*)""}; // Create a str/bytes object using the given data. New memory is allocated and -// the data is copied across. -mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, size_t len) { +// the data is copied across. This function should only be used if the type is bytes, +// or if the type is str and the string data is known to be not interned. +mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte* data, size_t len) { mp_obj_str_t *o = m_new_obj(mp_obj_str_t); o->base.type = type; o->len = len; @@ -1992,6 +1993,17 @@ mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, siz return MP_OBJ_FROM_PTR(o); } +// Create a str/bytes object using the given data. If the type is str and the string +// data is already interned, then a qstr object is returned. Otherwise new memory is +// allocated for the object and the data is copied across. +mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, size_t len) { + if (type == &mp_type_str) { + return mp_obj_new_str((const char*)data, len); + } else { + return mp_obj_new_bytes(data, len); + } +} + // Create a str using a qstr to store the data; may use existing or new qstr. mp_obj_t mp_obj_new_str_via_qstr(const char* data, size_t len) { return MP_OBJ_NEW_QSTR(qstr_from_strn(data, len)); @@ -2034,7 +2046,7 @@ mp_obj_t mp_obj_new_str(const char* data, size_t len) { return MP_OBJ_NEW_QSTR(q); } else { // no existing qstr, don't make one - return mp_obj_new_str_of_type(&mp_type_str, (const byte*)data, len); + return mp_obj_new_str_copy(&mp_type_str, (const byte*)data, len); } } @@ -2044,7 +2056,7 @@ mp_obj_t mp_obj_str_intern(mp_obj_t str) { } mp_obj_t mp_obj_new_bytes(const byte* data, size_t len) { - return mp_obj_new_str_of_type(&mp_type_bytes, data, len); + return mp_obj_new_str_copy(&mp_type_bytes, data, len); } bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) { diff --git a/py/objstr.h b/py/objstr.h index 82501a763e..4e55cad091 100644 --- a/py/objstr.h +++ b/py/objstr.h @@ -65,6 +65,7 @@ mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len); mp_obj_t mp_obj_str_format(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs); mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args); +mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte* data, size_t len); mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte* data, size_t len); mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in); diff --git a/py/parse.c b/py/parse.c index 8c51b03498..f7fe30418c 100644 --- a/py/parse.c +++ b/py/parse.c @@ -417,7 +417,7 @@ STATIC void push_result_token(parser_t *parser, const rule_t *rule) { pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst); } else { // not interned, make a node holding a pointer to the string/bytes object - mp_obj_t o = mp_obj_new_str_of_type( + mp_obj_t o = mp_obj_new_str_copy( lex->tok_kind == MP_TOKEN_STRING ? &mp_type_str : &mp_type_bytes, (const byte*)lex->vstr.buf, lex->vstr.len); pn = make_node_const_object(parser, lex->tok_line, o);