From e30a5fc7bcd27900e0657db97ed54fc056d8f852 Mon Sep 17 00:00:00 2001
From: Damien George
Date: Thu, 24 May 2018 13:08:51 +1000
Subject: [PATCH] extmod/modure: Add ure.sub() function and method, and tests.

This feature is controlled at compile time by MICROPY_PY_URE_SUB, disabled
by default.

Thanks to @dmazzella for the original patch for this feature; see #3770.
---
Review note: the group-number parser now stops accumulating once the number
is out of range (the unsigned value could previously wrap round to a valid
group), with a matching test.  Line counts below are updated; the blob ids
on the "index" lines are left over from the original patch.

 extmod/modure.c                       | 142 ++++++++++++++++++++++++++
 py/mpconfig.h                         |   4 +
 tests/extmod/ure_sub.py               |  67 ++++++++++++
 tests/extmod/ure_sub_unmatched.py     |  19 ++++
 tests/extmod/ure_sub_unmatched.py.exp |   1 +
 5 files changed, 233 insertions(+)
 create mode 100644 tests/extmod/ure_sub.py
 create mode 100644 tests/extmod/ure_sub_unmatched.py
 create mode 100644 tests/extmod/ure_sub_unmatched.py.exp

diff --git a/extmod/modure.c b/extmod/modure.c
index a536f907fd..0d5330cb54 100644
--- a/extmod/modure.c
+++ b/extmod/modure.c
@@ -249,10 +249,140 @@ STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
 }
 MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
 
+#if MICROPY_PY_URE_SUB
+
+// Common implementation of the module-level sub() and the regex sub() method.
+// self_in is the compiled regex; args[1] is the replacement (a string, or a
+// callable that is called with the match object), args[2] is the string to
+// search and the optional args[3] is the maximum number of substitutions
+// (values <= 0 mean no limit).  Group references in the replacement, written
+// as "\number" or "\g<number>", are expanded; an out-of-range group number
+// raises IndexError.  Returns args[2] itself if nothing was substituted.
+STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
+    mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_obj_t replace = args[1];
+    mp_obj_t where = args[2];
+    mp_int_t count = 0;
+    if (n_args > 3) {
+        count = mp_obj_get_int(args[3]);
+        // Note: flags are currently ignored
+    }
+
+    size_t where_len;
+    const char *where_str = mp_obj_str_get_data(where, &where_len);
+    Subject subj;
+    subj.begin = where_str;
+    subj.end = subj.begin + where_len;
+    int caps_num = (self->re.sub + 1) * 2;
+
+    vstr_t vstr_return;
+    vstr_return.buf = NULL; // We'll init the vstr after the first match
+    mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
+    match->base.type = &match_type;
+    match->num_matches = caps_num / 2; // caps_num counts start and end pointers
+    match->str = where;
+
+    for (;;) {
+        // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
+        memset((char*)match->caps, 0, caps_num * sizeof(char*));
+        int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);
+
+        // If we didn't have a match, or had an empty match, it's time to stop
+        if (!res || match->caps[0] == match->caps[1]) {
+            break;
+        }
+
+        // Initialise the vstr if it's not already
+        if (vstr_return.buf == NULL) {
+            vstr_init(&vstr_return, match->caps[0] - subj.begin);
+        }
+
+        // Add pre-match string
+        vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
+
+        // Get replacement string
+        const char* repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
+
+        // Append replacement string to result, substituting any regex groups
+        while (*repl != '\0') {
+            if (*repl == '\\') {
+                ++repl;
+                bool is_g_format = false;
+                if (*repl == 'g' && repl[1] == '<') {
+                    // Group specified with syntax "\g<number>"
+                    repl += 2;
+                    is_g_format = true;
+                }
+
+                if ('0' <= *repl && *repl <= '9') {
+                    // Group specified with syntax "\g<number>" or "\number"
+                    unsigned int match_no = 0;
+                    do {
+                        unsigned int digit = *repl++ - '0';
+                        // Stop accumulating once the number is out of range, so a
+                        // long run of digits can't wrap round to a valid group
+                        if (match_no < (unsigned int)match->num_matches) {
+                            match_no = match_no * 10 + digit;
+                        }
+                    } while ('0' <= *repl && *repl <= '9');
+                    if (is_g_format && *repl == '>') {
+                        ++repl;
+                    }
+
+                    if (match_no >= (unsigned int)match->num_matches) {
+                        nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
+                    }
+
+                    const char *start_match = match->caps[match_no * 2];
+                    if (start_match != NULL) {
+                        // Add the substring matched by group
+                        const char *end_match = match->caps[match_no * 2 + 1];
+                        vstr_add_strn(&vstr_return, start_match, end_match - start_match);
+                    }
+                }
+            } else {
+                // Just add the current byte from the replacement string
+                vstr_add_byte(&vstr_return, *repl++);
+            }
+        }
+
+        // Move start pointer to end of last match
+        subj.begin = match->caps[1];
+
+        // Stop substitutions if count was given and gets to 0
+        if (count > 0 && --count == 0) {
+            break;
+        }
+    }
+
+    mp_local_free(match);
+
+    if (vstr_return.buf == NULL) {
+        // Optimisation for case of no substitutions
+        return where;
+    }
+
+    // Add post-match string
+    vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);
+
+    return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
+}
+
+// Method form of sub(): args[0] is the compiled regex object
+STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
+    return re_sub_helper(args[0], n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);
+
+#endif
+
 STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
     { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
     { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
     { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
+    #if MICROPY_PY_URE_SUB
+    { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
+    #endif
 };
 
 STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
@@ -307,11 +437,23 @@ STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
 }
 MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
 
+#if MICROPY_PY_URE_SUB
+// Module-level form of sub(): args[0] is first passed to mod_re_compile
+STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
+    mp_obj_t self = mod_re_compile(1, args);
+    return re_sub_helper(self, n_args, args);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
+#endif
+
 STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
     { MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
     { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
     { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
+    #if MICROPY_PY_URE_SUB
+    { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
+    #endif
     { MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
 };
 
diff --git a/py/mpconfig.h b/py/mpconfig.h
index 727375b123..8b0f291cb0 100644
--- a/py/mpconfig.h
+++ b/py/mpconfig.h
@@ -1150,6 +1150,10 @@ typedef double mp_float_t;
 #define MICROPY_PY_URE_MATCH_SPAN_START_END (0)
 #endif
 
+#ifndef MICROPY_PY_URE_SUB
+#define MICROPY_PY_URE_SUB (0)
+#endif
+
 #ifndef MICROPY_PY_UHEAPQ
 #define MICROPY_PY_UHEAPQ (0)
 #endif
diff --git a/tests/extmod/ure_sub.py b/tests/extmod/ure_sub.py
new file mode 100644
index 0000000000..4aeb8650a1
--- /dev/null
+++ b/tests/extmod/ure_sub.py
@@ -0,0 +1,67 @@
+try:
+    import ure as re
+except ImportError:
+    try:
+        import re
+    except ImportError:
+        print('SKIP')
+        raise SystemExit
+
+try:
+    re.sub
+except AttributeError:
+    print('SKIP')
+    raise SystemExit
+
+
+def multiply(m):
+    return str(int(m.group(0)) * 2)
+
+print(re.sub("\d+", multiply, "10 20 30 40 50"))
+
+print(re.sub("\d+", lambda m: str(int(m.group(0)) // 2), "10 20 30 40 50"))
+
+def A():
+    return "A"
+print(re.sub('a', A(), 'aBCBABCDabcda.'))
+
+print(
+    re.sub(
+        r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
+        'static PyObject*\npy_\\1(void){\n return;\n}\n',
+        '\n\ndef myfunc():\n\ndef myfunc1():\n\ndef myfunc2():'
+    )
+)
+
+print(
+    re.compile(
+        '(calzino) (blu|bianco|verde) e (scarpa) (blu|bianco|verde)'
+    ).sub(
+        r'\g<1> colore \2 con \g<3> colore \4? ...',
+        'calzino blu e scarpa verde'
+    )
+)
+
+# no matches at all
+print(re.sub('a', 'b', 'c'))
+
+# with maximum substitution count specified
+print(re.sub('a', 'b', '1a2a3a', 2))
+
+# invalid group
+try:
+    re.sub('(a)', 'b\\2', 'a')
+except:
+    print('invalid group')
+
+# invalid group with very large number (to test overflow in uPy)
+try:
+    re.sub('(a)', 'b\\199999999999999999999999999999999999999', 'a')
+except:
+    print('invalid group')
+
+# invalid group whose number would wrap round to a valid one in 32-bit arithmetic
+try:
+    re.sub('(a)', 'b\\4294967297', 'a')
+except:
+    print('invalid group')
diff --git a/tests/extmod/ure_sub_unmatched.py b/tests/extmod/ure_sub_unmatched.py
new file mode 100644
index 0000000000..4795b3196f
--- /dev/null
+++ b/tests/extmod/ure_sub_unmatched.py
@@ -0,0 +1,19 @@
+# test re.sub with unmatched groups, behaviour changed in CPython 3.5
+
+try:
+    import ure as re
+except ImportError:
+    try:
+        import re
+    except ImportError:
+        print('SKIP')
+        raise SystemExit
+
+try:
+    re.sub
+except AttributeError:
+    print('SKIP')
+    raise SystemExit
+
+# first group matches, second optional group doesn't so is replaced with a blank
+print(re.sub(r'(a)(b)?', r'\2-\1', '1a2'))
diff --git a/tests/extmod/ure_sub_unmatched.py.exp b/tests/extmod/ure_sub_unmatched.py.exp
new file mode 100644
index 0000000000..1e5f0fda05
--- /dev/null
+++ b/tests/extmod/ure_sub_unmatched.py.exp
@@ -0,0 +1 @@
+1-a2