From 6f3c6d7de917eae16f9b7efb7fb93014d0e8dc69 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Mon, 25 Mar 2024 20:58:51 -0500 Subject: [PATCH 1/7] re: Add support for start- and endpos. Pattern objects have two additional parameters for the ::search and ::match methods to define the starting and ending position of the subject within the string to be searched. This allows for searching a sub-string without creating a slice. However, one caveat of using the start-pos rather than a slice is that the start anchor (`^`) remains anchored to the beginning of the text. --- extmod/modre.c | 24 ++++++++++- tests/extmod/re_start_end_pos.py | 59 ++++++++++++++++++++++++++++ tests/extmod/re_start_end_pos.py.exp | 14 +++++++ 3 files changed, 96 insertions(+), 1 deletion(-) create mode 100644 tests/extmod/re_start_end_pos.py create mode 100644 tests/extmod/re_start_end_pos.py.exp diff --git a/extmod/modre.c b/extmod/modre.c index 2a3fdfd350..3a203644a8 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -195,10 +195,11 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t } static mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) { - (void)n_args; mp_obj_re_t *self; + bool was_compiled = false; if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) { self = MP_OBJ_TO_PTR(args[0]); + was_compiled = true; } else { self = MP_OBJ_TO_PTR(mod_re_compile(1, args)); } @@ -206,6 +207,27 @@ static mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) { size_t len; subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); subj.end = subj.begin + len; + + if (was_compiled && n_args > 2) { + // Arg #2 is starting-pos + size_t startpos = mp_obj_get_int(args[2]); + if (startpos >= len) { + startpos = len; + } + subj.begin += startpos; + if (n_args > 3) { + // Arg #3 is ending-pos + size_t endpos = mp_obj_get_int(args[3]); + if (endpos > len) { + endpos = len; + } + else if (endpos < startpos) { + endpos = startpos; + } + subj.end = subj.begin_line + endpos; + } + } + int caps_num = (self->re.sub + 1) * 2; mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num); // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py new file mode 100644 index 0000000000..f8405b7852 --- /dev/null +++ b/tests/extmod/re_start_end_pos.py @@ -0,0 +1,59 @@ +# test start and end pos specification + +try: + import re +except ImportError: + print("SKIP") + raise SystemExit + + +def print_groups(match): + print("----") + try: + if match is not None: + i = 0 + while True: + print(match.group(i)) + i += 1 + except IndexError: + pass + +p = re.compile(r"o") +m = p.match("dog") +print_groups(m) + +m = p.match("dog", 1) +print_groups(m) + +m = p.match("dog", 2) +print_groups(m) + +m = p.match("dog", 3) # Past end of input +print_groups(m) + +m = p.match("dog", 0, 1) +print_groups(m) + +# Caret only matches the actual beginning +p = re.compile(r"^o") +m = p.match("dog", 1) +print_groups(m) + +# End at begging means searching empty string +p = re.compile(r"o") +m = p.match("dog", 1, 1) +print_groups(m) + +# Search also works +print('--search') + +p = re.compile(r"o") +m = p.search('dog') +print_groups(m) + +m = p.search('dog', 1) +print_groups(m) + +m = p.search('dog', 2) +print_groups(m) + diff --git a/tests/extmod/re_start_end_pos.py.exp b/tests/extmod/re_start_end_pos.py.exp new file mode 100644 index 0000000000..e02177fccd --- /dev/null +++ b/tests/extmod/re_start_end_pos.py.exp @@ -0,0 +1,14 @@ +---- +---- +o +---- +---- +---- +---- +---- +--search +---- +o +---- +o +---- From f710667f097cc0cc576d30d2d0689b507498128f Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Mon, 25 Mar 2024 21:10:27 -0500 Subject: [PATCH 2/7] re: Add doc for `pos` and `endpos` params for search and match. --- docs/library/re.rst | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/docs/library/re.rst b/docs/library/re.rst index 19b15d2d2c..b8aeefd90c 100644 --- a/docs/library/re.rst +++ b/docs/library/re.rst @@ -154,8 +154,8 @@ Regex objects Compiled regular expression. Instances of this class are created using `re.compile()`. -.. method:: regex.match(string) - regex.search(string) +.. method:: regex.match(string, [pos, [endpos]]) + regex.search(string, [pos, [endpos]]) regex.sub(replace, string, count=0, flags=0, /) Similar to the module-level functions :meth:`match`, :meth:`search` @@ -163,6 +163,16 @@ Compiled regular expression. Instances of this class are created using Using methods is (much) more efficient if the same regex is applied to multiple strings. + The optional second parameter *pos* gives an index in the string where the + search is to start; it defaults to ``0``. This is not completely equivalent + to slicing the string; the ``'^'`` pattern character matches at the real + beginning of the string and at positions just after a newline, but not + necessarily at the index where the search is to start. + + The optional parameter *endpos* limits how far the string will be searched; + it will be as if the string is *endpos* characters long, so only the + characters from *pos* to ``endpos - 1`` will be searched for a match. + .. method:: regex.split(string, max_split=-1, /) Split a *string* using regex. If *max_split* is given, it specifies From b190872d02ac828e04b10cbb026ebd88814c47ee Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Tue, 26 Mar 2024 09:27:28 -0500 Subject: [PATCH 3/7] re: Drop unneeded EXP file to ensure compatibility with CPython. --- tests/extmod/re_start_end_pos.py.exp | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 tests/extmod/re_start_end_pos.py.exp diff --git a/tests/extmod/re_start_end_pos.py.exp b/tests/extmod/re_start_end_pos.py.exp deleted file mode 100644 index e02177fccd..0000000000 --- a/tests/extmod/re_start_end_pos.py.exp +++ /dev/null @@ -1,14 +0,0 @@ ----- ----- -o ----- ----- ----- ----- ----- ---search ----- -o ----- -o ----- From 3e5a718f8457f2a328ab775413b5a8a133b114e4 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Tue, 26 Mar 2024 21:41:28 -0500 Subject: [PATCH 4/7] re: Add support for `finditer` method. --- docs/library/re.rst | 11 ++++-- extmod/modre.c | 67 +++++++++++++++++++++++++++++++++++++ py/mpconfig.h | 4 +++ tests/extmod/re_finditer.py | 25 ++++++++++++++ 4 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 tests/extmod/re_finditer.py diff --git a/docs/library/re.rst b/docs/library/re.rst index b8aeefd90c..2d639c2e64 100644 --- a/docs/library/re.rst +++ b/docs/library/re.rst @@ -140,6 +140,12 @@ Functions Note: availability of this function depends on :term:`MicroPython port`. +.. function:: finditer(regex_str, string) + + Return an iterator yielding ``Match`` objects over all non-overlapping + matches for the RE *regex_str* in *string*. The string is scanned + left-to-right, and matches are returned in the order found. + .. data:: DEBUG Flag value, display debug information about compiled expression. @@ -156,10 +162,11 @@ Compiled regular expression. Instances of this class are created using .. method:: regex.match(string, [pos, [endpos]]) regex.search(string, [pos, [endpos]]) + regex.finditer(string, [pos, [endpos]]) regex.sub(replace, string, count=0, flags=0, /) - Similar to the module-level functions :meth:`match`, :meth:`search` - and :meth:`sub`. + Similar to the module-level functions :meth:`match`, :meth:`search`, + :meth:`finditer`, and :meth:`sub`. Using methods is (much) more efficient if the same regex is applied to multiple strings. diff --git a/extmod/modre.c b/extmod/modre.c index 3a203644a8..d2304a534b 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -422,11 +422,75 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub_helper); #endif +#if MICROPY_PY_RE_FINDITER + +typedef struct _mp_re_finditer_it_t { + mp_obj_base_t base; + mp_fun_1_t iternext; + mp_obj_t pattern; + mp_obj_t str; + mp_obj_t start; + mp_obj_t end; +} mp_re_finditer_it_t; + + +static mp_obj_t mp_re_finditer_it_iternext(mp_obj_t self_in) { + mp_re_finditer_it_t *self = MP_OBJ_TO_PTR(self_in); + + mp_obj_t args[4] = { + self->pattern, + self->str, + self->start, + self->end + }; + int n_args = (self->end == mp_const_none) ? 3 : 4; + + mp_obj_t obj_match = re_exec(false, n_args, args); + if (obj_match == mp_const_none) { + return MP_OBJ_STOP_ITERATION; + } + + mp_obj_match_t *match = MP_OBJ_TO_PTR(obj_match); + const char *begin = mp_obj_str_get_str(self->str); + self->start = MP_OBJ_NEW_SMALL_INT(match->caps[1] - begin); + return obj_match; +} + +static mp_obj_t re_finditer(size_t n_args, const mp_obj_t *args) { + mp_re_finditer_it_t *iter = mp_obj_malloc(mp_re_finditer_it_t, &mp_type_polymorph_iter); + iter->iternext = mp_re_finditer_it_iternext; + iter->str = args[1]; + iter->start = MP_OBJ_NEW_SMALL_INT(0); + iter->end = mp_const_none; + + if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) { + iter->pattern = args[0]; + if (n_args > 2) { + iter->start = args[2]; + if (n_args > 3) { + iter->end = args[3]; + } + } + } + else { + iter->pattern = mod_re_compile(1, args); + } + + return MP_OBJ_FROM_PTR(iter); +} + +MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_finditer_obj, 2, 4, re_finditer); + +#endif // MICROPY_PY_RE_FINDITER + #if !MICROPY_ENABLE_DYNRUNTIME static const mp_rom_map_elem_t re_locals_dict_table[] = { { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) }, { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) }, { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) }, + #if MICROPY_PY_RE_FINDITER + { MP_ROM_QSTR(MP_QSTR_finditer), MP_ROM_PTR(&re_finditer_obj) }, + #endif #if MICROPY_PY_RE_SUB { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) }, #endif @@ -477,6 +541,9 @@ static const mp_rom_map_elem_t mp_module_re_globals_table[] = { { MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) }, { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) }, { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) }, + #if MICROPY_PY_RE_FINDITER + { MP_ROM_QSTR(MP_QSTR_finditer), MP_ROM_PTR(&re_finditer_obj) }, + #endif #if MICROPY_PY_RE_SUB { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) }, #endif diff --git a/py/mpconfig.h b/py/mpconfig.h index d9cff930d1..4cd7d2f0ab 100644 --- a/py/mpconfig.h +++ b/py/mpconfig.h @@ -1652,6 +1652,10 @@ typedef double mp_float_t; #define MICROPY_PY_RE_MATCH_SPAN_START_END (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EVERYTHING) #endif +#ifndef MICROPY_PY_RE_FINDITER +#define MICROPY_PY_RE_FINDITER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES) +#endif + #ifndef MICROPY_PY_RE_SUB #define MICROPY_PY_RE_SUB (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES) #endif diff --git a/tests/extmod/re_finditer.py b/tests/extmod/re_finditer.py new file mode 100644 index 0000000000..6043a37e3f --- /dev/null +++ b/tests/extmod/re_finditer.py @@ -0,0 +1,25 @@ +try: + import re + from re import finditer +except ImportError: + print("SKIP") + raise SystemExit + +ms = re.finditer(r'f[a-z]*', 'which foot or hand fell fastest') +print(list(x.group(0) for x in ms)) + +p = re.compile(r'f[a-z]*') +ms = p.finditer('which foot or hand fell fastest') +print(list(x.group(0) for x in ms)) + +ms = p.finditer('which foot or hand fell fastest', 10) +print(list(x.group(0) for x in ms)) + +ms = p.finditer('which foot or hand fell fastest', 10, 21) +print(list(x.group(0) for x in ms)) + +ms = re.finditer(r'\s+', 'which foot or hand fell fastest') +print(list(x.group(0) for x in ms)) + +ms = re.finditer(r'zz', 'which foot or hand fell fastest') +print(list(x.group(0) for x in ms)) From 8c28d635385cfdc86f44ae8a5746b9edb85d9dd4 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Tue, 26 Mar 2024 22:08:12 -0500 Subject: [PATCH 5/7] re: Fixup linting of test files. --- tests/extmod/re_finditer.py | 14 +++++++------- tests/extmod/re_start_end_pos.py | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/extmod/re_finditer.py b/tests/extmod/re_finditer.py index 6043a37e3f..4794ea61a0 100644 --- a/tests/extmod/re_finditer.py +++ b/tests/extmod/re_finditer.py @@ -5,21 +5,21 @@ except ImportError: print("SKIP") raise SystemExit -ms = re.finditer(r'f[a-z]*', 'which foot or hand fell fastest') +ms = re.finditer(r"f[a-z]*", "which foot or hand fell fastest") print(list(x.group(0) for x in ms)) -p = re.compile(r'f[a-z]*') -ms = p.finditer('which foot or hand fell fastest') +p = re.compile(r"f[a-z]*") +ms = p.finditer("which foot or hand fell fastest") print(list(x.group(0) for x in ms)) -ms = p.finditer('which foot or hand fell fastest', 10) +ms = p.finditer("which foot or hand fell fastest", 10) print(list(x.group(0) for x in ms)) -ms = p.finditer('which foot or hand fell fastest', 10, 21) +ms = p.finditer("which foot or hand fell fastest", 10, 21) print(list(x.group(0) for x in ms)) -ms = re.finditer(r'\s+', 'which foot or hand fell fastest') +ms = re.finditer(r"\s+", "which foot or hand fell fastest") print(list(x.group(0) for x in ms)) -ms = re.finditer(r'zz', 'which foot or hand fell fastest') +ms = re.finditer(r"zz", "which foot or hand fell fastest") print(list(x.group(0) for x in ms)) diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py index f8405b7852..5330d1b54e 100644 --- a/tests/extmod/re_start_end_pos.py +++ b/tests/extmod/re_start_end_pos.py @@ -18,6 +18,7 @@ def print_groups(match): except IndexError: pass + p = re.compile(r"o") m = p.match("dog") print_groups(m) @@ -45,15 +46,14 @@ m = p.match("dog", 1, 1) print_groups(m) # Search also works -print('--search') +print("--search") p = re.compile(r"o") -m = p.search('dog') +m = p.search("dog") print_groups(m) -m = p.search('dog', 1) +m = p.search("dog", 1) print_groups(m) -m = p.search('dog', 2) +m = p.search("dog", 2) print_groups(m) - From f72327165f391e23b4030c9780733fd69b1507a3 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Thu, 28 Mar 2024 09:16:50 -0500 Subject: [PATCH 6/7] re: Properly handle negative start/end values. --- extmod/modre.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/extmod/modre.c b/extmod/modre.c index d2304a534b..3c7e53c956 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -210,15 +210,18 @@ static mp_obj_t re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) { if (was_compiled && n_args > 2) { // Arg #2 is starting-pos - size_t startpos = mp_obj_get_int(args[2]); - if (startpos >= len) { + mp_int_t startpos = mp_obj_get_int(args[2]); + if (startpos > (mp_int_t) len) { startpos = len; } + else if (startpos < 0) { + startpos = 0; + } subj.begin += startpos; if (n_args > 3) { // Arg #3 is ending-pos - size_t endpos = mp_obj_get_int(args[3]); - if (endpos > len) { + mp_int_t endpos = mp_obj_get_int(args[3]); + if (endpos > (mp_int_t) len) { endpos = len; } else if (endpos < startpos) { From 5ad16c367883e1050140dcc43a88313f772612c2 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Thu, 28 Mar 2024 09:17:18 -0500 Subject: [PATCH 7/7] re: Add test cases for end-before-start and negative values. --- tests/extmod/re_start_end_pos.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py index 5330d1b54e..fb22749666 100644 --- a/tests/extmod/re_start_end_pos.py +++ b/tests/extmod/re_start_end_pos.py @@ -29,7 +29,8 @@ print_groups(m) m = p.match("dog", 2) print_groups(m) -m = p.match("dog", 3) # Past end of input +# No match past end of input +m = p.match("dog", 5) print_groups(m) m = p.match("dog", 0, 1) @@ -40,11 +41,22 @@ p = re.compile(r"^o") m = p.match("dog", 1) print_groups(m) -# End at begging means searching empty string +# End at beginning means searching empty string p = re.compile(r"o") m = p.match("dog", 1, 1) print_groups(m) +# End before the beginning doesn't match anything +m = p.match("dog", 2, 1) +print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) + # Search also works print("--search") @@ -57,3 +69,10 @@ print_groups(m) m = p.search("dog", 2) print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) \ No newline at end of file