As far as I understand it, my implementation of findalliter() matches the semantics in the switch statement.static PyObject *_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,Py_ssize_t pos, Py_ssize_t endpos)/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/{SRE_STATE state;PyObject* list;Py_ssize_t status;Py_ssize_t i, b, e;if (!state_init(&state, self, string, pos, endpos))return NULL;list = PyList_New(0);if (!list) {state_fini(&state);return NULL;}while (state.start <= state.end) {PyObject* item;state_reset(&state);state.ptr = state.start;status = sre_search(&state, PatternObject_GetCode(self));if (PyErr_Occurred())goto error;if (status <= 0) {if (status == 0)break;pattern_error(status);goto error;}/* don't bother to build a match object */switch (self->groups) {case 0:b = STATE_OFFSET(&state, state.start);e = STATE_OFFSET(&state, state.ptr);item = getslice(state.isbytes, state.beginning,string, b, e);if (!item)goto error;break;case 1:item = state_getslice(&state, 1, string, 1);if (!item)goto error;break;default:item = PyTuple_New(self->groups);if (!item)goto error;for (i = 0; i < self->groups; i++) {PyObject* o = state_getslice(&state, i+1, string, 1);if (!o) {Py_DECREF(item);goto error;}PyTuple_SET_ITEM(item, i, o);}break;}status = PyList_Append(list, item);Py_DECREF(item);if (status < 0)goto error;state.must_advance = (state.ptr == state.start);state.start = state.ptr;}state_fini(&state);return list;error:Py_DECREF(list);state_fini(&state);return NULL;}
def test_findfirst():def check(pattern, string):try:g = re.findall(pattern, string)assert findfirst(pattern, string) == g[0], '%s %s' % (pattern, string)assert list(findalliter(pattern, string)) == g, '%s %s' % (pattern, string)except ValueError:assert len(g) == 0with pytest.raises(ValueError):findfirst(pattern, string)s = 'xxxaxxxxbxxx'check('a', s)check('(a)', s)check('(a|b)', s)check('a.*b', s)check('(a).*(b)', s)# no matchcheck('y', s)assert findfirst('y', s, default='THE DEFAULT') == 'THE DEFAULT'base = sfor i in range(2, 10):s = base * ig = re.findall('a', s)assert len(g) > 1check('a', s)check('(a)', s)check('(a|b)', s)check('a.*b', s)check('(a).*(b)', s)check('y', s)
[ins] In [1]: import re[ins] In [2]: re.findall(r'(a).*(b)?.*(c)', 'axxxxxxxxxxxxc')Out[2]: [('a', '', 'c')][ins] In [3]:
for m in re.finditer(pattern, string, flags=flags):g = m.groups()if len(g) == 1:yield g[0]elif g:yield tuple(s if s else '' for s in g)else:yield m.group()
def test_findfirst():def check(pattern, string):try:g = re.findall(pattern, string)assert findfirst(pattern, string) == g[0], '%s %s' % (pattern, string)assert list(findalliter(pattern, string)) == g, '%s %s' % (pattern, string)except ValueError:assert len(g) == 0with pytest.raises(ValueError):findfirst(pattern, string)def all_checks(s):check('a', s)check('(a)', s)check('(a|b)', s)check('a.*b', s)check('(a).*(b)', s)check('(a).*(c)?.*(b)', s) # non-matching group# no matchcheck('y', s)check('(y)?.*(z)?.*(q)', s)# empty matchescheck('(a).*(y?).*(b)', s)assert findfirst('y', s, default='THE DEFAULT') == 'THE DEFAULT's = 'xxxaxxxxbxxx'all_checks(s)base = sfor i in range(2, 10):s = base * ig = re.findall('a', s)assert len(g) == iall_checks(s)
> On Dec 27, 2019, at 09:00, Juancarlo Añez <apalala@gmail.com> wrote:
>
> for m in re.finditer(pattern, string, flags=flags):
> g = m.groups()
> if len(g) == 1:
> yield g[0]
> elif g:
> yield g
> else:
> yield m.group()
I don’t think this does the same thing as findall in every case. For example, for capture groups that don’t participate in the match, you’ll get tuples like ('spam', None, '42'), when I’m pretty sure findall always has strings no matter what. I’m not sure exactly what the rule is for how it does that: maybe it’s just the same thing as m.groups(default='') would give you?
At any rate, it seems like this isn’t as trivial to port from C as it looked, so this needs solid unit tests. (Maybe the ones for findall are already good enough if you just adapt them?)
That’s also a great argument that it should be added to the re module, so people don’t have to try to figure out how to port C code to Python and then test the hell out of it just to get something that’s only missing in the first place for historical/naming reasons.