Good point, Andrew!

I missed including the unit tests I wrote. I will now, at the bottom.

For reference, this is the implementation of findall():

static PyObject *
_sre_SRE_Pattern_findall_impl(PatternObject *self, PyObject *string,
                              Py_ssize_t pos, Py_ssize_t endpos)
/*[clinic end generated code: output=f4966baceea60aca input=5b6a4ee799741563]*/
{
    SRE_STATE state;
    PyObject* list;
    Py_ssize_t status;
    Py_ssize_t i, b, e;

    if (!state_init(&state, self, string, pos, endpos))
        return NULL;

    list = PyList_New(0);
    if (!list) {
        state_fini(&state);
        return NULL;
    }

    while (state.start <= state.end) {

        PyObject* item;

        state_reset(&state);

        state.ptr = state.start;

        status = sre_search(&state, PatternObject_GetCode(self));
        if (PyErr_Occurred())
            goto error;

        if (status <= 0) {
            if (status == 0)
                break;
            pattern_error(status);
            goto error;
        }

        /* don't bother to build a match object */
        switch (self->groups) {
        case 0:
            b = STATE_OFFSET(&state, state.start);
            e = STATE_OFFSET(&state, state.ptr);
            item = getslice(state.isbytes, state.beginning,
                            string, b, e);
            if (!item)
                goto error;
            break;
        case 1:
            item = state_getslice(&state, 1, string, 1);
            if (!item)
                goto error;
            break;
        default:
            item = PyTuple_New(self->groups);
            if (!item)
                goto error;
            for (i = 0; i < self->groups; i++) {
                PyObject* o = state_getslice(&state, i+1, string, 1);
                if (!o) {
                    Py_DECREF(item);
                    goto error;
                }
                PyTuple_SET_ITEM(item, i, o);
            }
            break;
        }

        status = PyList_Append(list, item);
        Py_DECREF(item);
        if (status < 0)
            goto error;

        state.must_advance = (state.ptr == state.start);
        state.start = state.ptr;
    }

    state_fini(&state);
    return list;

error:
    Py_DECREF(list);
    state_fini(&state);
    return NULL;

}

As far as I understand it, my implementation of findalliter() matches the semantics in the switch statement.

This is my unit test. As you noted, it's not complete as it lacks a case for unmatched groups.

def test_findfirst():
    def check(pattern, string):
        try:
            g = re.findall(pattern, string)
            assert findfirst(pattern, string) == g[0], '%s %s' % (pattern, string)
            assert list(findalliter(pattern, string)) == g, '%s %s' % (pattern, string)
        except ValueError:
            assert len(g) == 0
            with pytest.raises(ValueError):
                findfirst(pattern, string)

    s = 'xxxaxxxxbxxx'
    check('a', s)
    check('(a)', s)
    check('(a|b)', s)
    check('a.*b', s)
    check('(a).*(b)', s)

    # no match
    check('y', s)

    assert findfirst('y', s, default='THE DEFAULT') == 'THE DEFAULT'

    base = s
    for i in range(2, 10):
        s = base * i
        g = re.findall('a', s)
        assert len(g) > 1

        check('a', s)
        check('(a)', s)
        check('(a|b)', s)
        check('a.*b', s)
        check('(a).*(b)', s)

        check('y', s)

You're right about findall() always returning srings:

[ins] In [1]: import re                                                                                                         

[ins] In [2]: re.findall(r'(a).*(b)?.*(c)', 'axxxxxxxxxxxxc')                                                                   
Out[2]: [('a', '', 'c')]

[ins] In [3]:  


This is the matching implementation:

    for m in re.finditer(pattern, string, flags=flags):
        g = m.groups()
        if len(g) == 1:
            yield g[0]
        elif g:
            yield tuple(s if s else '' for s in g)
        else:
            yield m.group()

Updated unit test:

def test_findfirst():
    def check(pattern, string):
        try:
            g = re.findall(pattern, string)
            assert findfirst(pattern, string) == g[0], '%s %s' % (pattern, string)
            assert list(findalliter(pattern, string)) == g, '%s %s' % (pattern, string)
        except ValueError:
            assert len(g) == 0
            with pytest.raises(ValueError):
                findfirst(pattern, string)


    def all_checks(s):
        check('a', s)
        check('(a)', s)
        check('(a|b)', s)
        check('a.*b', s)
        check('(a).*(b)', s)
        check('(a).*(c)?.*(b)', s)  # non-matching group

        # no match
        check('y', s)
        check('(y)?.*(z)?.*(q)', s)

        # empty matches
        check('(a).*(y?).*(b)', s)

        assert findfirst('y', s, default='THE DEFAULT') == 'THE DEFAULT'

    s = 'xxxaxxxxbxxx'
    all_checks(s)

    base = s
    for i in range(2, 10):
        s = base * i
        g = re.findall('a', s)
        assert len(g) == i

        all_checks(s)


Cheers,


On Fri, Dec 27, 2019 at 5:37 PM Andrew Barnert <abarnert@yahoo.com> wrote:
> On Dec 27, 2019, at 09:00, Juancarlo Añez <apalala@gmail.com> wrote:
>
>    for m in re.finditer(pattern, string, flags=flags):
>        g = m.groups()
>        if len(g) == 1:
>            yield g[0]
>        elif g:
>            yield g
>        else:
>            yield m.group()

I don’t think this does the same thing as findall in every case. For example, for capture groups that don’t participate in the match, you’ll get tuples like ('spam', None, '42'), when I’m pretty sure findall always has strings no matter what. I’m not sure exactly what the rule is for how it does that: maybe it’s just the same thing as m.groups(default='') would give you?

At any rate, it seems like this isn’t as trivial to port from C as it looked, so this needs solid unit tests. (Maybe the ones for findall are already good enough if you just adapt them?)

That’s also a great argument that it should be added to the re module, so people don’t have to try to figure out how to port C code to Python and then test the hell out of it just to get something that’s only missing in the first place for historical/naming reasons.




--
Juancarlo Añez