[Python-checkins] CVS: python/dist/src/Modules _sre.c,2.68,2.69

Fredrik Lundh effbot@users.sourceforge.net
Sun, 21 Oct 2001 09:47:59 -0700


Update of /cvsroot/python/python/dist/src/Modules
In directory usw-pr-cvs1:/tmp/cvs-serv14492/Modules

Modified Files:
	_sre.c 
Log Message:


rewrote the pattern.sub and pattern.subn methods in C

removed (conceptually flawed) getliteral helper; the new sub/subn code
uses a faster code path for literal replacement strings, but doesn't
(yet) look for literal patterns.

added STATE_OFFSET macro, and used it to convert state.start/ptr to
character indexes


Index: _sre.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/_sre.c,v
retrieving revision 2.68
retrieving revision 2.69
diff -C2 -d -r2.68 -r2.69
*** _sre.c	2001/10/20 17:48:46	2.68
--- _sre.c	2001/10/21 16:47:57	2.69
***************
*** 32,38 ****
   * 2001-05-14 fl  fixes for 1.5.2
   * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
-  * 2001-09-18 fl  added _getliteral helper
   * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
   * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
   *
   * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
--- 32,38 ----
   * 2001-05-14 fl  fixes for 1.5.2
   * 2001-07-01 fl  added BIGCHARSET support (from Martin von Loewis)
   * 2001-10-18 fl  fixed group reset issue (from Matthew Mueller)
   * 2001-10-20 fl  added split primitive; reenable unicode for 1.6/2.0/2.1
+  * 2001-10-21 fl  added sub/subn primitive
   *
   * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
***************
*** 50,54 ****
  
  static char copyright[] =
!     " SRE 2.2.0 Copyright (c) 1997-2001 by Secret Labs AB ";
  
  #include "Python.h"
--- 50,54 ----
  
  static char copyright[] =
!     " SRE 2.2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
  
  #include "Python.h"
***************
*** 77,82 ****
  /* optional features */
  
! /* test: define to use sre._split helper instead of C code */
  #undef USE_PYTHON_SPLIT
  
  /* prevent run-away recursion (bad patterns on long strings) */
--- 77,83 ----
  /* optional features */
  
! /* test: define to use sre.py helpers instead of C code */
  #undef USE_PYTHON_SPLIT
+ #undef USE_PYTHON_SUB
  
  /* prevent run-away recursion (bad patterns on long strings) */
***************
*** 1494,1497 ****
--- 1495,1502 ----
  }
  
+ /* offset of member from (state)->beginning: byte distance divided by (state)->charsize */
+ #define STATE_OFFSET(state, member)\
+     (((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
+ 
  LOCAL(PyObject*)
  state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
***************
*** 1510,1517 ****
          }
      } else {
!         i = ((char*)state->mark[index] - (char*)state->beginning) /
!             state->charsize;
!         j = ((char*)state->mark[index+1] - (char*)state->beginning) /
!             state->charsize;
      }
  
--- 1515,1520 ----
          }
      } else {
!         i = STATE_OFFSET(state, state->mark[index]);
!         j = STATE_OFFSET(state, state->mark[index+1]);
      }
  
***************
*** 1722,1725 ****
--- 1725,1730 ----
      PyObject* result;
  
+     if (!args)
+         return NULL;
      name = PyString_FromString(module);
      if (!name)
***************
*** 1760,1763 ****
--- 1765,1822 ----
  
  static PyObject*
+ join(PyObject* list, PyObject* pattern)
+ {
+     /* join list elements into one object; list is released in the 0-item, 1-item and 1.6+ join paths */
+ 
+     PyObject* joiner;
+ #if PY_VERSION_HEX >= 0x01060000
+     PyObject* function;
+     PyObject* args;
+ #endif
+     PyObject* result;
+ 
+     switch (PyList_GET_SIZE(list)) {
+     case 0:
+         Py_DECREF(list);
+         return PyString_FromString(""); /* NOTE(review): plain string even if pattern is not one */
+     case 1:
+         result = PyList_GET_ITEM(list, 0);
+         Py_INCREF(result); /* keep the single item alive once the list is released */
+         Py_DECREF(list);
+         return result;
+     }
+ 
+     /* two or more elements: take the empty slice pattern[0:0] as the
+        separator, and use that to join the entire list */
+ 
+     joiner = PySequence_GetSlice(pattern, 0, 0);
+     if (!joiner)
+         return NULL; /* NOTE(review): list is not released on this path */
+ 
+ #if PY_VERSION_HEX >= 0x01060000
+     function = PyObject_GetAttrString(joiner, "join");
+     if (!function) {
+         Py_DECREF(joiner);
+         return NULL; /* NOTE(review): list is not released on this path */
+     }
+     args = PyTuple_New(1); /* NOTE(review): result is not checked for NULL */
+     PyTuple_SET_ITEM(args, 0, list); /* the tuple now holds the list reference */
+     result = PyObject_CallObject(function, args);
+     Py_DECREF(args); /* also removes list */
+     Py_DECREF(function);
+ #else
+     result = call(
+         "string", "join",
+         Py_BuildValue("OO", list, joiner)
+         );
+ #endif
+     Py_DECREF(joiner);
+ 
+     return result;
+ }
+ 
+ 
+ #ifdef USE_PYTHON_SUB
+ static PyObject*
  pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
  {
***************
*** 1776,1780 ****
--- 1835,1841 ----
          );
  }
+ #endif
  
+ #ifdef USE_PYTHON_SUB
  static PyObject*
  pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
***************
*** 1794,1797 ****
--- 1855,1859 ----
          );
  }
+ #endif
  
  #if defined(USE_PYTHON_SPLIT)
***************
*** 1852,1908 ****
          }
  
!         if (status > 0) {
! 
!             /* don't bother to build a match object */
!             switch (self->groups) {
!             case 0:
!                 b = ((char*) state.start - (char*) state.beginning) /
!                     state.charsize;
!                 e = ((char*) state.ptr - (char*) state.beginning) /
!                     state.charsize;
!                 item = PySequence_GetSlice(string, b, e);
!                 if (!item)
!                     goto error;
!                 break;
!             case 1:
!                 item = state_getslice(&state, 1, string, 1);
!                 if (!item)
!                     goto error;
                  break;
!             default:
!                 item = PyTuple_New(self->groups);
!                 if (!item)
                      goto error;
-                 for (i = 0; i < self->groups; i++) {
-                     PyObject* o = state_getslice(&state, i+1, string, 1);
-                     if (!o) {
-                         Py_DECREF(item);
-                         goto error;
-                     }
-                     PyTuple_SET_ITEM(item, i, o);
                  }
!                 break;
              }
! 
!             status = PyList_Append(list, item);
!             Py_DECREF(item);
! 
!             if (status < 0)
!                 goto error;
! 
!             if (state.ptr == state.start)
!                 state.start = (void*) ((char*) state.ptr + state.charsize);
!             else
!                 state.start = state.ptr;
! 
!         } else {
! 
!             if (status == 0)
!                 break;
  
!             pattern_error(status);
              goto error;
  
!         }
      }
  
--- 1914,1963 ----
          }
  
!         if (status <= 0) {
!             if (status == 0)
                  break;
!             pattern_error(status);
!             goto error;
!         }
!         
!         /* don't bother to build a match object */
!         switch (self->groups) {
!         case 0:
!             b = STATE_OFFSET(&state, state.start);
!             e = STATE_OFFSET(&state, state.ptr);
!             item = PySequence_GetSlice(string, b, e);
!             if (!item)
!                 goto error;
!             break;
!         case 1:
!             item = state_getslice(&state, 1, string, 1);
!             if (!item)
!                 goto error;
!             break;
!         default:
!             item = PyTuple_New(self->groups);
!             if (!item)
!                 goto error;
!             for (i = 0; i < self->groups; i++) {
!                 PyObject* o = state_getslice(&state, i+1, string, 1);
!                 if (!o) {
!                     Py_DECREF(item);
                      goto error;
                  }
!                 PyTuple_SET_ITEM(item, i, o);
              }
!             break;
!         }
  
!         status = PyList_Append(list, item);
!         Py_DECREF(item);
!         if (status < 0)
              goto error;
  
!         if (state.ptr == state.start)
!             state.start = (void*) ((char*) state.ptr + state.charsize);
!         else
!             state.start = state.ptr;
! 
      }
  
***************
*** 1926,1931 ****
      int status;
      int n;
!     int i, b, e;
!     int g;
  
      PyObject* string;
--- 1981,1986 ----
      int status;
      int n;
!     int i;
!     void* last;
  
      PyObject* string;
***************
*** 1942,1948 ****
      list = PyList_New(0);
  
!     i = n = 0;
  
!     while (maxsplit == 0 || n < maxsplit) {
  
          state_reset(&state);
--- 1997,2004 ----
      list = PyList_New(0);
  
!     n = 0;
!     last = state.start;
  
!     while (!maxsplit || n < maxsplit) {
  
          state_reset(&state);
***************
*** 1957,1978 ****
  #endif
          }
- 
-         if (status > 0) {
  
!             if (state.start == state.ptr) {
!                 if (i >= state.endpos)
!                     break;
!                 /* skip one character */
!                 state.start = (void*) ((char*) state.ptr + state.charsize);
!                 continue;
!             }
  
!             b = ((char*) state.start - (char*) state.beginning) /
!                 state.charsize;
!             e = ((char*) state.ptr - (char*) state.beginning) /
!                 state.charsize;
  
!             /* get segment before this match */
!             item = PySequence_GetSlice(string, i, b);
              if (!item)
                  goto error;
--- 2013,2047 ----
  #endif
          }
  
!         if (status <= 0) {
!             if (status == 0)
!                 break;
!             pattern_error(status);
!             goto error;
!         }
!         
!         if (state.start == state.ptr) {
!             if (last == state.end)
!                 break;
!             /* skip one character */
!             state.start = (void*) ((char*) state.ptr + state.charsize);
!             continue;
!         }
  
!         /* get segment before this match */
!         item = PySequence_GetSlice(
!             string, STATE_OFFSET(&state, last),
!             STATE_OFFSET(&state, state.start)
!             );
!         if (!item)
!             goto error;
!         status = PyList_Append(list, item);
!         Py_DECREF(item);
!         if (status < 0)
!             goto error;
  
!         /* add groups (if any) */
!         for (i = 0; i < self->groups; i++) {
!             item = state_getslice(&state, i+1, string, 0);
              if (!item)
                  goto error;
***************
*** 1981,2009 ****
              if (status < 0)
                  goto error;
  
!             for (g = 0; g < self->groups; g++) {
!                 item = state_getslice(&state, g+1, string, 0);
!                 if (!item)
!                     goto error;
!                 status = PyList_Append(list, item);
!                 Py_DECREF(item);
!                 if (status < 0)
!                     goto error;
!             }
  
!             i = e;
!             n = n + 1;
  
!             state.start = state.ptr;
  
          } else {
  
              if (status == 0)
                  break;
- 
              pattern_error(status);
              goto error;
  
          }
      }
  
--- 2050,2192 ----
              if (status < 0)
                  goto error;
+         }
  
!         n = n + 1;
  
!         last = state.start = state.ptr;
  
!     }
  
+     /* get segment following last match */
+     item = PySequence_GetSlice(
+         string, STATE_OFFSET(&state, last), state.endpos
+         );
+     if (!item)
+         goto error;
+     status = PyList_Append(list, item);
+     Py_DECREF(item);
+     if (status < 0)
+         goto error;
+ 
+     state_fini(&state);
+     return list;
+ 
+ error:
+     Py_DECREF(list);
+     state_fini(&state);
+     return NULL;
+     
+ }
+ #endif
+ 
+ #if !defined(USE_PYTHON_SUB)
+ static PyObject*
+ pattern_subx(PatternObject* self, PyObject* template, PyObject* string,
+              int count, int subn)
+ {
+     SRE_STATE state;
+     PyObject* list;
+     PyObject* item;
+     PyObject* filter;
+     PyObject* args;
+     PyObject* match;
+     int status;
+     int n;
+     int i, b, e;
+     int filter_is_callable;
+ 
+     /* call subx helper to get the filter */
+     filter = call(
+         SRE_MODULE, "_subx",
+         Py_BuildValue("OO", self, template)
+         );
+     if (!filter)
+         return NULL;
+ 
+     filter_is_callable = PyCallable_Check(filter);
+ 
+     string = state_init(&state, self, string, 0, INT_MAX);
+     if (!string)
+         return NULL;
+ 
+     list = PyList_New(0);
+ 
+     n = i = 0;
+ 
+     while (!count || n < count) {
+ 
+         state_reset(&state);
+ 
+         state.ptr = state.start;
+ 
+         if (state.charsize == 1) {
+             status = sre_search(&state, PatternObject_GetCode(self));
          } else {
+ #if defined(HAVE_UNICODE)
+             status = sre_usearch(&state, PatternObject_GetCode(self));
+ #endif
+         }
  
+         if (status <= 0) {
              if (status == 0)
                  break;
              pattern_error(status);
              goto error;
+         }
+         
+         b = STATE_OFFSET(&state, state.start);
+         e = STATE_OFFSET(&state, state.ptr);
  
+         if (i < b) {
+             /* get segment before this match */
+             item = PySequence_GetSlice(string, i, b);
+             if (!item)
+                 goto error;
+             status = PyList_Append(list, item);
+             Py_DECREF(item);
+             if (status < 0)
+                 goto error;
+ 
+         } else if (i == b && i == e && n > 0)
+             /* ignore empty match on latest position */
+             goto next;
+ 
+         if (filter_is_callable) {
+             /* filter match */
+             match = pattern_new_match(self, &state, 1);
+             if (!match)
+                 goto error;
+             args = Py_BuildValue("(O)", match);
+             if (!args) {
+                 Py_DECREF(args);
+                 goto error;
+             }
+             item = PyObject_CallObject(filter, args);
+             Py_DECREF(args);
+             Py_DECREF(match);
+             if (!item)
+                 goto error;
+         } else {
+             /* filter is literal string */
+             item = filter;
+             Py_INCREF(filter);
          }
+ 
+         /* add to list */
+         status = PyList_Append(list, item);
+         Py_DECREF(item);
+         if (status < 0)
+             goto error;
+         
+         i = e;
+         n = n + 1;
+ 
+ next:
+         /* move on */
+         if (state.ptr == state.start)
+             state.start = (void*) ((char*) state.ptr + state.charsize);
+         else
+             state.start = state.ptr;
+ 
      }
  
***************
*** 2018,2023 ****
  
      state_fini(&state);
-     return list;
  
  error:
      Py_DECREF(list);
--- 2201,2215 ----
  
      state_fini(&state);
  
+     /* convert list to single string */
+     item = join(list, self->pattern);
+     if (!item)
+         return NULL;
+ 
+     if (subn)
+         return Py_BuildValue("Ni", item, n);
+ 
+     return item;
+ 
  error:
      Py_DECREF(list);
***************
*** 2026,2029 ****
--- 2218,2249 ----
      
  }
+ 
+ static PyObject*
+ pattern_sub(PatternObject* self, PyObject* args, PyObject* kw)
+ { /* sub(repl, string, count=0): parse the arguments and hand them to pattern_subx */
+     PyObject* template;
+     PyObject* string;
+     int count = 0; /* stays 0 when omitted; pattern_subx loops without a limit when count is 0 */
+     static char* kwlist[] = { "repl", "string", "count", NULL };
+     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:sub", kwlist,
+                                      &template, &string, &count))
+         return NULL; /* argument parsing failed */
+ 
+     return pattern_subx(self, template, string, count, 0); /* subn=0: return the joined result alone */
+ }
+ 
+ static PyObject*
+ pattern_subn(PatternObject* self, PyObject* args, PyObject* kw)
+ { /* subn(repl, string, count=0): parse the arguments and hand them to pattern_subx */
+     PyObject* template;
+     PyObject* string;
+     int count = 0; /* stays 0 when omitted; pattern_subx loops without a limit when count is 0 */
+     static char* kwlist[] = { "repl", "string", "count", NULL };
+     if (!PyArg_ParseTupleAndKeywords(args, kw, "OO|i:subn", kwlist,
+                                      &template, &string, &count))
+         return NULL; /* argument parsing failed */
+ 
+     return pattern_subx(self, template, string, count, 1); /* subn=1: pattern_subx builds its result with "Ni" (item, n) */
+ }
  #endif
  
***************
*** 2085,2114 ****
  }
  
- static PyObject*
- pattern_getliteral(PatternObject* self, PyObject* args)
- {
-     /* internal: if the pattern is a literal string, return that
-        string.  otherwise, return None */
- 
-     SRE_CODE* code;
-     PyObject* literal;
- 
-     if (!PyArg_ParseTuple(args, ":_getliteral"))
-         return NULL;
- 
-     code = PatternObject_GetCode(self);
- 
-     if (code[0] == SRE_OP_INFO && code[2] & SRE_INFO_LITERAL) {
-         /* FIXME: extract literal string from code buffer.  we can't
-            use the pattern member, since it may contain untranslated
-            escape codes (see SF bug 449000) */
-         literal = Py_None;
-     } else
-         literal = Py_None; /* no literal */
- 
-     Py_INCREF(literal);
-     return literal;
- }
- 
  static PyMethodDef pattern_methods[] = {
      {"match", (PyCFunction) pattern_match, METH_VARARGS|METH_KEYWORDS},
--- 2305,2308 ----
***************
*** 2121,2125 ****
      {"__copy__", (PyCFunction) pattern_copy, METH_VARARGS},
      {"__deepcopy__", (PyCFunction) pattern_deepcopy, METH_VARARGS},
-     {"_getliteral", (PyCFunction) pattern_getliteral, METH_VARARGS},
      {NULL, NULL}
  };
--- 2315,2318 ----