[Python-checkins] r46132 - python/trunk/Objects/unicodeobject.c
fredrik.lundh
python-checkins at python.org
Tue May 23 20:44:26 CEST 2006
Author: fredrik.lundh
Date: Tue May 23 20:44:25 2006
New Revision: 46132
Modified:
python/trunk/Objects/unicodeobject.c
Log:
needforspeed: use append+reverse for rsplit, use "bloom filters" to
speed up splitlines and strip with charsets; etc. rsplit is now as
fast as split in all our tests (reverse takes no time at all), and
splitlines() is nearly as fast as a plain split("\n") in our tests.
and we're not done yet... ;-)
Modified: python/trunk/Objects/unicodeobject.c
==============================================================================
--- python/trunk/Objects/unicodeobject.c (original)
+++ python/trunk/Objects/unicodeobject.c Tue May 23 20:44:25 2006
@@ -46,6 +46,18 @@
#include <windows.h>
#endif
+#undef USE_INLINE /* XXX - set via configure? */
+
+#if defined(_MSC_VER) /* this is taken from _sre.c */
+#pragma warning(disable: 4710)
+/* fastest possible local call under MSVC */
+#define LOCAL(type) static __inline type __fastcall
+#elif defined(USE_INLINE)
+#define LOCAL(type) static inline type
+#else
+#define LOCAL(type) static type
+#endif
+
/* Limit for the Unicode object free list */
#define MAX_UNICODE_FREELIST_SIZE 1024
@@ -121,6 +133,51 @@
#endif
}
+/* --- Bloom Filters ----------------------------------------------------- */
+
+/* stuff to implement simple "bloom filters" for Unicode characters.
+ to keep things simple, we use a single bitmask, using the least 5
+ bits from each unicode characters as the bit index. */
+
+/* the linebreak mask is set up by Unicode_Init below */
+
+#define BLOOM_MASK unsigned long
+
+static BLOOM_MASK bloom_linebreak;
+
+#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
+
+#define BLOOM_LINEBREAK(ch)\
+ (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK((ch)))
+
+LOCAL(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
+{
+ /* calculate simple bloom-style bitmask for a given unicode string */
+
+ long mask;
+ Py_ssize_t i;
+
+ mask = 0;
+ for (i = 0; i < len; i++)
+ mask |= (1 << (ptr[i] & 0x1F));
+
+ return mask;
+}
+
+LOCAL(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
+{
+ Py_ssize_t i;
+
+ for (i = 0; i < setlen; i++)
+ if (set[i] == chr)
+ return 1;
+
+ return -1;
+}
+
+#define BLOOM_MEMBER(mask, chr, set, setlen)\
+ BLOOM(mask, chr) && unicode_member(chr, set, setlen)
+
/* --- Unicode Object ----------------------------------------------------- */
static
@@ -3791,8 +3848,7 @@
/* --- Helpers ------------------------------------------------------------ */
-static
-Py_ssize_t count(PyUnicodeObject *self,
+static Py_ssize_t count(PyUnicodeObject *self,
Py_ssize_t start,
Py_ssize_t end,
PyUnicodeObject *substring)
@@ -3850,8 +3906,7 @@
return result;
}
-static
-Py_ssize_t findstring(PyUnicodeObject *self,
+static Py_ssize_t findstring(PyUnicodeObject *self,
PyUnicodeObject *substring,
Py_ssize_t start,
Py_ssize_t end,
@@ -4332,17 +4387,6 @@
else \
Py_DECREF(str);
-#define SPLIT_INSERT(data, left, right) \
- str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
- if (!str) \
- goto onError; \
- if (PyList_Insert(list, 0, str)) { \
- Py_DECREF(str); \
- goto onError; \
- } \
- else \
- Py_DECREF(str);
-
static
PyObject *split_whitespace(PyUnicodeObject *self,
PyObject *list,
@@ -4403,7 +4447,7 @@
Py_ssize_t eol;
/* Find a line and append it */
- while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
+ while (i < len && !BLOOM_LINEBREAK(data[i]))
i++;
/* Skip the line break reading CRLF as one line break */
@@ -4514,15 +4558,17 @@
if (j > i) {
if (maxcount-- <= 0)
break;
- SPLIT_INSERT(self->str, i + 1, j + 1);
+ SPLIT_APPEND(self->str, i + 1, j + 1);
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
i--;
j = i;
}
}
if (j >= 0) {
- SPLIT_INSERT(self->str, 0, j + 1);
+ SPLIT_APPEND(self->str, 0, j + 1);
}
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
onError:
@@ -4545,14 +4591,16 @@
if (self->str[i] == ch) {
if (maxcount-- <= 0)
break;
- SPLIT_INSERT(self->str, i + 1, j + 1);
+ SPLIT_APPEND(self->str, i + 1, j + 1);
j = i = i - 1;
} else
i--;
}
if (j >= -1) {
- SPLIT_INSERT(self->str, 0, j + 1);
+ SPLIT_APPEND(self->str, 0, j + 1);
}
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
onError:
@@ -4576,15 +4624,17 @@
if (Py_UNICODE_MATCH(self, i, substring)) {
if (maxcount-- <= 0)
break;
- SPLIT_INSERT(self->str, i + sublen, j);
+ SPLIT_APPEND(self->str, i + sublen, j);
j = i;
i -= sublen;
} else
i--;
}
if (j >= 0) {
- SPLIT_INSERT(self->str, 0, j);
+ SPLIT_APPEND(self->str, 0, j);
}
+ if (PyList_Reverse(list) < 0)
+ goto onError;
return list;
onError:
@@ -4593,7 +4643,6 @@
}
#undef SPLIT_APPEND
-#undef SPLIT_INSERT
static
PyObject *split(PyUnicodeObject *self,
@@ -5703,16 +5752,6 @@
#define STRIPNAME(i) (stripformat[i]+3)
-static const Py_UNICODE *
-unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
-{
- size_t i;
- for (i = 0; i < n; ++i)
- if (s[i] == c)
- return s+i;
- return NULL;
-}
-
/* externally visible for str.strip(unicode) */
PyObject *
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
@@ -5723,27 +5762,29 @@
Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
Py_ssize_t i, j;
+ BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
+
i = 0;
if (striptype != RIGHTSTRIP) {
- while (i < len && unicode_memchr(sep, s[i], seplen)) {
- i++;
- }
+ while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
+ i++;
+ }
}
j = len;
if (striptype != LEFTSTRIP) {
- do {
- j--;
- } while (j >= i && unicode_memchr(sep, s[j], seplen));
- j++;
+ do {
+ j--;
+ } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
+ j++;
}
if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
- Py_INCREF(self);
- return (PyObject*)self;
+ Py_INCREF(self);
+ return (PyObject*)self;
}
else
- return PyUnicode_FromUnicode(s+i, j-i);
+ return PyUnicode_FromUnicode(s+i, j-i);
}
@@ -7387,6 +7428,18 @@
{
int i;
+ /* XXX - move this array to unicodectype.c ? */
+ Py_UNICODE linebreak[] = {
+ 0x000A, /* LINE FEED */
+ 0x000D, /* CARRIAGE RETURN */
+ 0x001C, /* FILE SEPARATOR */
+ 0x001D, /* GROUP SEPARATOR */
+ 0x001E, /* RECORD SEPARATOR */
+ 0x0085, /* NEXT LINE */
+ 0x2028, /* LINE SEPARATOR */
+ 0x2029, /* PARAGRAPH SEPARATOR */
+ };
+
/* Init the implementation */
unicode_freelist = NULL;
unicode_freelist_size = 0;
@@ -7396,6 +7449,11 @@
unicode_latin1[i] = NULL;
if (PyType_Ready(&PyUnicode_Type) < 0)
Py_FatalError("Can't initialize 'unicode'");
+
+ /* initialize the linebreak bloom filter */
+ bloom_linebreak = make_bloom_mask(
+ linebreak, sizeof(linebreak) / sizeof(linebreak[0])
+ );
}
/* Finalize the Unicode implementation */
More information about the Python-checkins
mailing list