[Python-checkins] CVS: python/dist/src/Modules unicodedata_db.h,1.1,1.2 unicodedatabase.h,2.4,2.5 unicodedata.c,2.5,2.6 unicodedatabase.c,2.4,2.5

Fredrik Lundh python-dev@python.org
Mon, 25 Sep 2000 01:07:09 -0700


Update of /cvsroot/python/python/dist/src/Modules
In directory slayer.i.sourceforge.net:/tmp/cvs-serv23556/Modules

Modified Files:
	unicodedata_db.h unicodedatabase.h unicodedata.c 
	unicodedatabase.c 
Log Message:


Unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...


Index: unicodedata_db.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/unicodedata_db.h,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** unicodedata_db.h	2000/09/24 21:27:18	1.1
--- unicodedata_db.h	2000/09/25 08:07:05	1.2
***************
*** 1,3573 ****
! /* this file was generated by ..\makeunidb.py 1.0 */
  
  /* a list of unique database records */
  const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {
!     {0, 0, 0, 0, NULL},
!     {13, 0, 15, 0, NULL},
!     {13, 0, 17, 0, NULL},
!     {13, 0, 16, 0, NULL},
!     {13, 0, 18, 0, NULL},
!     {10, 0, 18, 0, NULL},
[...8862 lines suppressed...]
!     3259, 3260, 3261, 3262, 3263, 3264, 3265, 3266, 3267, 3268, 3269, 3270, 
!     3271, 3272, 3273, 3274, 3275, 3276, 3277, 3278, 3279, 3280, 3281, 3282, 
!     3283, 3284, 3285, 3286, 3287, 3288, 3289, 3290, 3291, 3292, 3293, 3294, 
!     3295, 3296, 3297, 3298, 3299, 3300, 3301, 3302, 3303, 3304, 3305, 3306, 
!     3307, 3308, 3309, 3310, 3311, 3312, 3313, 3314, 3315, 3316, 3317, 3318, 
!     3319, 3320, 3321, 0, 0, 3322, 3323, 3324, 3325, 3326, 3327, 3328, 3329, 
!     3330, 3331, 3332, 3333, 3334, 3335, 3336, 3337, 3338, 3339, 3340, 3341, 
!     3342, 3343, 3344, 3345, 3346, 3347, 3348, 3349, 3350, 3351, 3352, 3353, 
!     3354, 3355, 3356, 3357, 3358, 3359, 3360, 3361, 3362, 3363, 3364, 3365, 
!     3366, 3367, 3368, 3369, 3370, 3371, 3372, 3373, 3374, 3375, 3376, 3377, 
!     3378, 3379, 3380, 3381, 3382, 3383, 3384, 3385, 3386, 3387, 3388, 3389, 
!     3390, 3391, 3392, 3393, 3394, 3395, 3396, 3397, 3398, 3399, 3400, 3401, 
!     3402, 3403, 3404, 3405, 3406, 3407, 3408, 3409, 3410, 3411, 3412, 3413, 
!     3414, 3415, 0, 0, 0, 3416, 3417, 3418, 3419, 3420, 3421, 0, 0, 3422, 
!     3423, 3424, 3425, 3426, 3427, 0, 0, 3428, 3429, 3430, 3431, 3432, 3433, 
!     0, 0, 3434, 3435, 3436, 0, 0, 0, 3437, 3438, 3439, 3440, 3441, 3442, 
!     3443, 0, 3444, 3445, 3446, 3447, 3448, 3449, 3450, 0, 0, 0, 0, 0, 0, 0, 
!     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
  };
  

Index: unicodedatabase.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/unicodedatabase.h,v
retrieving revision 2.4
retrieving revision 2.5
diff -C2 -r2.4 -r2.5
*** unicodedatabase.h	2000/09/24 21:28:28	2.4
--- unicodedatabase.h	2000/09/25 08:07:06	2.5
***************
*** 5,11 ****
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
! Written by Marc-Andre Lemburg (mal@lemburg.com).
  
! Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
--- 5,12 ----
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
!    Written by Marc-Andre Lemburg (mal@lemburg.com).
!    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
  
!    Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
***************
*** 20,33 ****
  					   _PyUnicode_BidirectionalNames */
      const unsigned char mirrored;	/* true if mirrored in bidir mode */
-     const char *decomposition;		/* pointer to the decomposition
- 					   string or NULL */
  } _PyUnicode_DatabaseRecord;
  
  /* --- Unicode category names --------------------------------------------- */
  
! extern const char *_PyUnicode_CategoryNames[32];
! extern const char *_PyUnicode_BidirectionalNames[21];
  
  /* --- Unicode Database --------------------------------------------------- */
  
  extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch);
--- 21,33 ----
  					   _PyUnicode_BidirectionalNames */
      const unsigned char mirrored;	/* true if mirrored in bidir mode */
  } _PyUnicode_DatabaseRecord;
  
  /* --- Unicode category names --------------------------------------------- */
  
! extern const char *_PyUnicode_CategoryNames[];
! extern const char *_PyUnicode_BidirectionalNames[];
  
  /* --- Unicode Database --------------------------------------------------- */
  
  extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch);
+ extern const char *_PyUnicode_Database_GetDecomposition(int ch);

Index: unicodedata.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/unicodedata.c,v
retrieving revision 2.5
retrieving revision 2.6
diff -C2 -r2.5 -r2.6
*** unicodedata.c	2000/09/24 21:45:34	2.5
--- unicodedata.c	2000/09/25 08:07:06	2.6
***************
*** 5,11 ****
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
! Written by Marc-Andre Lemburg (mal@lemburg.com).
  
! Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
--- 5,12 ----
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
!    Written by Marc-Andre Lemburg (mal@lemburg.com).
!    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
  
!    Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
***************
*** 14,19 ****
  #include "unicodedatabase.h"
  
- #define unicode_db _PyUnicode_Database_GetRecord
- 
  /* --- Module API --------------------------------------------------------- */
  
--- 15,18 ----
***************
*** 134,147 ****
  			"need a single Unicode character as parameter");
  	goto onError;
-     }
-     index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->category;
-     if (index < 0 || 
- 	index > sizeof(_PyUnicode_CategoryNames) / 
- 	        sizeof(_PyUnicode_CategoryNames[0])) {
- 	PyErr_Format(PyExc_SystemError,
- 		     "category index out of range: %i",
- 		     index);
- 	goto onError;
      }
      return PyString_FromString(_PyUnicode_CategoryNames[index]);
      
--- 133,140 ----
  			"need a single Unicode character as parameter");
  	goto onError;
      }
+     index = (int) _PyUnicode_Database_GetRecord(
+         (int) *PyUnicode_AS_UNICODE(v)
+         )->category;
      return PyString_FromString(_PyUnicode_CategoryNames[index]);
      
***************
*** 164,177 ****
  			"need a single Unicode character as parameter");
  	goto onError;
-     }
-     index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->bidirectional;
-     if (index < 0 || 
- 	index > sizeof(_PyUnicode_CategoryNames) / 
- 	        sizeof(_PyUnicode_CategoryNames[0])) {
- 	PyErr_Format(PyExc_SystemError,
- 		     "bidirectional index out of range: %i",
- 		     index);
- 	goto onError;
      }
      return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
      
--- 157,164 ----
  			"need a single Unicode character as parameter");
  	goto onError;
      }
+     index = (int) _PyUnicode_Database_GetRecord(
+         (int) *PyUnicode_AS_UNICODE(v)
+         )->bidirectional;
      return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
      
***************
*** 195,199 ****
  	goto onError;
      }
!     value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->combining;
      return PyInt_FromLong(value);
      
--- 182,188 ----
  	goto onError;
      }
!     value = (int) _PyUnicode_Database_GetRecord(
!         (int) *PyUnicode_AS_UNICODE(v)
!         )->combining;
      return PyInt_FromLong(value);
      
***************
*** 217,221 ****
  	goto onError;
      }
!     value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->mirrored;
      return PyInt_FromLong(value);
      
--- 206,212 ----
  	goto onError;
      }
!     value = (int) _PyUnicode_Database_GetRecord(
!         (int) *PyUnicode_AS_UNICODE(v)
!         )->mirrored;
      return PyInt_FromLong(value);
      
***************
*** 239,246 ****
  	goto onError;
      }
!     value = unicode_db((int)*PyUnicode_AS_UNICODE(v))->decomposition;
!     if (value == NULL)
! 	return PyString_FromString("");
!     else
  	return PyString_FromString(value);
      
--- 230,236 ----
  	goto onError;
      }
!     value = _PyUnicode_Database_GetDecomposition(
!         (int) *PyUnicode_AS_UNICODE(v)
!         );
  	return PyString_FromString(value);
      

Index: unicodedatabase.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/unicodedatabase.c,v
retrieving revision 2.4
retrieving revision 2.5
diff -C2 -r2.4 -r2.5
*** unicodedatabase.c	2000/09/24 21:28:28	2.4
--- unicodedatabase.c	2000/09/25 08:07:06	2.5
***************
*** 5,11 ****
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
! Written by Marc-Andre Lemburg (mal@lemburg.com).
  
! Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
--- 5,12 ----
     Data was extracted from the Unicode 3.0 UnicodeData.txt file.
  
!    Written by Marc-Andre Lemburg (mal@lemburg.com).
!    Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
  
!    Copyright (c) Corporation for National Research Initiatives.
  
     ------------------------------------------------------------------------ */
***************
*** 29,31 ****
--- 30,47 ----
      }
      return &_PyUnicode_Database_Records[index];
+ }
+ 
+ const char *
+ _PyUnicode_Database_GetDecomposition(int code)
+ {
+     int index;
+ 
+     if (code < 0 || code >= 65536)
+         index = 0;
+     else {
+         index = decomp_index1[(code>>DECOMP_SHIFT)];
+         index = decomp_index2[(index<<DECOMP_SHIFT)+
+                              (code&((1<<DECOMP_SHIFT)-1))];
+     }
+     return decomp_data[index];
  }