[pypy-svn] r11944 - pypy/branch/non-fake-unicode/pypy/module/unicodedata
ac at codespeak.net
ac at codespeak.net
Wed May 4 18:04:52 CEST 2005
Author: ac
Date: Wed May 4 18:04:52 2005
New Revision: 11944
Modified:
pypy/branch/non-fake-unicode/pypy/module/unicodedata/functions.py
pypy/branch/non-fake-unicode/pypy/module/unicodedata/generate_unicodedb.py
Log:
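Improve the Unicode database: functions.py now calls accessor functions on
unicodedb (lookup, name, decimal, digit, numeric, category, bidirectional,
combining, mirrored) instead of indexing its dictionaries directly, and
generate_unicodedb.py is restructured to generate those accessors.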
Modified: pypy/branch/non-fake-unicode/pypy/module/unicodedata/functions.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/module/unicodedata/functions.py (original)
+++ pypy/branch/non-fake-unicode/pypy/module/unicodedata/functions.py Wed May 4 18:04:52 2005
@@ -15,7 +15,7 @@
def lookup(space, w_name):
name = space.str_w(w_name)
try:
- code = unicodedb.charcodeByName[name]
+ code = unicodedb.lookup(name)
except KeyError:
msg = space.mod(space.wrap("undefined character name '%s'"), w_name)
raise OperationError(space.w_KeyError, msg)
@@ -25,7 +25,7 @@
def name(space, w_unichr, w_default=NoneNotWrapped):
code = unichr_to_code_w(space, w_unichr)
try:
- name = unicodedb.charnameByCode[code]
+ name = unicodedb.name(code)
except KeyError:
if w_default is not None:
return w_default
@@ -36,7 +36,7 @@
def decimal(space, w_unichr, default=NoneNotWrapped):
code = unichr_to_code_w(space, w_unichr)
try:
- return space.wrap(unicodedb.decimalValue[code])
+ return space.wrap(unicodedb.decimal(code))
except KeyError:
pass
if w_default is not None:
@@ -46,7 +46,7 @@
def digit(space, w_unichr, w_default=NoneNotWrapped):
code = unichr_to_code_w(space, w_unichr)
try:
- return space.wrap(unicodedb.digitValue[code])
+ return space.wrap(unicodedb.digit(code))
except KeyError:
pass
if w_default is not None:
@@ -56,7 +56,7 @@
def numeric(space, w_unichr, w_default=NoneNotWrapped):
code = unichr_to_code_w(space, w_unichr)
try:
- return space.wrap(unicodedb.numericValue[code])
+ return space.wrap(unicodedb.numeric(code))
except KeyError:
pass
if w_default is not None:
@@ -66,24 +66,24 @@
def category(space, w_unichr):
code = unichr_to_code_w(space, w_unichr)
- return space.wrap(unicodedb.category.get(code, 'Cn'))
+ return space.wrap(unicodedb.category(code))
def bidirectional(space, w_unichr):
code = unichr_to_code_w(space, w_unichr)
- return space.wrap(unicodedb.bidirectional.get(code, ''))
+ return space.wrap(unicodedb.bidirectional(code))
def combining(space, w_unichr):
code = unichr_to_code_w(space, w_unichr)
- return space.wrap(unicodedb.combining(code, 0)
+ return space.wrap(unicodedb.combining(code))
def mirrored(space, w_unichr):
code = unichr_to_code_w(space, w_unichr)
- return space.wrap(unicodedb.mirrored(code, 0))
-
+ return space.wrap(unicodedb.mirrored(code))
def decomposition(space, w_unichr):
code = unichr_to_code_w(space, w_unichr)
- return space.wrap('')
+ raise OperationError(space.w_NotImplementedError,
+ space.wrap('Decomposition is not implemented'))
def normalize(space, w_form, w_unistr):
form = space.str_w(w_form)
Modified: pypy/branch/non-fake-unicode/pypy/module/unicodedata/generate_unicodedb.py
==============================================================================
--- pypy/branch/non-fake-unicode/pypy/module/unicodedata/generate_unicodedb.py (original)
+++ pypy/branch/non-fake-unicode/pypy/module/unicodedata/generate_unicodedb.py Wed May 4 18:04:52 2005
@@ -1,132 +1,223 @@
#!/usr/bin/env python
-
import sys
-import __future__
-def printDict(outfile, name, dictionary):
- keys = dictionary.keys()
- keys.sort()
- print >> outfile, name, '= {'
- for key in keys:
- print >>outfile, ' %r : %r,' % (key, dictionary[key])
- print >>outfile, '}'
+class Unicodechar(object):
+ __slots__ = '''code name category combining bidirectional
+ decomposition decomposition_tag decimal digit numeric
+ mirrored upper lower title'''.split()
+ def __init__(self, data):
+ if not data[1] or data[1][0] == '<' and data[1][-1] == '>':
+ self.name = None
+ else:
+ self.name = data[1]
+ self.category = data[2]
+ self.combining = 0
+ if data[3]:
+ self.combining = int(data[3])
+ self.bidirectional = data[4]
+ self.decomposition = None
+ self.decomposition_tag = None
+ if data[5]:
+ if data[5][0] == '<':
+ tag, value = data[5].split(None, 1)
+ else:
+ tag = '<canonical>'
+ value = data[5]
+ self.decomposition_tag = tag[1:-1]
+ self.decomposition = [int(v, 16) for v in value.split()]
+
+ self.decimal = None
+ if data[6]:
+ self.decimal = int(data[6])
+ self.digit = None
+ if data[7]:
+ self.digit = int(data[7])
+ self.numeric = None
+ if data[8]:
+ try:
+ numerator, denomenator = data[8].split('/')
+ self.numeric = float(numerator) / float(denomenator)
+ except ValueError:
+ self.numeric = float(data[8])
+ self.mirrored = (data[9] == 'Y')
+ if data[12]:
+ self.upper = int(data[12], 16)
+ self.lower = None
+ if data[13]:
+ self.lower = int(data[13], 16)
+ self.title = None
+ if data[14]:
+ self.title = int(data[14], 16)
+
+ def isspace(self):
+ return (self.category == 'Zs' or
+ self.bidirectional in ('WS', 'B' or 'S'))
-def generate_unicodedb(unidata_version, infile, outfile):
- decimal = {}
- digit = {}
- number = {}
- uppercase = {}
- lowercase = {}
- titlecase = {}
- category = {}
- name = {}
- combining = {}
- bidir = {}
- mirrored = {}
-
- decomp = {}
-
- table = {}
+def read_unicodedata(infile):
+ rangeFirst = {}
+ rangeLast = {}
+ table = [Unicodechar(['0000', None, 'Cn'] + [''] * 12)] * (sys.maxunicode + 1)
for line in infile:
line = line.split('#', 1)[0].strip()
if not line:
continue
data = [ v.strip() for v in line.split(';') ]
- code = data[0] = int(data[0], 16)
- table[code] = data
+ if data[1].endswith(', First>'):
+ code = int(data[0], 16)
+ name = data[1][1:-len(', First>')]
+ rangeFirst[name] = (code, data)
+ continue
+ if data[1].endswith(', Last>'):
+ code = int(data[0], 16)
+ rangeLast[name] = code
+ continue
+ code = int(data[0], 16)
+ u = Unicodechar(data)
+ table[code] = u
+
+ # Expand ranges
+ for name, (start, data) in rangeFirst.iteritems():
+ end = rangeLast[name]
+ unichar = Unicodechar(['0000', None] + data[2:])
+ for code in range(start, end + 1):
+ table[code] = unichar
- # Expand named ranges
- field = None
- for i in range(0, 0x110000):
- s = table.get(i)
- if s:
- if s[1][-8:] == ", First>":
- field = s[:]
- field[1] = s[1][:-8]
- s[1] = s[1][:-8] + '-%4X'%s[0]
- elif s[1][-7:] == ", Last>":
- s[1] = s[1][:-7] + '-%4X'%s[0]
- field = None
- elif field:
- s = field[:]
- s[0] = i
- s[1] = s[1] + '-%4X'%s[0]
- table[i] = s
-
- for (code, _name, cat, _combine, _bidir, _decomp,
- _decimal, _digit, _number, _mirrored, unicode1_name, comment,
- _uppercase, _lowercase, _titlecase) in table.itervalues():
- if cat != 'Cn':
- category[code] = cat
-
- name[code] = _name
- if _combine:
- combine = int(_combine)
- if combine != 0:
- combining[code] = combine
-
- if _decimal:
- decimal[code] = int(_decimal)
- if _digit:
- d = digit[code] = int(_digit)
- if _number:
- number[code] = float(eval(compile(_number, '-', 'eval', __future__.CO_FUTURE_DIVISION, 1)))
-
- if _uppercase:
- uppercase[code] = int(_uppercase, 16)
- if _lowercase:
- lowercase[code] = int(_lowercase, 16)
- if _titlecase:
- titlecase[code] = int(_titlecase, 16)
+ return table
- if _mirrored == 'Y':
- mirrored[code] = 1
+def writeDict(outfile, name, dictionary):
+ print >> outfile, '%s = {' % name
+ keys = dictionary.keys()
+ keys.sort()
+ for key in keys:
+ print >> outfile, '%r: %r,'%(key, dictionary[key])
+ print >> outfile, '}'
+ print >> outfile
- if _bidir:
- bidir[code] = _bidir
+def writeCategory(outfile, table, name, categoryNames):
+ print >> outfile, '_%s_names = %r' % (name, categoryNames)
+ print >> outfile, '_%s = "".join([' % name
+ for i in range(0, len(table), 64):
+ result = []
+ for char in table[i:i + 64]:
+ result.append(chr(categoryNames.index(getattr(char, name)) + 0x20))
+ print >> outfile, ' %r,' % ''.join(result)
+ print >> outfile, '])'
+ print >> outfile, '''
+def %s(code):
+ return _%s_names[ord(_%s[code]) & 0x1f]
+
+'''%(name, name, name)
+
+def writeUnicodedata(version, table, outfile):
+ # Version
+ print >> outfile, 'version = %r' % version
+ print >> outfile
+ # Character names
+ print >> outfile, '_charnames = {'
+ for code in range(len(table)):
+ if table[code].name:
+ print >> outfile, '%r: %r,'%(code, table[code].name)
+ print >> outfile, '''}
+
+_code_by_name = dict(zip(_charnames.itervalues(), _charnames.iterkeys()))
- #if _decomp:
- # raise Exception
+def lookup(name):
+ return _code_by_name[name]
- codeByName = {}
- duplicateNames = {}
- for k, v in name.iteritems():
- if duplicateNames.has_key(k):
- continue
- if codeByName.has_key(k):
- duplicateNames[k] = 1
- del codeByName[k]
- continue
- codeByName[k] = v
+def name(code):
+ return _charnames[code]
+'''
+ # Categories
+ categories = {}
+ bidirs = {}
+ for char in table:
+ categories[char.category] = 1
+ bidirs[char.bidirectional] = 1
+ category_names = categories.keys()
+ category_names.sort()
+ if len(category_names) > 32:
+ raise RuntimeError('Too many general categories defined.')
+ bidirectional_names = bidirs.keys()
+ bidirectional_names.sort()
+ if len(bidirectional_names) > 32:
+ raise RuntimeError('Too many bidirectional categories defined.')
+
+ writeCategory(outfile, table, 'category', category_names)
+ writeCategory(outfile, table, 'bidirectional', bidirectional_names)
+ print >> outfile, '''
+def isspace(code):
+ return category(code) == "Zs" or bidirectional(code) in ("WS", "B", "S")
+def islower(code):
+ return category(code) == "Ll"
+def isupper(code):
+ return category(code) == "Lu"
+def istitle(code):
+ return category(code) == "Lt"
+def isalpha(code):
+ return category(code) in ("Lm", "Lt", "Lu", "Ll", "Lo")
+def islinebreak(code):
+ return category(code) == "Zl" or bidirectional(code) == "B"
+'''
+
+ # Numeric characters
+ decimal = {}
+ digit = {}
+ numeric = {}
+ for code in range(len(table)):
+ if table[code].decimal is not None:
+ decimal[code] = table[code].decimal
+ if table[code].digit is not None:
+ digit[code] = table[code].digit
+ if table[code].numeric is not None:
+ numeric[code] = table[code].numeric
+
+ writeDict(outfile, '_decimal', decimal)
+ writeDict(outfile, '_digit', digit)
+ writeDict(outfile, '_numeric', numeric)
+ print >> outfile, '''
+def decimal(code):
+ return _decimal[code]
+
+def isdecimal(code):
+ return _decimal.has_key(code)
+
+def digit(code):
+ return _digit[code]
+
+def isdigit(code):
+ return _digit.has_key(code)
- print >> outfile, 'version = %r'%unidata_version
- print >> outfile
- printDict(outfile, 'charnameByCode', name)
- print >> outfile
- printDict(outfile,'charcodeByName', codeByName)
- print >> outfile
- printDict(outfile, 'decimalValue', decimal)
- print >> outfile
- printDict(outfile, 'digitValue', digit)
- print >> outfile
- printDict(outfile, 'numericValue', number)
- print >> outfile
- printDict(outfile, 'category', category)
- print >> outfile
- printDict(outfile, 'bidirectional', bidir)
- print >> outfile
- printDict(outfile, 'combining', combining)
- print >> outfile
- printDict(outfile, 'mirrored', mirrored)
- print >> outfile
- printDict(outfile, 'decomposition', decomp)
- print >> outfile
- printDict(outfile, 'uppercase', uppercase)
- print >> outfile
- printDict(outfile, 'lowercase', lowercase)
- print >> outfile
- printDict(outfile, 'titlecase', titlecase)
- print >> outfile
+def numeric(code):
+ return _numeric[code]
+
+def isnumeric(code):
+ return _numeric.has_key(code)
+
+'''
+ # Combining
+ combining = {}
+ for code in range(len(table)):
+ if table[code].combining:
+ combining[code] = table[code].combining
+ writeDict(outfile, '_combining', combining)
+ print >> outfile, '''
+def combining(code):
+ return _combining.get(code, 0)
+
+'''
+ # Mirrored
+ mirrored = dict([(code, 1) for char in table
+ if char.mirrored])
+ mirrored = {}
+ for code in range(len(table)):
+ if table[code].mirrored:
+ mirrored[code] = 1
+ writeDict(outfile, '_mirrored', mirrored)
+ print >> outfile, '''
+def mirrored(code):
+ return _mirrored.get(code, 0)
+
+'''
if __name__ == '__main__':
import getopt, re
@@ -153,10 +244,10 @@
if unidata_version is None:
raise ValueError('No version specified')
-
+
+ table = read_unicodedata(infile)
print >> outfile, '# UNICODE CHARACTER DATABASE'
- print >> outfile, '# This ficle was genrated with the command:'
- print >> outfile, '# ', ' '.join(sys.argv)
+ print >> outfile, '# This file was genrated with the command:'
+ print >> outfile, '# ', ' '.join(sys.argv)
print >> outfile
-
- generate_unicodedb(unidata_version, infile, outfile)
+ writeUnicodedata(unidata_version, table, outfile)
More information about the Pypy-commit mailing list