split string with hieroglyphs
Belize
uss.japan at gmail.com
Sun Dec 24 05:20:15 EST 2006
Steven, thanks! Very nice algorithm.
Here is the code:
#!/usr/bin/env python
# -*- coding: utf_8 -*-
# Thanks Steven D'Aprano for hints
import unicodedata
import MySQLdb
# MySQL connection settings
mysql_host = "localhost"
mysql_user = "dict"
mysql_password = "passwd"
mysql_db = "dictionary"

# Open the connection and the module-level cursor `cur` that
# translate_hieroglyph() uses for its dictionary lookups.
try:
    mysql_conn = MySQLdb.connect(mysql_host, mysql_user, mysql_password,
                                 mysql_db)
    cur = mysql_conn.cursor()
    # Exchange text with the server as UTF-8 for this session.
    cur.execute("""SET NAMES UTF8""")
except MySQLdb.Error:
    # Only database errors are handled here.  Without a cursor the lookups
    # below cannot run, so stop now instead of continuing and failing later
    # with a NameError on `cur`.
    raise SystemExit("unable insert to MySQL, check connection")
jap_text = "BZツーリTVツキDVD?"
jap_text = unicode(jap_text, 'utf-8') # fight with
full-width, half-width katakana madness :-)
jap_text = unicodedata.normalize('NFKC', jap_text) #
jap_text = jap_text.encode('utf-8') #
def translate_hieroglyph(jap_text):
    """Look up the English translation of *jap_text* in the dictionary table.

    Runs a single-row lookup through the module-level cursor ``cur`` and
    returns the ``Eng`` column of the matching row.  If no row matches, or
    the stored value is empty, *jap_text* is returned unchanged so that
    untranslatable fragments pass through as-is.
    """
    # The value is passed as a query parameter so the driver quotes it;
    # splicing it into the SQL text would break on any fragment containing
    # a quote character and would allow SQL injection.
    mysql_translate_query = (
        "SELECT Eng FROM dictionary where Jis=%s "
        "collate utf8_unicode_ci LIMIT 1"
    )
    cur.execute(mysql_translate_query, (jap_text,))
    # LIMIT 1 means at most one row; fetchone() gives None on no match.
    row = cur.fetchone()
    if row and row[0]:
        return row[0]
    return jap_text
def islatin(s):
    """Return True if *s* contains only ASCII characters, otherwise False.

    Accepts either a byte string or a text (unicode) string.  An empty
    string counts as ASCII.
    """
    try:
        # type(u'') is the text type on every Python version: text strings
        # are checked by encoding, byte strings by decoding.
        if isinstance(s, type(u'')):
            s.encode('ascii')
        else:
            s.decode('ascii')
    except UnicodeError:
        # Explicit False instead of falling off the end and returning None.
        return False
    return True
def split_fragments(s):
    """Split *s* into consecutive runs of ASCII and non-ASCII elements.

    Each element of *s* is classified with islatin(); adjacent elements
    with the same classification are joined into one fragment.  Fragments
    are returned in their original order, so ''.join(result) == s.

    An empty *s* yields an empty list rather than a list holding one
    empty fragment.
    """
    from itertools import groupby

    # groupby() starts a new group each time the key changes, which is the
    # run-splitting needed here, and it always emits the final run.
    return [''.join(run) for _, run in groupby(s, key=islatin)]
# Break the sample text into alternating ASCII / non-ASCII fragments.
fragments = split_fragments(jap_text)
def join_fragments(fragments):
    """Translate the non-ASCII fragments and join everything with spaces.

    ASCII fragments (per islatin()) are kept unchanged; every other
    fragment is replaced by the result of translate_hieroglyph().
    Returns the pieces joined by single spaces.
    """
    pieces = [
        piece if islatin(piece) else translate_hieroglyph(piece)
        for piece in fragments
    ]
    return ' '.join(pieces)
# Show the translated sentence.
print(join_fragments(fragments))
home at my ~/Src/Code $ python translate.py
BZ navigation TV display DVD?
Works as needed :-) Thanks again!
More information about the Python-list
mailing list