Wanted: safe codec for filenames
Torsten Bronger
bronger at physik.rwth-aachen.de
Wed Sep 5 18:20:45 EDT 2007
Hallöchen!
Torsten Bronger writes:
> I'd like to map general Unicode strings to safe filenames. I tried
> punycode, but it is case-sensitive, which Windows is not. Thus,
> "Hallo" and "hallo" are mapped to "Hallo-" and "hallo-"; however,
> I need uppercase Latin letters to be encoded, too, and the
> encoding must contain only lowercase Latin letters, numbers,
> underscores, and maybe a little bit more. The result should be
> more legible than base64, though.
>
> Has anybody created such a codec already?
Okay, the following works fine for me:
--8<---------------cut here---------------start------------->8---
import codecs
class Codec(codecs.Codec):
    """Codec class for safe filenames.  Safe filenames work on all important
    filesystems, i.e., they don't contain special or dangerous characters, and
    they don't assume that filenames are treated case-sensitively.

    Mapping rules (encoding direction):

    * characters listed in ``safe_characters`` are copied unchanged;
    * a space becomes an underscore;
    * a run of uppercase ASCII letters becomes the same letters in lowercase,
      wrapped in parentheses;
    * every other character becomes ``{<hex code point>}``.

    >>> u"hallo".encode("safefilename")
    'hallo'
    >>> u"Hallo".encode("safefilename")
    '(h)allo'
    >>> u"MIT Thesis".encode("safefilename")
    '(mit)_(t)hesis'
    >>> u"Gesch\\u00e4ftsbrief".encode("safefilename")
    '(g)esch{e4}ftsbrief'

    Of course, the mapping works in both directions as expected:

    >>> "(g)esch{e4}ftsbrief".decode("safefilename")
    u'Gesch\\xe4ftsbrief'
    >>> "(mit)_(t)hesis".decode("safefilename")
    u'MIT Thesis'
    """

    # Alphabets used by both directions of the mapping.
    lowercase_letters = "abcdefghijklmnopqrstuvwxyz"
    safe_characters = lowercase_letters + "0123456789-+!$%&`'@~#.,^"
    uppercase_letters = lowercase_letters.upper()

    def encode(self, input, errors='strict'):
        """Convert Unicode strings to safe filenames.

        `errors` is accepted for interface compatibility but is not consulted:
        every character has an encoding.  Returns a tuple
        ``(encoded_string, number_of_input_characters_consumed)``.
        """
        # Collect the pieces and join once instead of growing a string with
        # repeated "+=".
        pieces = []
        i = 0
        input_length = len(input)
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                pieces.append(str(c))
            elif c == " ":
                pieces.append("_")
            elif c in self.uppercase_letters:
                # Swallow the whole run of uppercase letters so that it ends
                # up in a single pair of parentheses.
                run_start = i
                while i < input_length and input[i] in self.uppercase_letters:
                    i += 1
                pieces.append("(" + str(input[run_start:i]).lower() + ")")
                continue    # i already points behind the run
            else:
                # hex() yields "0x..."; strip the prefix.
                pieces.append("{" + hex(ord(c))[2:] + "}")
            i += 1
        return "".join(pieces), input_length

    def handle_problematic_characters(self, errors, input, start, end, message):
        """Apply the error policy `errors` to malformed decoder input.

        Returns ``u""`` for ``'ignore'`` and ``u"?"`` for ``'replace'``.  Any
        other policy raises `UnicodeDecodeError` for ``input[start:end]`` with
        `message` as the reason.
        """
        if errors == 'ignore':
            return u""
        if errors == 'replace':
            return u"?"
        if isinstance(input, type(u"")):
            # UnicodeDecodeError rejects a text object as its second argument;
            # hand it a byte string so that the intended exception is raised
            # instead of a TypeError.  One character maps to one byte, so the
            # start/end positions stay valid.
            input = input.encode("ascii", "replace")
        raise UnicodeDecodeError("safefilename", input, start, end, message)

    def decode(self, input, errors='strict'):
        """Convert safe filenames to Unicode strings.

        Malformed input (an unclosed parenthesis or bracket, invalid data
        between brackets, or a character that cannot occur in an encoded
        name) is passed to `handle_problematic_characters` together with
        `errors`.  Returns a tuple
        ``(decoded_string, number_of_input_characters_consumed)``.
        """
        try:
            char_from_code = unichr
        except NameError:
            # No unichr() on this interpreter; chr() covers code points there.
            char_from_code = chr
        input = str(input)
        input_length = len(input)
        pieces = []
        i = 0
        while i < input_length:
            c = input[i]
            if c in self.safe_characters:
                pieces.append(c)
            elif c == "_":
                pieces.append(u" ")
            elif c == "(":
                i += 1
                run_start = i
                while i < input_length and input[i] in self.lowercase_letters:
                    i += 1
                pieces.append(input[run_start:i].upper())
                # NOTE(review): in the two error cases below the value
                # returned by handle_problematic_characters() is not added to
                # the output, so only the raising (strict) policy has a
                # visible effect; the letters seen so far are kept.
                if i == input_length:
                    self.handle_problematic_characters(
                        errors, input, i - 1, i,
                        "open parenthesis was never closed")
                    continue
                if input[i] != ')':
                    self.handle_problematic_characters(
                        errors, input, i, i + 1,
                        "invalid character '%s' in parentheses sequence" % input[i])
                    # i is not advanced: the offending character is decoded
                    # on its own in the next iteration.
                    continue
            elif c == "{":
                end_position = input.find("}", i)
                if end_position == -1:
                    # No closing bracket: treat the bracket plus the hex
                    # digits following it (bounded by the "<= 8" check) as
                    # the faulty span.
                    end_position = i + 1
                    while (end_position < input_length
                           and input[end_position] in "0123456789abcdef"
                           and end_position - i <= 8):
                        end_position += 1
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i, end_position,
                        "open bracket was never closed"))
                    i = end_position
                    continue
                try:
                    pieces.append(
                        char_from_code(int(input[i + 1:end_position], 16)))
                except (ValueError, OverflowError):
                    # int() rejects non-hex text; the character constructor
                    # rejects code points outside its range.
                    pieces.append(self.handle_problematic_characters(
                        errors, input, i, end_position + 1,
                        "invalid data between brackets"))
                i = end_position
            else:
                pieces.append(self.handle_problematic_characters(
                    errors, input, i, i + 1, "invalid character '%s'" % c))
            i += 1
        return u"".join(pieces), input_length
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer for "safefilename"; encode() is inherited from Codec."""
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader for "safefilename"; decode() is inherited from Codec."""
def _registry(encoding):
if encoding == "safefilename":
return (Codec().encode, Codec().decode, StreamReader, StreamWriter)
else:
return None
# Hook the search function into the codec machinery so that "safefilename"
# can be used as an encoding name.
codecs.register(_registry)

if __name__ == "__main__":
    # Executed as a script: run the doctests found in this module.
    from doctest import testmod
    testmod()
--8<---------------cut here---------------end--------------->8---
--
Torsten Bronger, aquisgrana, europa vetus
Jabber ID: bronger at jabber.org
(See http://ime.webhop.org for ICQ, MSN, etc.)
More information about the Python-list
mailing list