Getting Unicode decode error using lxml.iterparse
digitig at gmail.com
digitig at gmail.com
Tue May 22 18:56:43 EDT 2018
I'm trying to read my iTunes library in Python using iterparse. My current stub is:
---- Snip ----
import sys
import datetime
import xml.etree.ElementTree as ET
import argparse
import re
class Library:
unmarshallers = {
# collections
"array": lambda x: [v.text for v in x],
"dict": lambda x:
dict((x[i].text, x[i+1].text) for i in range(0, len(x), 2)),
"key": lambda x: x.text or "",
# simple types
"string": lambda x: x.text or "",
"data": lambda x: base64.decodestring(x.text or ""),
"date": lambda x: datetime.datetime(*map(int, re.findall("\d+", x.text))),
"true": lambda x: True,
"false": lambda x: False,
"real": lambda x: float(x.text),
"integer": lambda x: int(x.text)
}
def load(self, file):
print('Starting...')
parser = ET.iterparse(file)
for action, elem in parser:
unmarshal = self.unmarshallers.get(elem.tag)
if unmarshal:
data = unmarshal(elem)
elem.clear()
elem.text = data
print(elem.text)
elif elem.tag != "plist":
raise IOError("unknown plist type: %r" % elem.tag)
return parser.root[0].text
def __init__(self, infile):
self.root = self.load(infile)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description = "Parse an iTunes library file to a set of CSV files suitable for import to a database.")
parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
args=parser.parse_args()
print('Infile = ', args.infile)
library = Library(args.infile)
My input file (reduced to home in on the error) is:
---- snip -----
<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
<dict>
<dict>
<key>15078</key>
<dict>
<key>Name</key><string>Part 2. The Death Of Enkidu. Skon Přitele Mého Mne Zdeptal Težče</string>
</dict>
</dict>
</dict>
</plist>
---- snip ----
<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
<dict>
<dict>
<key>15078</key>
<dict>
<key>Name</key><string>Part 2. The Death Of Enkidu. Skon Přitele Mého Mne Zdeptal Težče</string>
</dict>
</dict>
</dict>
</plist>
I'm getting an error on one part of the XML:
File "C:\Users\digit\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 202: character maps to <undefined>
I suspect the issue is that it's using cp1252.py, which I don't think is UTF-8 as specified in the XML prolog. Is this an iterparse problem, or am I using it wrongly?
Thanks.
More information about the Python-list
mailing list