ASCII delimited files
D'Arcy J.M. Cain
darcy at vex.net
Thu Nov 11 13:14:10 EST 1999
D'Arcy J.M. Cain <darcy at vex.net> wrote:
> Skip it. There's a bug. If a line ends with a quoted entry it gets
> attached to the next one. I'm tracking it down now and will post the
> corrected one.
OK, this one does the right thing.
#! /usr/bin/env python
# delimited.py
# Modified from the original by D'Arcy J.M. Cain <darcy at druid.net>
#
# CT990226 V0.1
"""
Break tab-delimited or CSV text into a list of tuples.
Original version: SMC databook plugin, readxls.py
Generalized to be usable for any delimiter except newline.
"""
import string, re
def split_delimited(s, delimiter=None):
    """Split delimited text into a list of tuples, one tuple per line.

    Arguments:
    s -- the complete delimited text as one string.
    delimiter -- the field delimiter.  It is compared against one
        character of s at a time, so it is expected to be a single
        character.  When empty or None, tab or comma is chosen,
        whichever occurs more often in the first 10000 characters of s
        (comma wins a tie).

    Returns a list holding one tuple of field strings per line.

    The line terminator is detected with findlinedelimiter().  Double
    quotes group a field: a delimiter inside quotes is kept literally,
    a line terminator inside quotes is replaced by the delimiter, and a
    doubled quote inside a quoted field yields one literal quote.
    Spaces directly around a field delimiter are dropped.

    Limitations: a NUL character is used internally as the field
    placeholder, so NUL characters already present in s act as field
    separators; text that ends with a line terminator produces a final
    record holding one empty string.
    """
    # 980426 finding line delimiter dynamically
    probe = s[:10000]
    eol = findlinedelimiter(probe)
    # 990226 guessing field delimiter from '\t,' if not supplied
    if not delimiter:
        # max() of (count, char) tuples: highest count wins, and on a tie
        # the greater character (',') wins, as with sort()-and-take-last.
        delimiter = max([
            (probe.count('\t'), '\t'),
            (probe.count(','), ','),
        ])[1]
    del probe

    # Mark every delimiter that lies outside quotes with SEP.  SEP must be
    # the very character the final regex splits on; NUL is used because,
    # unlike a printable character, it is unlikely to occur in the data.
    SEP = '\0'
    chars = list(s)  # mutate a list: rebuilding the string per hit is O(n**2)
    inquote = 0
    for i, ch in enumerate(chars):
        if ch == '"':
            inquote = not inquote
        if not inquote and ch == delimiter:
            chars[i] = SEP
    s = ''.join(chars)

    # After splitting on quotes, odd-numbered pieces are the quoted text.
    parts = s.split('"')
    last = len(parts) - 1
    for i, part in enumerate(parts):
        if i % 2:
            # inside quotes: an embedded line break must not end the record
            parts[i] = part.replace(eol, delimiter)
        elif not part and 0 < i < last:
            # an empty piece between two quotes is a doubled ("") quote
            parts[i] = '"'

    # merge it back, then break into lines and each line into fields,
    # dropping the spaces that surround a separator
    txt = ''.join(parts)
    field_splitter = re.compile(' *' + re.escape(SEP) + ' *')
    return [tuple(field_splitter.split(line)) for line in txt.split(eol)]
# utilities
def findlinedelimiter(txt):
    """Guess the line terminator used in txt.

    Provide some kb of text to this function.  Trailing carriage
    return and line feed characters are ignored, then the text is
    split on each candidate terminator (CR LF, LF, CR) and the
    candidate that yields the most lines is returned.

    When candidates yield the same number of lines, the one that
    compares greatest as a string is returned, which ranks CR LF above
    CR above LF.  Text without any line break therefore returns CR LF.
    """
    mac = "\x0D"
    unix = "\x0A"
    dos = mac + unix
    # CT970904: line breaks at the very end would only add empty lines
    txt = txt.rstrip(dos)
    # find the one which gives the most lines.
    # in doubt, the longest delimiter wins: max() compares the
    # (line count, delimiter) tuples, so the delimiter breaks ties.
    return max([(len(txt.split(delim)), delim) for delim in (dos, unix, mac)])[1]
if __name__ == "__main__":
    # Demo / smoke test: quoted fields, a doubled quote, an embedded
    # delimiter, an embedded newline, and a quoted field at the end of a
    # line directly followed by a quoted field on the next line.
    for record in split_delimited('''1, 2,3, "vier", "quo""te", "embedded, comma", "this
is with a newline", here
another record
And another, "should end here"
"should not be attached to previous"'''):
        # Parenthesized single-argument print gives the same output with
        # the Python 2 print statement and the Python 3 print function.
        print(record)
# eof
--
D'Arcy J.M. Cain <darcy at caingang.com> | Democracy is three wolves
http://www.druid.net/darcy/ | and a sheep voting on
+1 416 425 1212 (DoD#0082) (eNTP) | what's for dinner.
More information about the Python-list
mailing list