parsing CSV files with quotes
Bob Alexander
bobalex at home.com
Thu Mar 30 12:47:05 EST 2000
Here's a Python module I wrote to manage CSVs, and I think it handles the
cases you care about:
------------------- CSV.py ---------------------
#
# Functions for handling comma-separated value (CSV) files
#
# Lines of a CSV file are represented by lists of their fields.
# Fields are represented as strings.
#
# Comments on handling of special characters in CSV:
#
# - Output fields are quoted only if text contains quote or comma
# - Quotes in quoted text are doubled
# - Quote after first char treated as normal text
#
import re
FieldPat = re.compile(r'(?!$)(?:"((?:.*?"")*?[^"]*)"|([^,]*))(?:[,\n]|$)')
Quote = re.compile('"')
TwoQuotes = re.compile('""')
CommaOrQuote = re.compile('[,"]')
#
# Reads file "fileName" and returns a list of CSV records.
#
def ReadFile(fileName):
f = open(fileName)
return ReadLines(f)
#
# Reads remaining lines from "file" and returns a list of lists of fields.
#
def ReadLines(file):
result = []
while 1:
rec = ReadLine(file)
if rec is None:
break
result.append(rec)
return result
#
# Reads a line from "file" and returns a list of fields.
#
def ReadLine(file):
line = file.readline()
if not line:
return None
return ParseLine(line)
#
# Parses a CSV line, returning a list of fields.
#
def ParseLine(line):
rec = []
p = 0
while 1:
match = FieldPat.match(line, p)
if not match:
break
field = match.group(2)
if field is None:
field = TwoQuotes.sub('"', match.group(1))
rec.append(field)
p = match.end()
return rec
#
# Creates a CSV file from a list of records (lists of fields).
#
def WriteFile(fileName, recList):
f = open(fileName, 'w')
WriteLines(f, recList)
#
# Writes a list of records (lists of fields) to "file".
#
def WriteLines(file, recList):
for rec in recList:
WriteLine(file, rec)
#
# Writes a CSV line from record "rec" (a list of fields).
#
def WriteLine(file, rec):
file.write(FormLine(rec) + '\n')
#
# Forms a CSV line (string) from record "rec" (a list of fields).
#
def FormLine(rec):
line = ''
sep = ''
for field in rec:
if CommaOrQuote.search(field):
field = '"' + Quote.sub('""', field) + '"'
line = line + sep + field
sep = ','
return line
#
# Adds CSV fields to a map. "values" is a record as returned from
# ParseLine(). "keys" is a list of field names. "map" is the map to
# which entries are to be added. If "map" is omitted a new map is
# created. The modified map is returned.
#
def ToMap(values, keys, map = None):
if map is None:
map = {}
for i in xrange(min(len(values), len(keys))):
map[keys[i]] = values[i]
return map
#
# Adds CSV fields to an object as properties. "values" is a record as
# returned from ParseLine(). "fieldNames" is a list of field names
# (which must conform to Python name syntax). "object" is the object to
# which properties are to be added. The modified object is returned.
#
def ToObject(values, fieldNames, object):
ToMap(values, fieldNames, object.__dict__)
return object
#
# Test program
#
if __name__ == '__main__':
import sys, os
print os.getcwd()
for fileName in sys.argv[1:]:
print fileName, '-'
x = ReadFile(fileName)
for rec in x:
print rec
WriteLines(sys.stdout, x)
print ParseLine('"aa","bb"\n') # try line with trailing \n
print ParseLine('aa,bb') # try line with no trailing \n
print ParseLine('a"a,"b""b"') # try line with no trailing \n
print ToMap(['value1', 'value2'], ['name1', 'name2'])
print ToMap(['value1', 'value2'], ['name1', 'name2', 'name3'])
print ToMap(['value1', 'value2', 'value3'], ['name1', 'name2'])
-----------------------------------------------------
----- Original Message -----
From: "Warren Postma" <embed at geocities.com>
Newsgroups: comp.lang.python
To: <python-list at python.org>
Sent: Thursday, March 30, 2000 8:50 AM
Subject: parsing CSV files with quotes
> Suppose I have a CSV file where line 1 is the column names, and lines 2..n
> are comma separated variables, where all String fields are quoted like
this:
>
> ID, NAME, AGE
> 1, "Postma, Warren", 30
> 2, "Twain, Shania", 31
> 3, "Nelson, Willy", 57
> 4, "Austin, \"Stone Cold\" Steve", 34
>
> So, the obvious thing I tried is:
>
> import string
> >>> print string.splitfields("4, \"Austin, \\\"Stone Cold\\\" Steve,
> 34",",")
> ['4', ' "Austin', ' \\"Stone Cold\\" Steve', ' 34']
>
> Hmm. Interesting. So I tried this:
>
> >>> print string.splitfields(r'4, "Austin, \"Stone Cold\" Steve", 34')
> ['4,', '"Austin,', '\\"Stone', 'Cold\\"', 'Steve",', '34']
>
> I'm getting close, I can feel it!
>
> The Rules:
>
> 1. All integer and other fields are output as ascii.
> 2. String fields have quotes. Commas are allowed inside the quotes.
> 3. Quotes inside quotes are escaped by a backslash
> 4. Backslashes are themselves quoted by a backslash
>
> Is this complex enough that I basically need the "parser" module of
Python?
>
> Problem is I'm scared of it. Anyone got any Parser Tutorials Howtos/Links?
>
> Or is this beasty solveable by judicious use of Regular Expressions?
>
> While I'm taking up bandwidth, I'll ask another silly question:
>
> Is there a "compressed dbShelve" out there anywhere? In this case I just
> want to store arrays and dictionaries of built-in Python types, in a
> compressed manner, in a bsd database. Anyone heard of something like this?
>
> Warren
>
>
>
>
>
More information about the Python-list
mailing list