parsing CSV files with quotes

Bob Alexander bobalex at home.com
Thu Mar 30 12:47:05 EST 2000


Here's a Python module I wrote to manage CSVs, and I think it handles the
cases you care about:

------------------- CSV.py ---------------------
#
# Functions for handling comma-separated value (CSV) files
#
# Lines of a CSV file are represented by lists of their fields.
# Fields are represented as strings.
#
# Comments on handling of special characters in CSV:
#
# - Output fields are quoted only if the text contains a quote or a comma
# - Quotes in quoted text are doubled
# - When parsing, a quote appearing after the first character of a field
#   is treated as normal text (only fields that begin with a quote are
#   parsed as quoted fields)
#
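# Example (a quick illustration of the round-trip behavior of FormLine()
# and ParseLine() defined below):
#
#   FormLine(['a', 'b,c', 'say "hi"'])   ->  'a,"b,c","say ""hi"""'
#   ParseLine('a,"b,c","say ""hi"""\n')  ->  ['a', 'b,c', 'say "hi"']
#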


import re


FieldPat = re.compile(r'(?!$)(?:"((?:.*?"")*?[^"]*)"|([^,]*))(?:[,\n]|$)')
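# The pattern above matches one field starting at the given position:
# either a quoted field (group 1), whose body may contain doubled quotes,
# or an unquoted field (group 2), which runs up to the next comma. The
# match must end at a comma, a newline, or the end of the string. The
# leading (?!$) lookahead refuses an empty match at the end of the line,
# which is what makes the loop in ParseLine() terminate.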
Quote = re.compile('"')
TwoQuotes = re.compile('""')
CommaOrQuote = re.compile('[,"]')


#
# Reads file "fileName" and returns a list of CSV records.
#
def ReadFile(fileName):
 f = open(fileName)
 return ReadLines(f)


#
# Reads remaining lines from "file" and returns a list of lists of fields.
#
def ReadLines(file):
 result = []
 while 1:
  rec = ReadLine(file)
  if rec is None:
   break
  result.append(rec)
 return result


#
# Reads a line from "file" and returns a list of fields.
#
def ReadLine(file):
 line = file.readline()
 if not line:
  return None
 return ParseLine(line)


#
# Parses a CSV line, returning a list of fields.
#
def ParseLine(line):
 rec = []
 p = 0
 while 1:
  match = FieldPat.match(line, p)
  if not match:
   break
  field = match.group(2)
  if field is None:
   field = TwoQuotes.sub('"', match.group(1))
  rec.append(field)
  p = match.end()
 return rec


#
# Creates a CSV file from a list of records (lists of fields).
#
def WriteFile(fileName, recList):
 f = open(fileName, 'w')
 WriteLines(f, recList)


#
# Writes a list of records (lists of fields) to "file".
#
def WriteLines(file, recList):
 for rec in recList:
  WriteLine(file, rec)


#
# Writes a CSV line from record "rec" (a list of fields).
#
def WriteLine(file, rec):
 file.write(FormLine(rec) + '\n')


#
# Forms a CSV line (string) from record "rec" (a list of fields).
#
def FormLine(rec):
 line = ''
 sep = ''
 for field in rec:
  if CommaOrQuote.search(field):
   field = '"' + Quote.sub('""', field) + '"'
  line = line + sep + field
  sep = ','
 return line


#
# Adds CSV fields to a map. "values" is a record as returned from
# ParseLine(). "keys" is a list of field names. "map" is the map to
# which entries are to be added. If "map" is omitted a new map is
# created. The modified map is returned.
#
def ToMap(values, keys, map = None):
 if map is None:
  map = {}
 for i in xrange(min(len(values), len(keys))):
  map[keys[i]] = values[i]
 return map


#
# Adds CSV fields to an object as properties. "values" is a record as
# returned from ParseLine(). "fieldNames" is a list of field names
# (which must conform to Python name syntax). "object" is the object to
# which properties are to be added. The modified object is returned.
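#
# For example (illustrative only), with a minimal class:
#
#   class Rec: pass
#   r = ToObject(['1', 'Postma, Warren'], ['ID', 'NAME'], Rec())
#   r.ID    ->  '1'
#   r.NAME  ->  'Postma, Warren'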
#
def ToObject(values, fieldNames, object):
 ToMap(values, fieldNames, object.__dict__)
 return object


#
# Test program
#
if __name__ == '__main__':
 import sys, os
 print os.getcwd()
 for fileName in sys.argv[1:]:
  print fileName, '-'
  x = ReadFile(fileName)
  for rec in x:
   print rec
  WriteLines(sys.stdout, x)
 print ParseLine('"aa","bb"\n')  # try line with trailing \n
 print ParseLine('aa,bb')  # try line with no trailing \n
 print ParseLine('a"a,"b""b"')  # try embedded and doubled quotes
 print ToMap(['value1', 'value2'], ['name1', 'name2'])
 print ToMap(['value1', 'value2'], ['name1', 'name2', 'name3'])
 print ToMap(['value1', 'value2', 'value3'], ['name1', 'name2'])
-----------------------------------------------------
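
For example (a quick sketch, assuming the module above is saved as CSV.py
somewhere on your path), a record with embedded commas and quotes
round-trips cleanly; note that quotes are doubled on output rather than
backslash-escaped:

    import CSV

    rec = ['4', 'Austin, "Stone Cold" Steve', '34']
    line = CSV.FormLine(rec)
    print line                 # 4,"Austin, ""Stone Cold"" Steve",34
    print CSV.ParseLine(line)  # ['4', 'Austin, "Stone Cold" Steve', '34']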

----- Original Message -----
From: "Warren Postma" <embed at geocities.com>
Newsgroups: comp.lang.python
To: <python-list at python.org>
Sent: Thursday, March 30, 2000 8:50 AM
Subject: parsing CSV files with quotes


> Suppose I have a CSV file where line 1 is the column names, and lines 2..n
> are comma separated variables, where all String fields are quoted like
> this:
>
> ID, NAME, AGE
> 1, "Postma, Warren", 30
> 2, "Twain, Shania",  31
> 3, "Nelson, Willy",  57
> 4, "Austin, \"Stone Cold\" Steve", 34
>
> So, the obvious thing I tried is:
>
> import string
> >>> print string.splitfields("4, \"Austin, \\\"Stone Cold\\\" Steve,
> 34",",")
> ['4', ' "Austin', ' \\"Stone Cold\\" Steve', ' 34']
>
> Hmm. Interesting. So I tried this:
>
> >>> print string.splitfields(r'4, "Austin, \"Stone Cold\" Steve", 34')
> ['4,', '"Austin,', '\\"Stone', 'Cold\\"', 'Steve",', '34']
>
> I'm getting close, I can feel it!
>
> The Rules:
>
> 1. All integer and other fields are output as ascii.
> 2. String fields have quotes. Commas are allowed inside the quotes.
> 3. Quotes inside quotes are escaped by a backslash
> 4. Backslashes are themselves quoted by a backslash
>
> Is this complex enough that I basically need the "parser" module of
> Python?
>
> Problem is I'm scared of it. Anyone got any Parser Tutorials Howtos/Links?
>
> Or is this beasty solveable by judicious use of Regular Expressions?
>
> While I'm taking up bandwidth, I'll ask another silly question:
>
> Is there a "compressed dbShelve" out there anywhere? In this case I just
> want to store arrays and dictionaries of built-in Python types, in a
> compressed manner, in a bsd database. Anyone heard of something like this?
>
> Warren





