Unicode support in python
sonald
sonaldgr8 at gmail.com
Wed Oct 25 05:04:56 EDT 2006
Fredrik Lundh wrote:
>
> what does the word "validate" mean here?
>
Let me explain our module.
We receive text files (comma-separated values, in a predefined format)
from a third party.
For example, an account file comes as "abc.acc" (.acc is the extension
for account files in our code),
and it must contain account_code, account_description and
account_balance, in that order.
So a text file ("abc.acc") containing 2 or more records will look like:
A001, test account1, 100000
A002, test account2, 500000
We may have multiple .acc files.
Our job is to validate the incoming data (the datatype of each field,
the number of fields, etc.) and to copy all the error-free records into acc.txt.
For this we use a schema, as follows:
----------------------------------------------------------------------------------------------------------
if account_flg == 1:
    start = time()
    # the input fields
    acct_schema = {
        0: Text('AccountCode', 50),
        1: Text('AccountDescription', 100),
        2: Text('AccountBalance', 50)
    }
    validate( schema = acct_schema,
              primary_keys = [acct_pk],
              infile = '../data/ACC/*.acc',
              outfile = '../data/acc.txt',
              update_freq = 10000)
----------------------------------------------------------------------------------------------------------
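(Text is one of our field types. In case it helps to picture it, here is a
rough, simplified sketch of what such a field type might look like -- this is
not the actual class from core.py, just an assumption that the second
argument is a maximum length:)
------------------------------------------------------------
# simplified sketch of a Text field type (not the real class from core.py);
# assumes the second constructor argument is a maximum length
class Text(object):
    def __init__(self, name, max_length):
        self.name = name
        self.max_length = max_length

    def validate(self, value):
        # a text field is valid if the value is a string that fits
        return isinstance(value, str) and len(value.strip()) <= self.max_length
------------------------------------------------------------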
In core.py we have defined a function validate, which checks the
datatypes and performs the other validations.
All the erroneous records are copied to an error log file, and the
correct records are copied to a clean acc.txt file.
The validate function is given below...
---------------------------------------------------------------------------------------------------------------------------
def validate(infile, outfile, schema, primary_keys=[], foreign_keys=[],
             record_checks=[], buffer_size=0, update_freq=0):
    show("initializing ... ")
    # find matching input files
    all_files = glob.glob(infile)
    if not all_files:
        raise ValueError('No input files were found.')
    # initialize data structures
    freq = update_freq or DEFAULT_UPDATE
    input = fileinput.FileInput(all_files, bufsize = buffer_size
                                or DEFAULT_BUFFER)
    output = open(outfile, 'wb+')
    logs = {}
    for name in all_files:
        logs[name] = open(name + DEFAULT_SUFFIX, 'wb+')
        #logs[name] = open(name + DEFAULT_SUFFIX, 'a+')
    errors = []
    num_fields = len(schema)
    pk_length = range(len(primary_keys))
    fk_length = range(len(foreign_keys))
    rc_length = range(len(record_checks))
    # initialize the PKs and FKs with the given schema
    for idx in primary_keys:
        idx.setup(schema)
    for idx in foreign_keys:
        idx.setup(schema)
    # start processing: collect all lines which have errors
    for line in input:
        rec_num = input.lineno()
        if rec_num % freq == 0:
            show("processed %d records ... " % (rec_num))
            for idx in primary_keys:
                idx.flush()
            for idx in foreign_keys:
                idx.flush()
        if BLANK_LINE.match(line):
            continue
        try:
            data = csv.parse(line)
            # check number of fields
            if len(data) != num_fields:
                errors.append( (rec_num, LINE_ERROR,
                                'incorrect number of fields') )
                continue
            # check for well-formed fields
            fields_ok = True
            for i in range(num_fields):
                if not schema[i].validate(data[i]):
                    errors.append( (rec_num, FIELD_ERROR, i) )
                    fields_ok = False
                    break
            # check the PKs
            for i in pk_length:
                if fields_ok and not primary_keys[i].valid(rec_num, data):
                    errors.append( (rec_num, PK_ERROR, i) )
                    break
            # check the FKs
            for i in fk_length:
                if fields_ok and not foreign_keys[i].valid(rec_num, data):
                    #print 'here ---> %s, rec_num : %d' % (data, rec_num)
                    errors.append( (rec_num, FK_ERROR, i) )
                    break
            # perform record-level checks
            for i in rc_length:
                if fields_ok and not record_checks[i](schema, data):
                    errors.append( (rec_num, REC_ERROR, i) )
                    break
        except fastcsv.Error, err:
            errors.append( (rec_num, LINE_ERROR, err.__str__()) )
    # finalize the indexes to check for any more errors
    for i in pk_length:
        error_list = primary_keys[i].finalize()
        primary_keys[i].save()
        if error_list:
            errors.extend( [ (rec_num, PK_ERROR, i) for rec_num in
                             error_list ] )
    for i in fk_length:
        error_list = foreign_keys[i].finalize()
        if error_list:
            errors.extend( [ (rec_num, FK_ERROR, i) for rec_num in
                             error_list ] )
    # sort the list of errors by the cumulative line number
    errors.sort( lambda l, r: cmp(l[0], r[0]) )
    show("saving output ... ")
    # reopen input and sort it into either the output file or error log file
    input = fileinput.FileInput(all_files, bufsize = buffer_size
                                or DEFAULT_BUFFER)
    error_list = iter(errors)
    count = input.lineno
    filename = input.filename
    line_no = input.filelineno
    try:
        line_num, reason, i = error_list.next()
    except StopIteration:
        line_num = -1
    for line in input:
        line = line + '\r\n'
        #print '%d,%d' % (line_num, count())
        if line_num == count():
            if reason == FIELD_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_FIELD % (schema[i].name), line))
            elif reason == LINE_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(), i, line))
            elif reason == PK_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_PK % (primary_keys[i].name), line))
            elif reason == FK_ERROR:
                #print 'Test FK %s, rec_num : %d, line : %s' % (foreign_keys[i].name, line_no(), line)
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_FK % (foreign_keys[i].name), line))
            elif reason == REC_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_REC % (record_checks[i].__doc__), line))
            else:
                raise RuntimeError("shouldn't reach here")
            try:
                #print 'CURRENT ITERATION, line_num : %d, line : %s' % (line_num, line)
                line_num1 = line_num
                line_num, reason, i = error_list.next()
                if line_num1 == line_num:
                    line_num, reason, i = error_list.next()
                #print 'FOR NEXT ITERATION, line_num : %d, line : %s' % (line_num, line)
            except StopIteration:
                line_num = -1
            continue
        if not BLANK_LINE.match(line):
            output.write(line)
    output.close()
    for f in logs.values():
        f.close()
-----------------------------------------------------------------------------------------------------------------------------
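(A small note on the fileinput calls used above, since the two kinds of line
numbers matter here: lineno() is the cumulative line number across all the
matched files, filelineno() restarts at 1 for each file, and filename()
returns the file currently being read. For example, with two made-up file
names:)
------------------------------------------------------------
import fileinput

# 'a.acc' and 'b.acc' are just example names
fi = fileinput.FileInput(['a.acc', 'b.acc'])
for line in fi:
    # fi.lineno()     -> cumulative line number across both files
    # fi.filelineno() -> line number within the current file
    # fi.filename()   -> the file currently being read
    print fi.lineno(), fi.filelineno(), fi.filename()
------------------------------------------------------------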
Now, when I open the error log file, it contains the error message for
each erroneous record, along with the original record copied from the
*.acc file.
But each such record is preceded by a box-like character.
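One thing I can check, if it helps, is what that character actually is, by
printing the raw bytes at the start of one of the .acc files -- it might be a
Unicode byte order mark (BOM), since the files come from a third party. A
minimal check (the path below is just an example):
------------------------------------------------------------
# print the first few raw bytes of an input file to see what the
# "box" character really is; the path is only an example
f = open('../data/ACC/abc.acc', 'rb')
print repr(f.read(4))
f.close()
# '\xef\xbb\xbf' at the start would be a UTF-8 BOM,
# '\xff\xfe' or '\xfe\xff' would be a UTF-16 BOM
------------------------------------------------------------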
Do you want me to post the complete code, just in case?
It might help you understand my problem better.
Please let me know soon.