Unicode support in python
sonald
sonaldgr8 at gmail.com
Wed Oct 25 05:04:56 EDT 2006
Fredrik Lundh wrote:
>
> what does the word "validate" mean here?
>
Let me explain our module.
We receive text files (comma-separated values, in a predefined format)
from a third party.
For example, an account file comes as "abc.acc" (.acc is the extension
for account files in our code),
and it must contain account_code, account_description and
account_balance, in that order.
So a text file ("abc.acc") containing 2 or more records will look like:
A001, test account1, 100000
A002, test account2, 500000
We may have multiple .acc files.
Our job is to validate the incoming data (the datatype of each field,
the number of fields, etc.) and to copy all the error-free records into acc.txt.
For this we use a schema, as follows:
----------------------------------------------------------------------------------------------------------
if account_flg == 1:
    start = time()
    # the input fields
    acct_schema = {
        0: Text('AccountCode', 50),
        1: Text('AccountDescription', 100),
        2: Text('AccountBalance', 50)
    }
    validate( schema = acct_schema,
              primary_keys = [acct_pk],
              infile = '../data/ACC/*.acc',
              outfile = '../data/acc.txt',
              update_freq = 10000)
----------------------------------------------------------------------------------------------------------
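(Text is one of our field types. In case it helps to picture it, here is a
rough, simplified sketch of what such a field type might look like -- this is
not the actual class from core.py, just an assumption that the second
argument is a maximum length:)
------------------------------------------------------------
# simplified sketch of a Text field type (not the real class from core.py);
# assumes the second constructor argument is a maximum length
class Text(object):
    def __init__(self, name, max_length):
        self.name = name
        self.max_length = max_length

    def validate(self, value):
        # a text field is valid if the value is a string that fits
        return isinstance(value, str) and len(value.strip()) <= self.max_length
------------------------------------------------------------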
In core.py we have defined a function validate, which checks the
datatypes and performs the other validations.
All the erroneous records are copied to an error log file, and the
correct records are copied to a clean acc.txt file.
The validate function is given below...
---------------------------------------------------------------------------------------------------------------------------
def validate(infile, outfile, schema, primary_keys=[], foreign_keys=[],
             record_checks=[], buffer_size=0, update_freq=0):
    show("initializing ... ")
    # find matching input files
    all_files = glob.glob(infile)
    if not all_files:
        raise ValueError('No input files were found.')
    # initialize data structures
    freq = update_freq or DEFAULT_UPDATE
    input = fileinput.FileInput(all_files, bufsize = buffer_size
                                or DEFAULT_BUFFER)
    output = open(outfile, 'wb+')
    logs = {}
    for name in all_files:
        logs[name] = open(name + DEFAULT_SUFFIX, 'wb+')
        #logs[name] = open(name + DEFAULT_SUFFIX, 'a+')
    errors = []
    num_fields = len(schema)
    pk_length = range(len(primary_keys))
    fk_length = range(len(foreign_keys))
    rc_length = range(len(record_checks))
    # initialize the PKs and FKs with the given schema
    for idx in primary_keys:
        idx.setup(schema)
    for idx in foreign_keys:
        idx.setup(schema)
    # start processing: collect all lines which have errors
    for line in input:
        rec_num = input.lineno()
        if rec_num % freq == 0:
            show("processed %d records ... " % (rec_num))
            for idx in primary_keys:
                idx.flush()
            for idx in foreign_keys:
                idx.flush()
        if BLANK_LINE.match(line):
            continue
        try:
            data = csv.parse(line)
            # check number of fields
            if len(data) != num_fields:
                errors.append( (rec_num, LINE_ERROR,
                                'incorrect number of fields') )
                continue
            # check for well-formed fields
            fields_ok = True
            for i in range(num_fields):
                if not schema[i].validate(data[i]):
                    errors.append( (rec_num, FIELD_ERROR, i) )
                    fields_ok = False
                    break
            # check the PKs
            for i in pk_length:
                if fields_ok and not primary_keys[i].valid(rec_num, data):
                    errors.append( (rec_num, PK_ERROR, i) )
                    break
            # check the FKs
            for i in fk_length:
                if fields_ok and not foreign_keys[i].valid(rec_num, data):
                    #print 'here ---> %s, rec_num : %d' % (data, rec_num)
                    errors.append( (rec_num, FK_ERROR, i) )
                    break
            # perform record-level checks
            for i in rc_length:
                if fields_ok and not record_checks[i](schema, data):
                    errors.append( (rec_num, REC_ERROR, i) )
                    break
        except fastcsv.Error, err:
            errors.append( (rec_num, LINE_ERROR, err.__str__()) )
    # finalize the indexes to check for any more errors
    for i in pk_length:
        error_list = primary_keys[i].finalize()
        primary_keys[i].save()
        if error_list:
            errors.extend( [ (rec_num, PK_ERROR, i) for rec_num in
                             error_list ] )
    for i in fk_length:
        error_list = foreign_keys[i].finalize()
        if error_list:
            errors.extend( [ (rec_num, FK_ERROR, i) for rec_num in
                             error_list ] )
    # sort the list of errors by the cumulative line number
    errors.sort( lambda l, r: cmp(l[0], r[0]) )
    show("saving output ... ")
    # reopen input and sort it into either the output file or error log file
    input = fileinput.FileInput(all_files, bufsize = buffer_size
                                or DEFAULT_BUFFER)
    error_list = iter(errors)
    count = input.lineno
    filename = input.filename
    line_no = input.filelineno
    try:
        line_num, reason, i = error_list.next()
    except StopIteration:
        line_num = -1
    for line in input:
        line = line + '\r\n'
        #print '%d,%d' % (line_num, count())
        if line_num == count():
            if reason == FIELD_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_FIELD % (schema[i].name), line))
            elif reason == LINE_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(), i, line))
            elif reason == PK_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_PK % (primary_keys[i].name), line))
            elif reason == FK_ERROR:
                #print 'Test FK %s, rec_num : %d, line : %s' % (foreign_keys[i].name, line_no(), line)
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_FK % (foreign_keys[i].name), line))
            elif reason == REC_ERROR:
                logs[filename()].write(ERROR_FORMAT % (line_no(),
                        INVALID_REC % (record_checks[i].__doc__), line))
            else:
                raise RuntimeError("shouldn't reach here")
            try:
                #print 'CURRENT ITERATION, line_num : %d, line : %s' % (line_num, line)
                line_num1 = line_num
                line_num, reason, i = error_list.next()
                if line_num1 == line_num:
                    line_num, reason, i = error_list.next()
                #print 'FOR NEXT ITERATION, line_num : %d, line : %s' % (line_num, line)
            except StopIteration:
                line_num = -1
            continue
        if not BLANK_LINE.match(line):
            output.write(line)
    output.close()
    for f in logs.values():
        f.close()
-----------------------------------------------------------------------------------------------------------------------------
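(A small note on the fileinput calls used above, since the two kinds of line
numbers matter here: lineno() is the cumulative line number across all the
matched files, filelineno() restarts at 1 for each file, and filename()
returns the file currently being read. For example, with two made-up file
names:)
------------------------------------------------------------
import fileinput

# 'a.acc' and 'b.acc' are just example names
fi = fileinput.FileInput(['a.acc', 'b.acc'])
for line in fi:
    # fi.lineno()     -> cumulative line number across both files
    # fi.filelineno() -> line number within the current file
    # fi.filename()   -> the file currently being read
    print fi.lineno(), fi.filelineno(), fi.filename()
------------------------------------------------------------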
Now, when I open the error log file, it contains the error message for
each erroneous record, along with the original record copied from the
*.acc file.
But each such record is preceded by a box-like character.
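One thing I can check, if it helps, is what that character actually is, by
printing the raw bytes at the start of one of the .acc files -- it might be a
Unicode byte order mark (BOM), since the files come from a third party. A
minimal check (the path below is just an example):
------------------------------------------------------------
# print the first few raw bytes of an input file to see what the
# "box" character really is; the path is only an example
f = open('../data/ACC/abc.acc', 'rb')
print repr(f.read(4))
f.close()
# '\xef\xbb\xbf' at the start would be a UTF-8 BOM,
# '\xff\xfe' or '\xfe\xff' would be a UTF-16 BOM
------------------------------------------------------------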
Do you want me to post the complete code, just in case?
It might help you understand my problem better.
Please let me know soon.