Regular Expression

patrick.waldo at gmail.com patrick.waldo at gmail.com
Sat Oct 27 14:19:18 CEST 2007


Finally I solved the problem, with some really minor things to tweak.
I guess it's true that I had two problems working with regular
expressions.

Thank you all for your help.  I really learned a lot on quite a
difficult problem.

Final Code:

#For text files in a directory...
#Converts a randomly organized UTF-8 document containing EINECS, CAS,
#Chemical, and Chemical Formula tokens
#into a document structured as EINECS|CAS|Chemical|Chemical Formula.

import os
import codecs
import re

# Directory scanned for input text files, and the directory the
# restructured copies are written to.
path = "C:\\text_samples\\text\\"
path2 = "C:\\text_samples\\text\\output\\"
# EINECS number: three digits, three digits, one digit (e.g. 200-001-8).
EINECS = re.compile(r'^\d\d\d-\d\d\d-\d$')
# CAS number: a (possibly empty) run of digits, two digits, one digit
# (e.g. 50-00-0).
CAS = re.compile(r'^\d*-\d\d-\d$')
# Formula-like token: an uppercase letter followed by at least three more
# alphanumerics, with an optional '.' and an optional '/' between them.
# NOTE(review): this also matches ordinary capitalized words of four or more
# letters, so a name ending in such a word is treated as having a formula.
FORMULA = re.compile(
    r'([A-Z][A-Za-z0-9]+\.?[A-Za-z0-9]+/?[A-Za-z0-9]+)')


def _finish_record(product):
    """Merge the name tokens of one collected record into a single field.

    The first two tokens are kept untouched (they are expected to be the
    EINECS and CAS numbers).  The list is modified in place and returned.
    """
    if FORMULA.match(product[-1]):
        # Last token looks like a formula: keep it as its own trailing field
        # and join everything between the CAS number and it.
        product[2:-1] = [' '.join(product[2:-1])]
    else:
        # No formula: every token after the CAS number belongs to the name.
        product[2:] = [' '.join(product[2:])]
    return product


def iter_elements(tokens):
    """Group a flat token stream into one list of fields per record.

    A new record starts at every token matching ``EINECS`` once the record
    being collected already holds at least four tokens.  Each yielded record
    is ``[first, second, name, formula]`` when its last token matches
    ``FORMULA`` and ``[first, second, name]`` otherwise, where ``name`` is
    the space-joined run of tokens in between.

    The trailing record is normalized the same way as the others; a trailing
    record with fewer than four tokens is yielded unchanged, and nothing is
    yielded for an empty token stream.
    """
    product = []
    for tok in tokens:
        if EINECS.match(tok) and len(product) >= 4:
            yield _finish_record(product)
            product = []
        product.append(tok)
    # The last record has no following EINECS token to trigger it, so it is
    # finished here instead of being emitted as raw tokens.
    if len(product) >= 4:
        yield _finish_record(product)
    elif product:
        yield product

# Convert every text file directly inside `path`, writing the restructured
# result under the same file name in `path2`.
for text in os.listdir(path):
    input_text = os.path.join(path, text)
    output_text = os.path.join(path2, text)
    # `path2` is a subdirectory of `path`, so the listing includes it; skip
    # anything that is not a regular file.
    if not os.path.isfile(input_text):
        continue
    # Each file is closed as soon as it has been processed, even on error.
    with codecs.open(input_text, 'r', 'utf8') as source:
        tokens = source.read().split()
    with codecs.open(output_text, 'w', 'utf8') as target:
        for element in iter_elements(tokens):
            # One record per line, fields separated by '|'.
            target.write('|'.join(element))
            target.write("\r\n")




More information about the Python-list mailing list