Extract sentences in nested parentheses using Python
A S
aishan0403 at gmail.com
Mon Dec 2 10:40:41 EST 2019
I am trying to extract all strings in nested parentheses (along with the parentheses themselves) from my .txt file. The sample .txt file used in this example is available here: (https://drive.google.com/open?id=1UKc0ZgY9Fsz5O1rSeBCLqt5dwZkMaQgr).
I have written three different versions of the code, but none of them is able to extract all of the nested parentheses; each one extracts only a portion of them. Any advice on what I've done wrong would really help!
Here are the three attempts I have made so far:
1st attempt:
import re
from os.path import join
def balanced_braces(args):
    """Collect the text inside each balanced top-level parenthesised group.

    Every string in *args* is scanned on its own, so a group that opens in
    one string and closes in a later one is never reported.  When an open
    file is passed, the strings are its lines; pass a one-element list
    holding the whole text to match groups that span several lines.

    Args:
        args: Iterable of strings to scan.

    Returns:
        A list with one entry per balanced top-level group, in order of
        appearance.  The enclosing parentheses are dropped, nested
        parentheses are kept, and surrounding whitespace is stripped.
    """
    parts = []
    for arg in args:
        if '(' not in arg:
            continue
        chars = []
        depth = 0
        for c in arg:
            if c == '(':
                # The outermost opener is a delimiter, not content.
                if depth > 0:
                    chars.append(c)
                depth += 1
            elif c == ')':
                if depth == 0:
                    # Stray closer: ignore it so the depth never goes
                    # negative and hides later groups in the same string.
                    continue
                depth -= 1
                if depth > 0:
                    chars.append(c)
                else:
                    parts.append(''.join(chars).strip())
                    chars = []
            elif depth > 0:
                chars.append(c)
    return parts
# Passing the open file makes balanced_braces() scan it one line at a time,
# so only groups that open and close on the same line are reported.
with open('lan sample text file.txt','r') as fd:
    t1 = balanced_braces(fd)
print(t1)
Output:
['"xE\'", PUT(xx.xxxx.),"\'"', '"TRUuuuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.", '"xE\'", PUT(xx.xxxx.),"\'"', '"CUuuiiiiuth"', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
2nd attempt:
from pyparsing import nestedExpr
# Grammar for a parenthesised group, including any groups nested inside it.
matchedParens = nestedExpr('(',')')
with open('lan sample text file.txt','r') as fd:
    # Each line is searched separately, so a group that spans several lines
    # is not matched as a whole.
    for words in fd:
        for e in matchedParens.searchString(words):
            print(e)
Output:
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"TRUuuuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
[['"xE\'"', ',', 'PUT', ['xx.xxxx.'], ',', '"\'"']]
[['"CUuuiiiiuth"']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'gff', '&jfjfsj_jfjfj.']]
[['xxx', ['xx_ix', 'as', 'format', "'xxxx-xx'"], 'lec', '&jgjsd_vnv.']]
3rd attempt:
def parse_segments(source, recurse=False):
    """Split *source* into text outside and inside top-level parentheses.

    Args:
        source: The string to scan.
        recurse: When true, the content of every parenthesised segment is
            parsed again with the same rules and its pieces are added to
            the result instead of the raw content.

    Returns:
        A list of strings: first the non-empty pieces of text found outside
        any balanced top-level group (in order, including any unterminated
        tail), followed by the content of each balanced top-level group
        with its enclosing parentheses removed.
    """
    finished = []
    segments = []
    depth = 0          # number of currently unmatched '('
    start_pos = 0      # start of the text not yet assigned to any piece
    open_pos = 0       # index of the '(' that opened the current group
    opened = False

    for cur_pos, character in enumerate(source):
        if character == '(':
            depth += 1
            if not opened:
                open_pos = cur_pos
                opened = True
        elif character == ')' and depth > 0:
            # A ')' with nothing open is ignored; letting the depth go
            # negative would make the next '(' look like a complete group.
            depth -= 1

        if opened and depth == 0:
            # The group that started at open_pos has just been closed.
            segments.append(source[open_pos:cur_pos + 1])
            clean = source[start_pos:open_pos]
            if clean:
                finished.append(clean)
            opened = False
            start_pos = cur_pos + 1

    if start_pos != len(source):
        # Keep whatever follows the last closed group.
        finished.append(source[start_pos:])

    for item in segments:
        # Drop the bounding parentheses.
        pruned = item[1:-1]
        if recurse:
            finished.extend(parse_segments(pruned, recurse))
        else:
            finished.append(pruned)
    return finished
with open('lan sample text file.txt','r') as fd:
    # Each line is parsed separately, so a group that spans several lines
    # is never seen as balanced.
    for words in fd:
        t = parse_segments(words)
        print(t)
Output:
['kkkkk;\n']
['\n']
[' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"TRUuuuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(\n']
['SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
[' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );\n']
['\n']
['\n']
['jjjjjj;\n']
['\n']
[' select xx', ' jdfjhf:jhfjj from xxxx_x_xx_L ;\n', '"xE\'", PUT(xx.xxxx.),"\'"']
['quit; \n']
['\n']
['/* 1.xxxxx FROM xxxx_x_Ex_x */ \n']
['proc sql; ', ';\n', '"CUuuiiiiuth"']
['hhhjhfjs as fdsjfsj:\n']
['select * from djfkjd to jfkjs\n']
['(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj\n']
['\tFROM &xxx..xxx_xxx_xxE\n']
["where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and \n"]
[' ', ')\n', "xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv."]
[' );']
The intended output, which I have been unable to produce, should look something like this:
("xE'", PUT(xx.xxxx.),"'")
("TRUuuuth")
(
SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))
)
("xE'", PUT(xx.xxxx.),"'")
("CUuuiiiiuth")
(SELECT abc AS abc1, abc_2_ AS efg, abc_fg, fkdkfj_vv, jjsflkl_ff, fjkdsf_jfkj
FROM &xxx..xxx_xxx_xxE
where ((xxx(xx_ix as format 'xxxx-xx') gff &jfjfsj_jfjfj.) and
(xxx(xx_ix as format 'xxxx-xx') lec &jgjsd_vnv.))(( ))
)
More information about the Python-list
mailing list