[XML-SIG] I am confused...
Roman Suzi
rnd@onego.ru
Mon, 29 Jan 2001 13:59:29 +0300 (MSK)
On Sun, 28 Jan 2001, Martin v. Loewis wrote:
I do not remember if this was what I used for measuring, but
this was another effort of mine to create query mechanisms.
(It doesn't work anymore due to the lack of xml.dom.utils.)
--------------------
#!/usr/bin/python1.5
# Startup banner.
print "1. simple"
# FileReader is used below (and in PYQL) to load the XML file.
# createDocument, split and index are not used by the code that follows
# (index only appears in a commented-out line of _parse_query).
from xml.dom.utils import FileReader
from xml.dom.core import createDocument
from string import split, index
# Numeric node-type codes.  The values match the nodeType constants of the
# W3C DOM Level 1 Node interface.  Only DOCUMENT is referenced later, where
# it is compared against the result of get_nodeType().
ELEMENT = 1
ATTRIBUTE = 2
TEXT = 3
CDATA_SECTION = 4
ENTITY_REFERENCE = 5
ENTITY = 6
PROCESSING_INSTRUCTION = 7
COMMENT = 8
DOCUMENT = 9
DOCUMENT_TYPE = 10
DOCUMENT_FRAGMENT = 11
NOTATION = 12
# Load the sample document once at module level.  Neither `d` nor `dom` is
# referenced again in this script; PYQL reads the file itself.
d = FileReader()
dom = d.readFile('104.xml')
def portr(node):
typ = node.get_nodeType()
value = node.get_nodeValue()
name = node.get_nodeName()
atts = node.get_attributes()
par = node.get_parentNode()
print "t ", typ, "v ",value, "n ",name, "a ", atts, "p ", par
class strstream:
    """Minimal file-like reader over an in-memory string.

    _parse_query hands an instance to shlex as its input stream.  Both
    read() and readline() consume what they return, so repeated calls
    eventually yield "" (end of input).
    """

    def __init__(self, str):
        """Wrap *str*; the not-yet-consumed text is kept in ``self.str``."""
        self.str = str

    def read(self, n):
        """Return the next *n* characters (fewer at the end) and consume them."""
        tmp = self.str[:n]
        self.str = self.str[n:]
        return tmp

    def readline(self):
        """Return the next line, including its trailing newline if present.

        The returned text is consumed.  At end of input "" is returned, so a
        loop that reads until an empty line terminates.
        """
        # Scan by hand instead of using string methods: the script targets
        # python1.5 (see its shebang), where strings have no methods.
        end = 0
        for ch in self.str:
            end = end + 1
            if ch == "\n":
                break
        return self.read(end)
def _normalize_tokens(tl):
""" rules:
$,word,$ --> $word$
"""
rules2 = {
("/","/") : "//",
(".","/") : "./",
("!","=") : "$ne$",
("<","=") : "$le$",
(">","=") : "$ge$",
("=","~") : "$match$",
("!","~") : "$no_match$",
(";",";") : ";",
}
rules1 = {
"=" : "$eq$",
"!" : "$lt$",
"<" : "$lt$",
">" : "$gt$",
}
ntl = []
i = 0
while i < len(tl)-1:
if rules2.has_key( tuple(tl[i:i+2]) ):
toapp = rules2[tuple(tl[i:i+2])]
i = i+2
else:
if tl[i] == "$":
if i+2 < len(tl):
toapp = tl[i] + tl[i+1] + tl[i+2]
i = i+3
else:
raise "Query error !!!" + `tl`
else:
toapp = tl[i]
i = i+1
if rules1.has_key( toapp ):
toapp = rules1[toapp]
ntl.append( toapp )
return ntl
def _parse_query(q):
    """Tokenize the query string *q* and return its normalized tokens.

    The string is wrapped in a strstream, split into tokens by shlex, and
    the raw token list is passed through _normalize_tokens.
    """
    from shlex import shlex

    lexer = shlex(strstream(q))
    raw_tokens = []
    while 1:
        token = lexer.get_token()
        if not token:
            # A false token ends the scan.
            break
        raw_tokens.append(token)
    return _normalize_tokens(raw_tokens)
def find_all_descendants(node, cond):
    """Placeholder: not implemented yet, always returns None."""
    return None  # XXX !!! stub
def find_all_children(node, cond):
    """Return the direct children of *node* accepted by a condition.

    *cond* is Python source text that must define a one-argument function
    named ``condition``.  It is executed on every call, and each node from
    ``node.get_childNodes()`` for which ``condition(child)`` is true is
    collected, in order.

    Raises NameError if *cond* does not define ``condition``.
    """
    # NOTE(security): cond is executed as code.  Never pass text that
    # comes from an untrusted source.
    namespace = {}
    # Run cond against the module globals but capture its definitions in a
    # private dict.  This keeps cond from clobbering this function's locals
    # and does not rely on exec writing into them (it does not on Python 3).
    exec(cond, globals(), namespace)
    try:
        condition = namespace["condition"]
    except KeyError:
        raise NameError("cond must define a function named 'condition'")
    lst = []
    for n in node.get_childNodes():
        if condition(n):
            lst.append(n)
    return lst
class PYQL:
def __init__(self, file):
d = FileReader()
self.dom = d.readFile(file)
if self.dom.get_nodeType() == DOCUMENT:
self.docel = self.dom.get_documentElement()
def query(self, q):
# return self._query(self.docel, q)
# return _parse_query(q)
qr = self._query(self.docel, _parse_query(q), self.dom ) # ???
qel = self.dom.createElement("xql:result")
if qr:
qel.appendChild(qr)
qel.setAttribute("orig", str(q))
return qel
def _query(self, node, subq, qrdoc):
# print subq
print find_all_children(node,
"""def condition(n): return n.get_nodeName() == "fig" """)
if subq[0] == "//":
self._query(node, subq[1:], qrdoc)
elif subq[0] == "/":
if subq[1] == node.get_nodeName():
if len(subq) > 2:
if subq[2] == "/":
qel = qrdoc.createElement(node.get_nodeName())
for a in node.get_attributes().keys():
qel.setAttribute(a, node.get_attributes()[a].get_nodeValue())
for node1 in node.get_childNodes():
q2 = self._query(node1, subq[2:], qrdoc)
# print "q2: ", q2
if q2:
qel.appendChild(q2)
if len(qel.get_childNodes())==0:
del qel
return None
else:
return qel
else:
return node
else:
return node
else:
return None
# Script entry point: load the sample document into a PYQL instance.
a = PYQL('104.xml')
# Other queries tried during development (disabled):
# a.query('$or$ != 1.23E-4 /article/text/topic$')
# print a.query('/article/text/topic.').toxml()
# Run one path query and print the resulting element as XML.
print a.query('/article/text/figures/fig.').toxml()
# print a.query('//fig.').toxml()
-----------
It was a naive attempt to write XQL for Python...
>> I remember I was doing queries in the form
>> "/article/author/name"
>> - and it was so slow... (0.5 - 1 sec per query on Celeron 400)
>
>What kind of API did you use? For simple queries like this, a SAX
>ContentHandler may be sufficient. Using Uche's bigxml file, you can
>try
>import xml.sax
>class NameRetriever(xml.sax.ContentHandler):
> def __init__(self):
> self.authors = []
> self.in_author = self.in_name = 0
>
> def startElement(self, tag, attrs):
> if tag=="author":
> self.in_author = 1
> else:
> if self.in_author and tag == "name":
> self.in_name = 1
> self.txt = ""
>
> def characters(self,str):
> if self.in_name:
> self.txt = self.txt+str
>
> def endElement(self,tag):
> if self.in_name and tag=="name":
> self.authors.append(self.txt)
> self.in_name=0
> elif self.in_author and tag=="author":
> self.in_author=0
>
>h = NameRetriever()
>start=time.time();xml.sax.parse("bigxml",handler=h);end = time.time()
>print end - start
>print len(h.authors)
The above code is what I try to avoid. I want my application to be
completely data-driven, so even "/article/author/name" must not appear in
the program!
>To my own surprise, this is not as fast as the cDomlette; probably
>because the latter links directly with expat, and thus avoids a number
>of indirections. Still, it takes only three times as long (0.5s vs
>1.4s on my machine), and it will work on any Python 2.0 installation.
>
>> Please, tell me if I did it wrong:
>>
>> - parsed xml-file
>> - queried each variable in a template-file from the xml-file
>> - filled template with values found to produce web-page
>> (some variables go to other pages, for example, content page)
>
>In general, that is ok - except that the description is imprecise. How
>did you parse? How did you query? How did you fill the template?
My code above answers these questions.
>> Anyway, before claiming XML tools for Python slow I need to recheck
>> with new versions - if there are no objections to the above
>> scheme. (And what is the preferable tool for queries? XPath?)
>
>It depends. A SAX ContentHandler may do in many cases - although it is
>apparently not necessarily faster than XPath over a fast DOM
>implementation.
>> Is there any on-line tutorial (?) or just example code
>> to learn how to work efficiently with XML from Python?
>
>To learn PyXML, there is an online tutorial on the PyXML topic
>guide. To learn working efficiently is probably not something that can
>be taught in a tutorial - that is much a matter of experience.
Thanks! I shall look there too.
>Regards,
>Martin
Sincerely yours, Roman Suzi
--
Vote for my design: http://silvermouse.onego.ru/gray.php3?id=0018
_/ Russia _/ Karelia _/ Petrozavodsk _/ rnd@onego.ru _/
_/ Monday, January 29, 2001 _/ Powered by Linux RedHat 6.2 _/
_/ "The tuna doesn't taste the same without the dolphin." _/