[XML-SIG] I am confused...

Roman Suzi rnd@onego.ru
Mon, 29 Jan 2001 13:59:29 +0300 (MSK)


On Sun, 28 Jan 2001, Martin v. Loewis wrote:

I do not remember whether this is what I used for measuring, but
this was another of my attempts to create a query mechanism.
(It doesn't work anymore because xml.dom.utils is no longer available.)

--------------------

#!/usr/bin/python1.5

# Banner printed once when the script starts.
print "1. simple"

# FileReader comes from xml.dom.utils, which the surrounding message says is
# no longer available.
from xml.dom.utils import FileReader
from xml.dom.core import createDocument
# NOTE(review): split and index are not used by the active code below
# (index appears only in a commented-out line).
from string import split, index

# Numeric node-type codes, compared against node.get_nodeType()
# (PYQL.__init__ checks for DOCUMENT).  The values 1..12 follow the
# W3C DOM nodeType numbering.
ELEMENT                 = 1
ATTRIBUTE               = 2
TEXT                    = 3
CDATA_SECTION           = 4
ENTITY_REFERENCE        = 5
ENTITY                  = 6
PROCESSING_INSTRUCTION  = 7
COMMENT                 = 8
DOCUMENT                = 9
DOCUMENT_TYPE           = 10
DOCUMENT_FRAGMENT       = 11
NOTATION                = 12

# Module-level parse of the sample document; this runs as soon as the script
# is loaded.  PYQL reads its file again with its own FileReader.
d = FileReader()
dom = d.readFile('104.xml')

def portr(node):
    typ = node.get_nodeType()
    value = node.get_nodeValue()
    name = node.get_nodeName()
    atts = node.get_attributes()
    par = node.get_parentNode()
    print "t ",   typ, "v ",value, "n ",name, "a ", atts, "p ", par

class strstream:
    """Minimal read-only, file-like wrapper around a string.

    It provides the read() and readline() methods so that a string can be
    handed to a lexer that expects an input stream (_parse_query passes an
    instance to shlex).  The text that has not been read yet is kept in
    the ``str`` attribute.
    """

    def __init__(self, str):
        # The parameter keeps its original name (``str``) for callers that
        # pass it by keyword; it shadows the builtin only inside __init__.
        self.str = str

    def read(self, n):
        """Return up to n characters and remove them from the stream.

        Returns "" once the text is exhausted.
        """
        chunk = self.str[:n]
        self.str = self.str[n:]
        return chunk

    def readline(self):
        """Return the next line (with its newline, if any) and consume it.

        Returns "" once the text is exhausted.  The previous version
        returned the remaining text without consuming it, so a caller that
        reads line by line could never reach the end of the input.
        """
        newline_at = self.str.find("\n")
        if newline_at < 0:
            # No newline left: the rest of the text is the last line.
            end = len(self.str)
        else:
            end = newline_at + 1
        line = self.str[:end]
        self.str = self.str[end:]
        return line

def _normalize_tokens(tl):
    """Merge raw lexer tokens into query operator tokens.

    rules:
    $,word,$ --> $word$
    two adjacent tokens listed in rules2 --> the combined token
    a single token listed in rules1      --> its named operator

    tl is a list of string tokens; a new list is returned and tl is not
    modified.

    The scan stops before the last index, so a final token that was not
    consumed as part of a two-token rule or a $word$ triple is not
    emitted.  The example queries in this file all end with a throwaway
    terminator character (such as "."), so this is kept as is.

    Raises ValueError when a "$" is not followed by two more tokens.
    (This used to be a string exception, which could not be caught by
    name and is rejected outright by later interpreters.)
    """
    rules2 = {
        ("/", "/"): "//",
        (".", "/"): "./",
        ("!", "="): "$ne$",
        ("<", "="): "$le$",
        (">", "="): "$ge$",
        ("=", "~"): "$match$",
        ("!", "~"): "$no_match$",
        (";", ";"): ";",
    }

    rules1 = {
        "=": "$eq$",
        # NOTE(review): "!" maps to the same operator as "<"; this looks
        # like a copy/paste slip -- confirm the intended operator.
        "!": "$lt$",
        "<": "$lt$",
        ">": "$gt$",
    }

    ntl = []
    i = 0
    last = len(tl) - 1
    while i < last:
        # Two-token operators take precedence over everything else.
        toapp = rules2.get(tuple(tl[i:i+2]))
        if toapp is not None:
            i = i + 2
        elif tl[i] == "$":
            # "$", word, "$" collapses into a single "$word$" token.
            if i + 2 >= len(tl):
                raise ValueError("Query error !!!" + repr(tl))
            toapp = tl[i] + tl[i+1] + tl[i+2]
            i = i + 3
        else:
            toapp = tl[i]
            i = i + 1
        # Single-character comparison operators get their named form.
        ntl.append(rules1.get(toapp, toapp))
    return ntl

def _parse_query(q):
    """Tokenize the query string q and return its normalized token list.

    q is wrapped in a strstream, split into tokens with shlex, and the
    collected tokens are passed through _normalize_tokens.
    """
    from shlex import shlex
    lexer = shlex(strstream(q))
    collected = []
    while 1:
        token = lexer.get_token()
        # A false token marks the end of the input.
        if not token:
            break
        collected.append(token)
    return _normalize_tokens(collected)

def find_all_descendants(node, cond):
    """Placeholder for a descendant search; not implemented.

    Both arguments are ignored and None is always returned.
    """
    # XXX !!! stub
    return None

def find_all_children(node, cond):
    """Return the direct children of node that satisfy a predicate.

    cond is Python source text that must define a one-argument function
    named ``condition``.  Every node returned by node.get_childNodes() is
    passed to it, and the nodes for which it returns a true value are
    collected, in order, into a new list.

    Raises NameError if cond does not define ``condition``.
    """
    # SECURITY: cond is executed as arbitrary Python code.  Only pass
    # trusted, program-supplied text here, never user input.
    #
    # The code runs in an explicit namespace instead of relying on exec
    # adding names to this function's local scope, which does not work
    # where exec is a function.  Module globals are still passed so the
    # predicate can refer to module-level names.
    scope = {}
    exec(cond, globals(), scope)
    condition = scope.get("condition")
    if condition is None:
        raise NameError("cond must define a function named 'condition'")

    lst = []
    for n in node.get_childNodes():
        if condition(n):
            lst.append(n)
    return lst

class PYQL:
    """Tiny XQL-like path query runner over a DOM tree read from a file.

    The file is parsed once in the constructor; query() then evaluates a
    path such as '/article/text/figures/fig.' against the document element
    and wraps whatever was found in an "xql:result" element.
    """

    def __init__(self, file):
        """Parse file with FileReader and remember the resulting tree."""
        d = FileReader()
        self.dom = d.readFile(file)
        # NOTE(review): docel is only set when the parsed root is a DOCUMENT
        # node; for any other root, query() fails with AttributeError.
        if self.dom.get_nodeType() == DOCUMENT:
            self.docel = self.dom.get_documentElement()

    def query(self, q):
        """Run the query string q and return an "xql:result" element.

        The result element carries the original query text in its "orig"
        attribute and, when something matched, the match as its child.
        """
        qr = self._query(self.docel, _parse_query(q), self.dom)      # ???
        qel = self.dom.createElement("xql:result")
        if qr:
            qel.appendChild(qr)
        qel.setAttribute("orig", str(q))
        return qel

    def _query(self, node, subq, qrdoc):
        """Match the token list subq against node; return a node or None.

        subq is a normalized token list (see _parse_query).  qrdoc is the
        document used to create the intermediate result elements.  A final
        path step returns the matching node itself; an intermediate step
        returns a new element with the node's name and attributes that
        holds the matches found among the node's children.  None means
        "no match".
        """
        # An exhausted query matches nothing (this used to be an IndexError).
        if not subq:
            return None

        # Debug trace kept from the original: prints the "fig" children of
        # every node that is visited.
        print(find_all_children(node,
            """def condition(n): return n.get_nodeName() == "fig" """))

        if subq[0] == "//":
            # The recursive result used to be discarded here.
            return self._query(node, subq[1:], qrdoc)

        # Only "/name" steps are understood; "/" needs a name after it.
        if subq[0] != "/" or len(subq) < 2:
            return None
        if subq[1] != node.get_nodeName():
            return None

        # Last step of the path (or anything other than another "/" after
        # the name): the node itself is the answer.
        if len(subq) == 2 or subq[2] != "/":
            return node

        # Intermediate step: build a copy of this element's shell and fill
        # it with the matches found among its children.
        qel = qrdoc.createElement(node.get_nodeName())
        attrs = node.get_attributes()
        for a in attrs.keys():
            qel.setAttribute(a, attrs[a].get_nodeValue())
        for node1 in node.get_childNodes():
            q2 = self._query(node1, subq[2:], qrdoc)
            if q2:
                qel.appendChild(q2)
        if len(qel.get_childNodes()) == 0:
            return None
        return qel


# Driver: load the sample document and print the result of one query.
# The commented-out lines are other queries tried during development;
# each query string ends with a terminator character ("." or "$").
a = PYQL('104.xml')
#  a.query('$or$ != 1.23E-4          /article/text/topic$')
#  print a.query('/article/text/topic.').toxml()
print a.query('/article/text/figures/fig.').toxml()
#   print a.query('//fig.').toxml()

-----------

It was a naive attempt to write XQL for Python...

>> I remember I was doing queries in the form
>> "/article/author/name"
>> - and it was so slow... (0.5 - 1 sec per query on Celeron 400)
>
>What kind of API did you use? For simple queries like this, a SAX
>ContentHandler may be sufficient. Using Uche's bigxml file, you can
>try

>import xml.sax
>class NameRetriever(xml.sax.ContentHandler):
>    def __init__(self):
>        self.authors = []
>        self.in_author = self.in_name = 0
>
>    def startElement(self, tag, attrs):
>        if tag=="author":
>            self.in_author = 1
>        else:
>            if self.in_author and tag == "name":
>                self.in_name = 1
>                self.txt = ""
>
>    def characters(self,str):
>        if self.in_name:
>            self.txt = self.txt+str
>
>    def endElement(self,tag):
>        if self.in_name and tag=="name":
>            self.authors.append(self.txt)
>            self.in_name=0
>        elif self.in_author and tag=="author":
>            self.in_author=0
>
>h = NameRetriever()
>start=time.time();xml.sax.parse("bigxml",handler=h);end = time.time()
>print end - start
>print len(h.authors)

The above code is exactly what I want to avoid writing.  I want my
application to be completely data-driven, so even "/article/author/name"
must not appear in the program!

>To my own surprise, this is not as fast as the cDomlette; probably
>because the latter links directly with expat, and thus avoids a number
>of indirections. Still, it takes only three times as long (0.5s vs
>1.4s on my machine), and it will work on any Python 2.0 installation.
>
>> Please, tell me if I did it wrong:
>>
>> - parsed xml-file
>> - quered each variable in a template-file from the xml-file
>> - filled template with values found to produce web-page
>>   (some variables go to other pages, for example, content page)
>
>In general, that is ok - except that the description is unprecise. How
>did you parse? How did you query? How did you fill the template?

My code above answers these questions.

>> Anyway, before claiming XML tools for Python slow I need to recheck
>> with new versions - if there are no objections to the above
>> scheme. (And what is preferrable tool for queries?  XPath?)
>
>It depends. A SAX ContentHandler may do in many cases - although it is
>apparently not necessarily faster than XPath over a fast DOM
>implementation.

>> Is there any on-line tutorial (?) or just example code
>> to learn how to work efficiently with XML from Python?
>
>To learn PyXML, there is a an online tutorial on the PyXML topic
>guide. To learn working efficiently is probably not something that can
>be taught in a tutorial - that is much a matter of experience.

Thanks! I shall look there too.

>Regards,
>Martin

Sincerely yours, Roman Suzi
-- 
Vote for my design: http://silvermouse.onego.ru/gray.php3?id=0018
_/ Russia _/ Karelia _/ Petrozavodsk _/ rnd@onego.ru _/
_/ Monday, January 29, 2001 _/ Powered by Linux RedHat 6.2 _/
_/ "The tuna doesn't taste the same without the dolphin." _/