[XML-SIG] Using SP in Python

Lars Marius Garshol larsga@garshol.priv.no
28 Dec 2000 12:59:37 +0100


I've written a simple wrapper for the SP SGML parser's generic API and
also a SAX driver for that wrapper.  The SAX driver probably belongs
in saxtools and will be placed there.  The SP wrapper is perhaps
better off as a separate project, but if anyone feels it belongs in the
XML-SIG, I'll be happy to reconsider.

Appended are a sample application that emits ESIS, the C module and
the SAX driver, in that order.

Comments of all kinds would be welcome.

--Lars M.


======================================================================

import pysp

class EsisHandler:
    """Event handler that prints every parser event as one ESIS-style line.

    Each method writes a single line consisting of a one-character event
    code followed by the event's payload.
    """

    def start_element(self, name, attrs):
        # One "(" line for the element, then one "A" line per attribute.
        print("(" + name)
        for attr_name, attr_value in attrs.items():
            print("A%s %s" % (attr_name, attr_value))

    def end_element(self, name):
        print(")" + name)

    def data(self, data):
        # repr() keeps newlines and other control characters on one line.
        print("-" + repr(data))

    def sdata(self, text, name):
        print("[" + text + " " + name)

    def pi(self, data):
        print("?" + data)

    def error(self, msg):
        print("E" + msg)

class Empty:
    "Handler that defines no callbacks; parsing with it produces no output."
    pass

pysp.add_catalog("/home/larsga/data/catalog")
parser = pysp.make_parser("/home/larsga/cvs-co/data/book/bok.sgml")
# Run with the ESIS handler so that the sample actually emits ESIS.  Passing
# Empty() here instead would parse the document without printing anything.
parser.run(EsisHandler())

======================================================================


/**
 * A wrapper module for the generic API of the SP SGML parser.
 *
 * $Id$
 */

/**
 * Todo:
 * - implement more events
 * - support more SP options
 * - better support for attributes through dedicated attribute type?
 * - let parser use an internal dictionary to intern element and attr names?
 */

#include "Python.h"

// Define this if your libsp.a has been built with multibyte support
// (this is the default); undefine it if it has not.
// If libsp.a _does_ have multibyte support and this is left undefined,
// all your element and attribute names will be one character long...
#define SP_MULTI_BYTE 1

#include "ParserEventGeneratorKit.h"

// defines SP_VERSION as SP_T("x.x.x")
#include "version.h"
#define SP_T(x) x

// Docstring of the pysp module; passed to Py_InitModule4() in initpysp().
static char pysp_module_documentation[] =
  "Python wrapper for the generic API of the SP SGML parser.";

/* ----------------------------------------------------------------------
   INTERNAL STUFF
 */

// Module-wide kit: add_catalog() sets options on it and make_parser() asks it
// for a new event generator.
ParserEventGeneratorKit parserGenerator;

/* ----------------------------------------------------------------------
   UTILITIES
 */

/**
 * Copies an SP character string into a newly allocated, NUL-terminated
 * char array.  Every SP character is narrowed to a single char, so any
 * character that does not fit in a char is truncated.
 *
 * The caller owns the returned array and must release it with delete[].
 */
char* extract_string(const SGMLApplication::CharString &string) {
  char* str = new char[string.len + 1];
  // Use an unsigned index so that it is compared against the (unsigned)
  // length without a signed/unsigned mismatch.
  for (size_t ix = 0; ix < string.len; ix++)
    str[ix] = char(string.ptr[ix]);
  str[string.len] = 0;

  return str;
}

/**
 * Copies an SP character string into a caller-supplied buffer, narrowing
 * every SP character to a single char.
 *
 * Exactly string.len chars are written and NO terminating NUL is added, so
 * the buffer must have room for string.len chars and the caller is
 * responsible for terminating the result.
 */
void extract_string(char* buffer, const SGMLApplication::CharString &string) {
  // Unsigned index: avoids comparing a signed int with the unsigned length.
  for (size_t ix = 0; ix < string.len; ix++)
    buffer[ix] = char(string.ptr[ix]);
}

/* ----------------------------------------------------------------------
   SGML APPLICATION
 */

class PYSPApplication : public SGMLApplication {
 public:
  PYSPApplication(PyObject *_pyapp, EventGenerator *_eventGen) {
    Py_INCREF(_pyapp);
    pyapp = _pyapp;
    eventGen = _eventGen;
    position = NULL;
    openEntity = NULL;
  }

  void openEntityChange(const OpenEntityPtr &event) {
    openEntity = (OpenEntityPtr*) &event;
  }

  void startElement(const StartElementEvent &event) {
    position = (Position*) &event.pos;
    char *gi = extract_string(event.gi);
    PyObject *attrs = PyDict_New();
    for (size_t ix = 0; ix < event.nAttributes; ix++) {
      if (event.attributes[ix].type != Attribute::implied &&
	  event.attributes[ix].type != Attribute::invalid) {
	char *name = extract_string(event.attributes[ix].name);
	PyDict_SetItemString(attrs, name, getValue(event.attributes[ix]));
	delete[] name;
      }
    }    
    PyObject *arglist = Py_BuildValue("(sO)", gi, attrs);      

    handleCallback("start_element", arglist);
    
    delete[] gi;
  }

  void data(const DataEvent &event) {
    position = (Position*) &event.pos;
    char *data = extract_string(event.data);    
    PyObject *arglist = Py_BuildValue("(s)", data);

    handleCallback("data", arglist);
    
    delete[] data;
  }

  void sdata(const SdataEvent &event) {
    position = (Position*) &event.pos;
    char *text = extract_string(event.text);    
    char *name = extract_string(event.entityName);    
    PyObject *arglist = Py_BuildValue("(ss)", text, name);

    handleCallback("sdata", arglist);
    
    delete[] text, name;    
  }
  
  void endElement(const EndElementEvent &event) {
    position = (Position*) &event.pos;
    char *gi = extract_string(event.gi);    
    PyObject *arglist = Py_BuildValue("(s)", gi);

    handleCallback("end_element", arglist);
    
    delete[] gi;
  }

  void pi(const PiEvent &event) {
    position = (Position*) &event.pos;
    char *data = extract_string(event.data);    
    PyObject *arglist = Py_BuildValue("(s)", data);

    handleCallback("pi", arglist);
    
    delete[] data;
  }
  
  void error(const ErrorEvent &event) {
    position = (Position*) &event.pos;
    char* msg = extract_string(event.message);    
    PyObject *arglist = Py_BuildValue("(s)", msg);

    handleCallback("error", arglist);

    delete[] msg;
  }

  Location* getLocation() {
    return new Location(*openEntity, *position);
  }
  
  ~PYSPApplication() {
    Py_DECREF(pyapp);
  }
  
 private:
  PyObject *pyapp;
  EventGenerator *eventGen;
  Position *position;
  OpenEntityPtr *openEntity;

  void handleCallback(char *name, PyObject *arglist) {
    // get function from pyapp
    PyObject *callback = PyObject_GetAttrString(pyapp, name);
    if (callback == NULL) {
      PyErr_Clear(); // not really a problem; ignore
      return;
    }

    if (!PyCallable_Check(callback)) {
      eventGen->halt();
      PyErr_SetString(PyExc_TypeError, "callback attribute must be callable");
      return;
    }
    
    // call function
    if (PyEval_CallObject(callback, arglist) == NULL) 
      eventGen->halt();

    Py_DECREF(arglist);    
  }

  PyObject *getValue(const Attribute &attr) {
    PyObject *value = PyString_FromString("<value>");
    char *tmp_value;
    int value_len = 0;
    int pos = 0;
    
    switch(attr.type) {
    case Attribute::cdata:
      for (int ix = 0; ix < attr.nCdataChunks; ix++) 
	value_len += attr.cdataChunks[ix].data.len;
      
      tmp_value = new char[value_len + 1];
      for (int ix = 0; ix < attr.nCdataChunks; ix++) {
	extract_string(tmp_value + pos, attr.cdataChunks[ix].data);
	pos += attr.cdataChunks[ix].data.len;
      }
      tmp_value[pos] = 0;

      value = PyString_FromString(tmp_value);
      delete[] tmp_value;
      break;
    case Attribute::tokenized:
      tmp_value = extract_string(attr.tokens);
      value = PyString_FromString(tmp_value);
      delete[] tmp_value;
      break;      
    }

    return value;
  }
};


/* ----------------------------------------------------------------------
   SGML PARSER CLASS
 */

// Instance layout of the Python-level SGML parser object.
typedef struct {
  PyObject_HEAD

  // SP event generator driven by this object; assigned in make_parser() and
  // deleted in sgmlparse_dealloc().
  EventGenerator *eventGen;
  // Application receiving the events; set by run() to the application object
  // it creates for that call and used by the get_*() location methods.
  PYSPApplication *application;
} sgmlparseobject;

// Docstring referenced from the Sgmlparsetype type object.
static char Sgmlparsetype__doc__[] = "SGML parser.";

static char sgmlparse_halt__doc__[] =
"halt()\n Halt the generation of events by run(). This can be at any point\nduring the execution of run(). It is safe to call this function from a\ndifferent thread from that which called run(). ";

/* Python method halt(): asks the parser's event generator to stop. */
extern "C" PyObject* sgmlparse_halt(sgmlparseobject *self,
                                    PyObject *args) {
  // The method accepts no arguments.
  const int args_ok = PyArg_ParseTuple(args, "");
  if (args_ok == 0) {
    return NULL;
  }

  EventGenerator *generator = self->eventGen;
  generator->halt();

  PyObject *none = Py_None;
  Py_INCREF(none);
  return none;
}

static char sgmlparse_get_line_number__doc__[] =
"get_line_number()\n Returns the line number of the current event.";

/*
 * Python method get_line_number(): returns the line number of the location
 * reported by the running application as a Python integer.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_line_number(sgmlparseobject *self,
                                               PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  // SP declares the Location counters as unsigned long, which must not be
  // passed for an "i" (int) format unit; convert the value explicitly.
  PyObject *value = PyInt_FromLong((long) location->lineNumber);
  delete location;
  return value;
}

static char sgmlparse_get_column_number__doc__[] =
"get_column_number()\n Returns the column number of the current event.";

/*
 * Python method get_column_number(): returns the column number of the
 * location reported by the running application as a Python integer.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_column_number(sgmlparseobject *self,
                                                 PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  // SP declares the Location counters as unsigned long, which must not be
  // passed for an "i" (int) format unit; convert the value explicitly.
  PyObject *value = PyInt_FromLong((long) location->columnNumber);
  delete location;
  return value;
}

static char sgmlparse_get_filename__doc__[] =
"get_filename()\n Returns the name of the file where the current event occurred.";

/*
 * Python method get_filename(): returns the file name of the location
 * reported by the running application as a Python string.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_filename(sgmlparseobject *self,
                                            PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  char* tmp = extract_string(location->filename);
  PyObject *value = Py_BuildValue("s", tmp);
  delete location;
  // extract_string() allocates with new[], so the array form is required.
  delete[] tmp;
  return value;
}

static char sgmlparse_get_entity_name__doc__[] =
"get_entity_name()\n Returns the name of the entity where the current event occurred.";

/*
 * Python method get_entity_name(): returns the entity name of the location
 * reported by the running application as a Python string.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_entity_name(sgmlparseobject *self,
                                               PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  char* tmp = extract_string(location->entityName);
  PyObject *value = Py_BuildValue("s", tmp);
  delete location;
  // extract_string() allocates with new[], so the array form is required.
  delete[] tmp;
  return value;
}

static char sgmlparse_get_byte_offset__doc__[] =
"get_byte_offset()\n Returns number of bytes in the storage object preceding the point\nwhere the current event occurred.";

/*
 * Python method get_byte_offset(): returns the byte offset of the location
 * reported by the running application as a Python integer.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_byte_offset(sgmlparseobject *self,
                                               PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  // SP declares the Location counters as unsigned long, which must not be
  // passed for an "i" (int) format unit; convert the value explicitly.
  PyObject *value = PyInt_FromLong((long) location->byteOffset);
  delete location;
  return value;
}

static char sgmlparse_get_entity_offset__doc__[] =
"get_entity_offset()\n Returns number of characters in the current entity preceding the\npoint where the current event occurred.";

/*
 * Python method get_entity_offset(): returns the entity offset of the
 * location reported by the running application as a Python integer.  Raises
 * RuntimeError if the parser has no application attached.
 */
extern "C" PyObject* sgmlparse_get_entity_offset(sgmlparseobject *self,
                                                 PyObject *args) {
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

  if (self->application == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "no parse in progress");
    return NULL;
  }

  SGMLApplication::Location *location = self->application->getLocation();
  // SP declares the Location counters as unsigned long, which must not be
  // passed for an "i" (int) format unit; convert the value explicitly.
  PyObject *value = PyInt_FromLong((long) location->entityOffset);
  delete location;
  return value;
}

static char sgmlparse_run__doc__[] =
"run(app)\n Generate the sequence of events, calling the corresponding\nmember of app for each event. Returns the number of errors. This must\nnot be called more than once for any SGML parser object.";

/*
 * Python method run(app): drives the event generator with an application
 * that forwards every event to `app`.
 *
 * Returns the number of errors reported by the event generator, as promised
 * by the docstring, or NULL with the Python exception set if a callback
 * failed.
 */
extern "C" PyObject* sgmlparse_run(sgmlparseobject *self,
                                   PyObject *args) {
  PyObject* app;

  if (!PyArg_ParseTuple(args, "O", &app))
    return NULL;

  unsigned nerrors;
  {
    // Direct initialisation: no temporary PYSPApplication is created and
    // copied, so the reference to app is taken and released exactly once.
    PYSPApplication realapp(app, self->eventGen);
    self->application = &realapp;
    nerrors = self->eventGen->run(realapp);
    // realapp is about to go out of scope; do not leave a pointer to it
    // behind in the parser object.
    self->application = NULL;
  }

  if (PyErr_Occurred())
    return NULL; // an error occurred in a callback; tell Python about it

  return PyInt_FromLong((long) nerrors);
}

/* Method table of the SGML parser object: name, C function, calling
   convention and docstring for each method. */
struct PyMethodDef sgmlparse_methods[] = {
  { "halt",
    (PyCFunction) sgmlparse_halt,
    METH_VARARGS,
    sgmlparse_halt__doc__ },
  { "run",
    (PyCFunction) sgmlparse_run,
    METH_VARARGS,
    sgmlparse_run__doc__ },
  { "get_line_number",
    (PyCFunction) sgmlparse_get_line_number,
    METH_VARARGS,
    sgmlparse_get_line_number__doc__ },
  { "get_column_number",
    (PyCFunction) sgmlparse_get_column_number,
    METH_VARARGS,
    sgmlparse_get_column_number__doc__ },
  { "get_filename",
    (PyCFunction) sgmlparse_get_filename,
    METH_VARARGS,
    sgmlparse_get_filename__doc__ },
  { "get_entity_name",
    (PyCFunction) sgmlparse_get_entity_name,
    METH_VARARGS,
    sgmlparse_get_entity_name__doc__ },
  { "get_byte_offset",
    (PyCFunction) sgmlparse_get_byte_offset,
    METH_VARARGS,
    sgmlparse_get_byte_offset__doc__ },
  { "get_entity_offset",
    (PyCFunction) sgmlparse_get_entity_offset,
    METH_VARARGS,
    sgmlparse_get_entity_offset__doc__ },
  { NULL, NULL }  /* sentinel */
};

/* Destructor of the SGML parser object: disposes of the event generator and
   then of the object's own memory. */
extern "C" void sgmlparse_dealloc(sgmlparseobject *self) {
  EventGenerator *generator = self->eventGen;
  self->eventGen = NULL;
  delete generator;

  PyMem_DEL(self);
}

/*
 * Attribute lookup for the SGML parser object.
 *
 * "__members__" yields a new list with the names of all methods; any other
 * name is resolved against the method table.  Returns a new reference, or
 * NULL with a Python exception set.
 */
extern "C" PyObject* sgmlparse_getattr(sgmlparseobject *self, char *name)
{
  if (strcmp(name, "__members__") == 0){
    PyObject *list = PyList_New(0);
    if (list == NULL)
      return NULL;

    for (int ix = 0; sgmlparse_methods[ix].ml_name; ix++) {
      PyObject *method_name =
        PyString_FromString(sgmlparse_methods[ix].ml_name);
      if (method_name == NULL || PyList_Append(list, method_name) < 0) {
        Py_XDECREF(method_name);
        Py_DECREF(list);
        return NULL;
      }
      // PyList_Append() takes its own reference, so release ours; otherwise
      // every name string would be leaked.
      Py_DECREF(method_name);
    }
    return list;
  }

  return Py_FindMethod(sgmlparse_methods, (PyObject*) self, name);
}

/* Type object of the SGML parser.  Only deallocation and attribute lookup
   are implemented; every other slot is left empty.  ob_type is filled in at
   module initialisation (see initpysp()). */
static PyTypeObject Sgmlparsetype = {
        PyObject_HEAD_INIT(NULL) 0,              /*ob_size*/
        "sgmlparser",                            /*tp_name*/
        sizeof(sgmlparseobject),                 /*tp_basicsize*/
        0,                                       /*tp_itemsize*/
        /* methods */
        (destructor)  sgmlparse_dealloc,         /*tp_dealloc*/
        (printfunc)   0,                         /*tp_print*/
        (getattrfunc) sgmlparse_getattr,         /*tp_getattr*/
        (setattrfunc) 0,                         /*tp_setattr*/
        (cmpfunc)     0,                         /*tp_compare*/
        (reprfunc)    0,                         /*tp_repr*/
                      0,                         /*tp_as_number*/
                      0,                         /*tp_as_sequence*/
                      0,                         /*tp_as_mapping*/
        (hashfunc)    0,                         /*tp_hash*/
        (ternaryfunc) 0,                         /*tp_call*/
        (reprfunc)    0,                         /*tp_str*/

        /* Space for future expansion */
        0L,0L,0L,0L,
        Sgmlparsetype__doc__ /* Documentation string */
};

/* ----------------------------------------------------------------------
   FUNCTIONS
 */

static char pysp_make_parser__doc__[] =
"make_parser(filename) -> parser\n\
Return a new SGML parser object bound to the given file name.";

/*
 * Module function make_parser(filename): creates a parser object whose event
 * generator reads the given file.  Messages from SP are not printed to
 * stderr.  Returns a new reference, or NULL with a Python exception set.
 */
extern "C" PyObject* pysp_make_parser(PyObject *self, PyObject *args) {
    char *filename;
    sgmlparseobject *parser;

    if (!PyArg_ParseTuple(args, "s", &filename))
      return NULL;

    // Allocate the Python object before the event generator, so that a
    // failed allocation cannot leak a generator.
    parser = PyObject_NEW(sgmlparseobject, &Sgmlparsetype);
    if (parser == NULL)
      return NULL;

    // Give every field a defined value: no application is attached until
    // run() is called.
    parser->eventGen = NULL;
    parser->application = NULL;

    EventGenerator *evg = parserGenerator.makeEventGenerator(1, &filename);
    evg->inhibitMessages(1); // don't print error messages to stderr
    parser->eventGen = evg;

    return (PyObject*) parser;
}

static char pysp_add_catalog__doc__[] =
"add_catalog(filename)\n\
Tell the pysp module about a catalog file.";

/* Module function add_catalog(filename): registers a catalog file with the
   module-wide parser kit. */
extern "C" PyObject* pysp_add_catalog(PyObject *self, PyObject *args) {
    char *catalog_path = NULL;

    const int parsed = PyArg_ParseTuple(args, "s", &catalog_path);
    if (parsed == 0) {
      return NULL;
    }

    parserGenerator.setOption(ParserEventGeneratorKit::addCatalog,
                              catalog_path);

    PyObject *result = Py_None;
    Py_INCREF(result);
    return result;
}

/* ----------------------------------------------------------------------
   MODULE INITIALIZATION
 */

/* Module-level functions exported by pysp; registered in initpysp(). */
static PyMethodDef PYSPMethods[] = {
  {"make_parser",  pysp_make_parser, METH_VARARGS, pysp_make_parser__doc__},
  {"add_catalog",  pysp_add_catalog, METH_VARARGS, pysp_add_catalog__doc__},
  {NULL,      NULL}        /* Sentinel */
};

extern "C" void initpysp() {
  PyObject *module, *dict;

  Sgmlparsetype.ob_type = &PyType_Type;
                          
  module = Py_InitModule4("pysp", PYSPMethods, pysp_module_documentation,
			  (PyObject*) NULL, PYTHON_API_VERSION);
  dict = PyModule_GetDict(module);
  PyDict_SetItemString(dict, "sp_version", Py_BuildValue("s", SP_VERSION));
  PyDict_SetItemString(dict, "version", Py_BuildValue("s", "0.01"));
}

======================================================================

"""A SAX driver for the SP SGML parser, using the pysp extension module.

$Id$
"""

# --- Import wizardry

from xml.sax._exceptions import *
try:
    import pysp
except ImportError:
    raise SAXReaderNotAvailable("pysp not supported", None)

from xml.sax import xmlreader, saxutils, handler

AttributesImpl = xmlreader.AttributesImpl

import string

# --- Constants

# Version of this driver.
version = "0.01"

# Base URI for the identifiers defined by this driver.
namespace = "http://garshol.priv.no/symbolic/"
# Property holding the list of catalog files handed to pysp before parsing.
property_catalogs = namespace + "properties/catalogs"

# --- PySPParser

class PySPParser(xmlreader.XMLReader, xmlreader.Locator):
    """SAX driver for the pysp C module.

    The object is at the same time the SAX reader, the document locator
    handed to the content handler, and the application object whose
    methods (start_element, end_element, pi, data, sdata, error) are
    called by the pysp parser.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        self._source = xmlreader.InputSource()
        self._parser = None     # pysp parser; only set while parsing
        self._parsing = 0       # true while parse() is running

        self._catalogs = []     # catalog files registered before parsing

    # XMLReader methods

    def parse(self, source):
        """Parse an SGML document from a file. (Nothing else is supported.)

        source is a system identifier or an InputSource; only its system
        identifier is used and it is passed to pysp as a file name.
        """
        source = saxutils.prepare_input_source(source)
        self._source = source

        for catalog in self._catalogs:
            pysp.add_catalog(catalog)

        # The parser must be stored on the instance: the Locator methods
        # below query it while the callbacks are running.
        self._parser = pysp.make_parser(source.getSystemId())
        self._parsing = 1
        try:
            self._cont_handler.setDocumentLocator(self)
            # pysp reports no document-level events, so the document
            # boundaries are signalled here.
            self._cont_handler.startDocument()
            self._parser.run(self)
            self._cont_handler.endDocument()
        finally:
            self._parsing = 0
            self._parser = None

    def getFeature(self, name):
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def getProperty(self, name):
        if name == property_catalogs:
            return self._catalogs

        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        if self._parsing:
            raise SAXNotSupportedException("Cannot set properties while parsing")

        if name == property_catalogs:
            if type(value) != type([]):
                raise SAXException("Value must be a list of strings!")

            self._catalogs = value
            return

        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    # Locator methods
    #
    # Outside of parse() there is no parser to ask, so the "unknown" values
    # of the Locator interface are returned instead of failing.

    def getColumnNumber(self):
        if self._parser is None:
            return -1
        return self._parser.get_column_number()

    def getLineNumber(self):
        if self._parser is None:
            return -1
        return self._parser.get_line_number()

    def getPublicId(self):
        return None # FIXME!

    def getSystemId(self):
        if self._parser is None:
            return self._source.getSystemId()
        return self._parser.get_filename()

    # event handlers (called by the pysp parser)

    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, xmlreader.AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def pi(self, data):
        # Split "target data" at the first space.  A processing instruction
        # without a space is reported as a target with empty data rather
        # than being dropped.
        pos = data.find(" ")
        if pos == -1:
            target, rest = data, ""
        else:
            target, rest = data[ : pos], data[pos + 1 : ]
        self._cont_handler.processingInstruction(target, rest)

    def data(self, data):
        self._cont_handler.characters(data)

    def sdata(self, text, entityname):
        # FIXME: does this make sense?
        self._cont_handler.characters(text)

    def error(self, msg):
        self._err_handler.error(SAXException(msg))

# ---
        
def create_parser(*args, **kwargs):
    "Return a new PySPParser built from the given arguments."
    return PySPParser(*args, **kwargs)
        
# ---

if __name__ == "__main__":
    # Demo: convert the SGML sample document to XML in bok.xml.
    from xml.sax.saxutils import XMLGenerator
    from xml.sax.handler import ErrorHandler
    out = open("bok.xml", "w")
    try:
        p = create_parser()
        p.setContentHandler(XMLGenerator(out))
        p.setErrorHandler(ErrorHandler())
        p.setProperty(property_catalogs, ["/home/larsga/data/catalog"])
        p.parse("/home/larsga/cvs-co/data/book/bok.sgml")
    finally:
        # Close the output explicitly so that everything written is flushed
        # even if parsing fails.
        out.close()