User:John Vandenberg/wiki xml

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search

Processing MediaWiki syntax using XML.

flexbisonparse[edit]

Python support[edit]

In the flexbisonparse directory, add the following files:

  • "pymod.c"
#include "Python.h"

/* Parser entry point, defined outside this file (presumably in one of the
 * flex/bison-generated sources that setup.py compiles alongside pymod.c --
 * confirm). It receives the wikitext and its return value is what toxml()
 * converts into the Python result. */
const char* wikiparse_do_parse (const char* input);

/* Docstring attached to the Python-level toxml() function through the
 * method table below. */
char wikiparse_toxml__doc__[] =
    "toxml(wikitext) -- Convert Wikitext to XML\n";

/*
 * Python-callable wrapper: wikiparse.toxml(wikitext).
 *
 * args must be a tuple holding exactly one string. That string is handed to
 * wikiparse_do_parse() and whatever C string comes back is converted to a
 * Python object with Py_BuildValue("s", ...).
 *
 * Returns the new Python object, or NULL when the arguments do not match
 * (PyArg_ParseTuple has already set the Python exception in that case).
 *
 * NOTE(review): ownership of the buffer returned by wikiparse_do_parse() is
 * not visible from this file. Py_BuildValue("s") copies the text, so if the
 * parser allocates the buffer it is never released here -- confirm.
 */
PyObject *wikiparse_toxml(PyObject *self, PyObject *args) {
    const char *source = NULL;
    const char *converted = NULL;

    if (PyArg_ParseTuple(args, "s", &source)) {
        converted = wikiparse_do_parse(source);
        return Py_BuildValue("s", converted);
    }

    /* Argument mismatch: propagate the error to the interpreter. */
    return NULL;
}

/* Method table for the wikiparse module: one entry per Python-visible
 * function, closed by an all-NULL sentinel entry. */
static PyMethodDef wikiparse_functions[] = {
    /* name     C implementation                 calling convention  docstring */
    { "toxml",  (PyCFunction)wikiparse_toxml,    METH_VARARGS,       wikiparse_toxml__doc__ },
    { NULL,     NULL,                            0,                  NULL }  /* sentinel */
};

/* module entry-point (module-initialization) function */
void initwikiparse(void) {
   /* Create the module and add the functions */
   PyObject *m = Py_InitModule3("wikiparse", wikiparse_functions, "MediaWiki syntax parser");
}

  • setup.py
from distutils.core import setup, Extension

# Build the "wikiparse" extension module from the Python glue code (pymod.c)
# together with the other C sources listed below.
setup(
    name="wikiparse",
    version="0.1",
    ext_modules=[
        Extension(
            "wikiparse",
            [
                "pymod.c",
                "lex.yy.c",
                "wikiparse.tab.c",
                "parsetree.c",
            ],
        ),
    ],
)
  • wikiparse.py
import wikiparse
import sys

def parse(page):
   """Return the result of wikiparse.toxml() for the wikitext in *page*."""
   xml = wikiparse.toxml(page)
   return xml

def main():
   """Print the XML for a fixed sample, then for the text read from stdin.

   The literal "hello" is converted first as a quick check that the parser
   works; after that everything available on standard input is converted
   and printed.
   """
   # print(...) with one parenthesised argument parses the same way under
   # Python 2 and Python 3.
   print("%s" % parse("hello"))
   # read() returns the whole stream as a single string, which is what
   # joining readlines() with an empty separator produces. The original
   # listing had lost that empty-string separator and did not parse.
   print("%s" % parse(sys.stdin.read()))

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
   main()


Execute:

$ make
$ python setup.py install
$ cat test.txt | python ./wikiparse.py | xmllint --format -