Processing MediaWiki XML with STX/Extract templates
Jump to navigation
Jump to search
Here is an STX script that extracts the Personendaten and PND templates from the wiki pages of the German Wikipedia. (Tested with the Joost alpha build of 2005-05-21.)
Output[edit]
This is what the output looks like:
<mediawiki>
  <page>
    <title>Title of the first page</title>
    <revision>
      <parsed>
        <template name="name_of_template"/>
        <template name="name_of_template_with_parameters">
          <param name="named_parameter">content</param>
          <param>content</param> <!-- parameter without name -->
          ...
        </template>
        ...
      </parsed>
    </revision>
  </page>
  ...
</mediawiki>
STX Code[edit]
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Extract Personendaten and PND numbers out of MediaWiki pages.
  Version: 0.1

  Input:  a MediaWiki XML export (namespace export-0.3).
  Output: a <mediawiki> element holding one <page> per article in which at
          least one {{PND|...}} or {{Personendaten|...}} template was found.

  When Joost implements regular expressions, life will be easier.
-->
<stx:transform version="1.0"
               xmlns:stx="http://stx.sourceforge.net/2002/ns"
               xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
               pass-through="none"
               output-method="xml"
               exclude-result-prefixes="#all">

  <!-- Sequence of the names of all namespaces whose key is not 0. -->
  <stx:variable name="namespace-prefixes"/>

  <!-- Fill the sequence of namespace prefixes. -->
  <stx:template match="m:namespace">
    <stx:if test="@key!=0">
      <stx:assign name="namespace-prefixes" select="($namespace-prefixes, .)"/>
    </stx:if>
  </stx:template>

  <!-- Root element: wrap everything in an un-namespaced <mediawiki>. -->
  <stx:template match="/m:mediawiki">
    <mediawiki>
      <stx:process-children/>
    </mediawiki>
  </stx:template>

  <!-- Descend into siteinfo only to reach the namespace list. -->
  <stx:template match="m:siteinfo">
    <stx:process-children/>
    <!-- TODO: add siteinfo and time (using a parameter) -->
  </stx:template>

  <stx:template match="m:namespaces">
    <stx:process-children/>
  </stx:template>

  <!-- Title and text of the page currently being processed. -->
  <stx:variable name="page-title"/>
  <stx:variable name="page-text"/>

  <stx:template match="m:title">
    <stx:assign name="page-title" select="string(.)"/>
  </stx:template>

  <stx:template match="m:text">
    <stx:assign name="page-text" select="string(.)"/>
  </stx:template>

  <!-- Only the first <revision> of each page is looked at; the flag is
       reset to true at the start of every page. -->
  <stx:variable name="first-revision" select="true()"/>

  <stx:template match="m:revision">
    <stx:if test="$first-revision">
      <stx:assign name="first-revision" select="false()"/>
      <stx:process-children/>
    </stx:if>
  </stx:template>

  <!-- Buffer that collects the <template> elements of the current page. -->
  <stx:buffer name="parsed"/>

  <!-- Running totals over the whole input, reported through stx:message. -->
  <stx:variable name="pd-count" select="0"/>
  <stx:variable name="pnd-count" select="0"/>

  <!-- Set to true by the "template" procedure when the current page
       contains a PND or Personendaten template. -->
  <stx:variable name="found-something"/>

  <!--
    Process one page: collect title and text from the children, skip pages
    whose title starts with a known namespace prefix, scan the text for
    "{{...}}" and emit a <page> element if anything of interest was found.
  -->
  <stx:template match="m:page">
    <stx:assign name="first-revision" select="true()"/>
    <stx:process-children/>

    <!-- Test for namespace prefix. We only want namespace 0. -->
    <stx:variable name="prefix" select="substring-before($page-title,':')"/>
    <stx:variable name="skip" select="false()"/>
    <stx:if test="$prefix">
      <!-- A colon alone does not make a namespace (article titles may
           contain colons), so compare against the declared prefixes.
           Nothing is written to the result here. -->
      <stx:for-each-item name="p" select="$namespace-prefixes">
        <stx:if test="string($p) = string($prefix)">
          <stx:assign name="skip" select="true()"/>
        </stx:if>
      </stx:for-each-item>
    </stx:if>

    <stx:if test="not($skip)">
      <stx:assign name="found-something" select="false()"/>

      <stx:result-buffer name="parsed" clear="yes">
        <!-- Parse "{{...}}". Does not work with nested templates (FIXME) -->
        <stx:variable name="text" select="$page-text"/>
        <stx:while test="string-length($text) &gt; 0">
          <!-- Drop everything up to and including the next "{{"; when
               there is none, $text becomes empty and the loop ends. -->
          <stx:assign name="text" select="substring-after($text,'{{')"/>
          <stx:call-procedure name="template">
            <stx:with-param name="content" select="substring-before($text,'}}')"/>
          </stx:call-procedure>
          <stx:assign name="text" select="substring-after($text,'}}')"/>
        </stx:while>
      </stx:result-buffer>

      <stx:if test="$found-something">
        <!-- Progress report: Personendaten count / PND count so far. -->
        <stx:message>
          <stx:value-of select="$pd-count"/>
          <stx:text>/</stx:text>
          <stx:value-of select="$pnd-count"/>
        </stx:message>
        <stx:text>&#10;</stx:text> <!-- newline -->
        <page>
          <title><stx:value-of select="$page-title"/></title>
          <stx:text>&#10;</stx:text> <!-- newline -->
          <revision>
            <!-- TODO: add last-modified etc. -->
            <parsed>
              <stx:process-buffer name="parsed" group="copy"/>
            </parsed>
            <stx:text>&#10;</stx:text> <!-- newline -->
          </revision>
        </page>
        <stx:text>&#10;</stx:text> <!-- newline -->
      </stx:if>
    </stx:if>
  </stx:template>

  <!-- Replays the buffered elements by rebuilding them from their names,
       attributes and text instead of copying the nodes. -->
  <stx:group name="copy">
    <!-- don't copy xmlns, please! -->
    <stx:template match="*">
      <stx:element name="{name(.)}">
        <stx:process-attributes/>
        <stx:process-children/>
      </stx:element>
    </stx:template>

    <stx:template match="@*">
      <stx:attribute name="{name(.)}" select="."/>
    </stx:template>

    <stx:template match="text()">
      <stx:value-of select="."/>
    </stx:template>
  </stx:group>

  <!--
    Handles the content of one "{{...}}" occurrence.
    content: the text between "{{" and "}}".
    Emits <template name="PND"> or <template name="Personendaten"> when the
    content starts with that name, increments the matching counter and sets
    found-something; any other content is ignored.
  -->
  <stx:procedure name="template">
    <stx:param name="content" required="yes"/>
    <stx:if test="starts-with($content,'PND')">
      <stx:assign name="pnd-count" select="$pnd-count+1"/>
      <stx:assign name="found-something" select="true()"/>
      <template name="PND">
        <param>
          <stx:value-of select="normalize-space(substring-after($content,'|'))"/>
        </param>
      </template>
    </stx:if>
    <stx:else>
      <stx:if test="starts-with($content,'Personendaten')">
        <stx:assign name="pd-count" select="$pd-count+1"/>
        <stx:assign name="found-something" select="true()"/>
        <template name="Personendaten">
          <stx:call-procedure name="Personendaten">
            <stx:with-param name="text" select="normalize-space(substring-after($content,'|'))"/>
          </stx:call-procedure>
        </template>
      </stx:if>
    </stx:else>
  </stx:procedure>

  <!--
    Parses template parameters in "{{Personendaten|...}}".
    text: everything after the first "|" of the template call.
    Emits one <param name="..."> per named parameter. A token without "="
    is treated as a continuation of the previous value, so a "|" inside a
    value (for example in a piped wiki link) is preserved.
  -->
  <stx:procedure name="Personendaten">
    <stx:param name="text"/>
    <stx:variable name="tokens"/>

    <!-- tokenize ..."|"..."|"... -->
    <stx:while test="string-length($text) &gt; 0">
      <!-- Decide on the presence of "|" itself, not on whether the part
           before it is empty: an empty token ("||") must stay an empty
           token instead of swallowing the rest of the text. -->
      <stx:if test="contains($text,'|')">
        <stx:assign name="tokens" select="($tokens, substring-before($text,'|'))"/>
        <stx:assign name="text" select="substring-after($text,'|')"/>
      </stx:if>
      <stx:else>
        <!-- Last token: no separator left. -->
        <stx:assign name="tokens" select="($tokens, $text)"/>
        <stx:assign name="text" select="''"/>
      </stx:else>
    </stx:while>

    <!-- recognize named parameters (only!) -->
    <stx:variable name="parameter"/>
    <stx:variable name="value"/>
    <stx:for-each-item name="token" select="$tokens">
      <stx:variable name="name" select="normalize-space(substring-before($token,'='))"/>
      <stx:if test="$name">
        <stx:if test="$parameter">
          <!-- handle previous parameter -->
          <param name="{$parameter}">
            <stx:value-of select="normalize-space($value)"/>
          </param>
        </stx:if>
        <stx:assign name="parameter" select="$name"/>
        <stx:assign name="value" select="substring-after($token,'=')"/>
      </stx:if>
      <stx:else>
        <!-- token is part of the previous value -->
        <stx:assign name="value" select="concat($value,'|',$token)"/>
      </stx:else>
    </stx:for-each-item>

    <stx:if test="$parameter">
      <!-- handle last parameter -->
      <param name="{$parameter}">
        <stx:value-of select="normalize-space($value)"/>
      </param>
    </stx:if>
  </stx:procedure>

</stx:transform>