Processing MediaWiki XML with STX/Page ids
From Meta, a Wikimedia project coordination wiki
This script returns a tab-separated list of the id, namespace and title of each page. You first need to run the script that adds namespaces (AddNamespaces) as a filter and then process its output with this script. If the first script is in the file AddNamespaces.stx and the second in TabbedPageIds.stx, just call:
zcat pages_current.xml.gz | java -jar joost.jar - AddNamespaces.stx TabbedPageIds.stx
[edit] Output
3	8	MediaWiki:Categories
4	8	MediaWiki:Category
5	8	MediaWiki:Category header
...
[edit] STX code
(see source of this article for the right version)
<stx:transform version="1.0"
               xmlns:stx="http://stx.sourceforge.net/2002/ns"
               xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
               pass-through="none"
               output-method="text">

  <!--
    Writes one text line per m:page element of a MediaWiki XML export:

        page id, TAB, namespace, TAB, title, LF

    The namespace is read from the "namespace" attribute of m:title, so the
    input must already carry that attribute (this sheet does not compute it).
    Because pass-through is "none", nothing from the input reaches the output
    except what the templates below write explicitly.
  -->

  <!-- Values collected while the children of the current page are streamed. -->
  <stx:variable name="page-id"/>
  <stx:variable name="page-namespace"/>
  <stx:variable name="page-title"/>

  <!-- One output record per page. -->
  <stx:template match="m:page">
    <!-- Reset every field so that a page lacking an id or a title yields an
         empty column instead of the literal text "false" or a value left
         over from the previous page. -->
    <stx:assign name="page-id" select="''"/>
    <stx:assign name="page-namespace" select="''"/>
    <stx:assign name="page-title" select="''"/>

    <!-- Let the child templates fill in the variables ... -->
    <stx:process-children/>

    <!-- ... then emit the record. The separators are written as character
         references (&#9; = tab, &#10; = line feed) so that they cannot be
         turned into plain spaces when the sheet is copied or reformatted. -->
    <stx:value-of select="$page-id"/>
    <stx:text>&#9;</stx:text>
    <stx:value-of select="$page-namespace"/>
    <stx:text>&#9;</stx:text>
    <stx:value-of select="$page-title"/>
    <stx:text>&#10;</stx:text>
  </stx:template>

  <!-- Only an id that is a direct child of the page is the page id. Ids that
       sit deeper in the page (for example inside a contributor element) must
       not overwrite it. -->
  <stx:template match="m:page/m:id">
    <stx:assign name="page-id" select="."/>
  </stx:template>

  <!-- The title element supplies both the title text and the namespace. -->
  <stx:template match="m:title">
    <stx:assign name="page-namespace" select="@namespace"/>
    <stx:assign name="page-title" select="."/>
  </stx:template>

  <!-- Do not descend into revisions: nothing inside them is needed here,
       and they also contain ids. -->
  <stx:template match="m:revision"/>

</stx:transform>