Processing MediaWiki XML with STX/Page ids

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search

This script returns a tab-separated list of the id, namespace and title of each page. You first need to run the script that adds namespaces as a filter, and then process its output with this script. If the first script is in the file AddNamespaces.stx and the second in TabbedPageIds.stx, just call:

zcat pages_current.xml.gz | java -jar joost.jar - AddNamespaces.stx TabbedPageIds.stx

Output[edit]

3       8       MediaWiki:Categories
4       8       MediaWiki:Category
5       8       MediaWiki:Category header
...

STX code[edit]

(see source of this article for the right version)

<stx:transform version="1.0"
               xmlns:stx="http://stx.sourceforge.net/2002/ns"
               xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
               pass-through="none"
               output-method="text"
>

<!--
  Writes one text line per m:page element:

      page id, TAB, page namespace, TAB, page title, LF

  Only elements in the namespace bound to the "m" prefix are matched, so
  that URI has to be the one used by the dump being processed.

  The namespace column is read from a "namespace" attribute on m:title.
  NOTE(review): that attribute is presumably added by the upstream
  AddNamespaces.stx filter mentioned in the usage notes; confirm.
-->

<!-- Per-page state, filled in while the children of a page are processed. -->
<stx:variable name="page-id" select="''"/>
<stx:variable name="page-namespace" select="''"/>
<stx:variable name="page-title" select="''"/>

<stx:template match="m:page">
  <!-- Reset every field so that a page lacking an id or a title yields an
       empty column instead of a sentinel or the previous page's value. -->
  <stx:assign name="page-id" select="''"/>
  <stx:assign name="page-namespace" select="''"/>
  <stx:assign name="page-title" select="''"/>

  <!-- Let the child templates below collect id, namespace and title. -->
  <stx:process-children/>

  <!-- All children have been seen; emit the record.
       Character references keep the tab and line feed visible and safe
       from editors or renderers that rewrite whitespace. -->
  <stx:value-of select="$page-id"/>
  <stx:text>&#9;</stx:text>
  <stx:value-of select="$page-namespace"/>
  <stx:text>&#9;</stx:text>
  <stx:value-of select="$page-title"/>
  <stx:text>&#10;</stx:text>
</stx:template>

<!-- Only the id that is a direct child of the page is the page id; ids
     nested deeper (revision or contributor ids) must not overwrite it. -->
<stx:template match="m:page/m:id">
  <stx:assign name="page-id" select="."/>
</stx:template>

<stx:template match="m:page/m:title">
  <stx:assign name="page-namespace" select="@namespace"/>
  <stx:assign name="page-title" select="."/>
</stx:template>

<!-- Do not descend into revisions: nothing inside them is needed here,
     and they carry ids of their own. -->
<stx:template match="m:revision"/>

</stx:transform>