Processing MediaWiki XML with STX/Page ids

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search

This script returns a tab-separated list of the id, namespace and title of each page. It does not work on the raw dump directly: first run the companion script that adds namespaces as a filter, then process that filter's output with this script. If the first script is saved in the file AddNamespaces.stx and the second in TabbedPageIds.stx, just call:

zcat pages_current.xml.gz | java -jar joost.jar - AddNamespaces.stx TabbedPageIds.stx


3       8       MediaWiki:Categories
4       8       MediaWiki:Category
5       8       MediaWiki:Category header

STX code[edit]

(see source of this article for the right version)

<!--
  Writes one line per <page> element of a MediaWiki XML dump:

      page id, TAB, namespace, TAB, page title, newline

  The namespace is read from a "namespace" attribute on <title>.
  NOTE(review): that attribute is presumably added by the AddNamespaces.stx
  filter that runs before this stylesheet; confirm against that script.

  NOTE(review): the URI bound to the "m" prefix below is an assumption. It
  must be identical to the xmlns value declared on the <mediawiki> root
  element of the dump being processed, otherwise no template will match.
  Adjust the export version number as needed.

  NOTE(review): output-method="text" is requested so that only the
  tab-separated lines are written; confirm that the STX processor in use
  accepts this attribute.
-->
<stx:transform version="1.0"
    xmlns:stx="http://stx.sourceforge.net/2002/ns"
    xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
    output-method="text">

<!-- Declared here but not referenced by any template in this stylesheet. -->
<stx:buffer name="namespaces"/>

<!-- Per-page state; filled in by the child templates while a page is read. -->
<stx:variable name="first-revision"/>
<stx:variable name="page-title"/>
<stx:variable name="page-id"/>
<stx:variable name="page-namespace"/>

<stx:template match="m:page">
  <!-- Reset the state left over from the previous page. -->
  <stx:assign name="first-revision" select="true()"/>
  <stx:assign name="page-id" select="false()"/>

  <!-- Visit the children so that the m:id and m:title templates can store
       their values before the record is written. -->
  <stx:process-children/>

  <!-- Emit the record; &#9; is a tab and &#10; is a line feed. -->
  <stx:value-of select="$page-id"/>
  <stx:text>&#9;</stx:text>
  <stx:value-of select="$page-namespace"/>
  <stx:text>&#9;</stx:text>
  <stx:value-of select="$page-title"/>
  <stx:text>&#10;</stx:text>
</stx:template>

<stx:template match="m:id">
  <!-- first-revision is set to true for every page and is never cleared in
       this stylesheet, so this guard always passes here. Ids inside
       revisions are kept out by the empty m:revision template instead. -->
  <stx:if test="$first-revision">
    <stx:assign name="page-id" select="."/>
  </stx:if>
</stx:template>

<stx:template match="m:title">
  <stx:assign name="page-namespace" select="@namespace"/>
  <stx:assign name="page-title" select="."/>
</stx:template>

<!-- don't process revisions (they also have ids) -->
<stx:template match="m:revision"/>

</stx:transform>