Processing MediaWiki XML with STX/Extract templates

From Meta, a Wikimedia project coordination wiki
Jump to: navigation, search
Blue Glass Arrow.svg MediaWiki logo.png
This page should be moved to MediaWiki.org.
Please do not move the page by hand. It will be imported by a MediaWiki.org administrator with the full edit history. In the meantime, you may continue to edit the page as normal.

Here is an STX script that extracts the Personendaten and PND templates from wiki pages of the German Wikipedia (tested with the Joost alpha build of 2005-05-21).

Output[edit]

This is what the output looks like:

<mediawiki>
  <page>
   <title>Title of the first page</title>
   <revision>
     <parsed>
       <template name="name_of_template"/>
       <template name="name_of_template_with_parameters">
         <param name="named_parameter">content</param>
         <param>content</param>  <!-- parameter without name -->
         ...
       </template>       
       ...
     </parsed>
   </revision>
  </page>
  ...
</mediawiki>

STX Code[edit]

<?xml version="1.0"?>
<!--
    Extract Personendaten and PND-Numbers out of MediaWiki pages
    Version: 0.1

    When Joost implements regular expressions, life will be easier.
-->
<stx:transform version="1.0"
               xmlns:stx="http://stx.sourceforge.net/2002/ns"
               xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
               pass-through="none"
               output-method="xml"
               exclude-result-prefixes="#all"
>

<!-- Sequence of the matched m:namespace nodes. The m:page template reads it
     to recognise page titles that start with a namespace prefix. -->
<stx:variable name="namespace-prefixes"/>

<!-- fill sequence of namespace-prefixes -->
<stx:template match="m:namespace">
  <!-- The namespace with key 0 is left out of the sequence. -->
  <stx:if test="@key!=0">
    <stx:assign name="namespace-prefixes" select="($namespace-prefixes, .)"/>
  </stx:if>
</stx:template>

<!-- root element: wraps everything produced for the children of the input
     m:mediawiki element in a single, un-namespaced <mediawiki> element -->
<stx:template match="/m:mediawiki">
  <mediawiki>
    <stx:process-children />
  </mediawiki>    
</stx:template>

<!-- Emits nothing itself; it only descends into the children of m:siteinfo
     so that the m:namespaces / m:namespace templates get a chance to run. -->
<stx:template match="m:siteinfo">    
  <stx:process-children />
  <!-- TODO: add siteinfo and time (using a parameter) -->
</stx:template>

<!-- Pure pass-down: processes the children of m:namespaces, which is where
     the m:namespace template collects the namespace prefixes. -->
<stx:template match="m:namespaces">
  <stx:process-children />
</stx:template>

<!-- Store title and text of a page.
     Both variables are global: they are written by the two templates below
     and read by the m:page template after it has processed its children. -->
<stx:variable name="page-title"/>
<stx:variable name="page-text"/>

<!-- Remembers the string value of the current m:title; produces no output. -->
<stx:template match="m:title">
  <stx:assign name="page-title" select="string(.)"/>    
</stx:template>

<!-- Remembers the string value of the current m:text; produces no output. -->
<stx:template match="m:text">
  <stx:assign name="page-text" select="string(.)"/>
</stx:template>

<!-- Guard flag: true until the first m:revision of a page has been seen.
     The m:page template sets it back to true at the start of every page. -->
<stx:variable name="first-revision" select="true()"/>

<!-- Processes the children of the first m:revision only; any further
     m:revision encountered while the flag is false is ignored. -->
<stx:template match="m:revision">
  <stx:if test="$first-revision">
    <!-- Clear the flag before descending so later revisions are skipped. -->
    <stx:assign name="first-revision" select="false()"/>  
    <stx:process-children/>
  </stx:if>  
</stx:template>

<!-- Buffer that receives the <template> elements found in one page; the
     m:page template fills it and replays it through the "copy" group. -->
<stx:buffer name="parsed"/>
<!-- Running totals of Personendaten / PND templates found so far; they are
     incremented in the "template" procedure and reported via stx:message. -->
<stx:variable name="pd-count" select="0"/>
<stx:variable name="pnd-count" select="0"/>
<!-- Per-page flag: set to true by the "template" procedure when it emits
     something, reset to false by the m:page template before parsing. -->
<stx:variable name="found-something"/>

<!--
  Handles one m:page element.

  1. Processes the children, which stores the title and the text of the
     first revision in $page-title / $page-text.
  2. Skips the page if the part of the title before the first ':' equals
     one of the collected namespace prefixes.
  3. Otherwise scans the page text for "{{...}}" spans and hands each span
     to the "template" procedure, collecting its output in the "parsed"
     buffer.
  4. Only if the procedure reported a hit ($found-something) writes a
     <page> element with the title and the buffered templates, and sends
     the running Personendaten/PND counters to stx:message.

  Nothing is written to the result for skipped pages or pages without a
  recognised template.
-->
<stx:template match="m:page">
  <!-- Re-arm the guard so that the first revision of this page is read. -->
  <stx:assign name="first-revision" select="true()"/>
  <stx:process-children />

  <!-- Test for namespace prefix. We only want namespace 0 -->
  <stx:variable name="prefix" select="substring-before($page-title,':')"/>
  <stx:variable name="skip" select="false()"/>
  <stx:if test="$prefix">
    <!-- A colon alone is not enough (titles in namespace 0 may contain
         one); the prefix has to match a known namespace name. -->
    <stx:for-each-item name="p" select="$namespace-prefixes">
      <stx:if test="string($p) = string($prefix)">
        <stx:assign name="skip" select="true()"/>
      </stx:if>
    </stx:for-each-item>
  </stx:if>

  <stx:if test="not($skip)">
    <stx:assign name="found-something" select="false()"/>
    <stx:result-buffer name="parsed" clear="yes">
      <!-- Parse "{{...}}". Does not work with nested templates (FIXME) -->
      <stx:variable name="text" select="$page-text"/>
      <stx:while test="string-length($text) > 0">
        <!-- Drop everything up to and including the next "{{". If there
             is none, $text becomes empty and the loop ends after this
             pass (the procedure is then called with empty content). -->
        <stx:assign name="text" select="substring-after($text,'{{')"/>
        <stx:call-procedure name="template">
          <stx:with-param name="content" select="substring-before($text,'}}')"/>
        </stx:call-procedure>
        <!-- Continue behind the closing "}}". -->
        <stx:assign name="text" select="substring-after($text,'}}')"/>
      </stx:while>
    </stx:result-buffer>

    <stx:if test="$found-something">
      <!-- Progress report: Personendaten count / PND count so far. -->
      <stx:message>
        <stx:value-of select="$pd-count"/>
        <stx:text>/</stx:text>
        <stx:value-of select="$pnd-count"/>
      </stx:message>
      <stx:text>
</stx:text> <!-- newline -->
      <page>
        <title><stx:value-of select="$page-title"/></title>
        <stx:text>
</stx:text> <!-- newline -->
        <revision>
          <!-- TODO: add last-modified etc. -->
          <parsed>
            <!-- Replay the buffered templates through the "copy" group. -->
            <stx:process-buffer name="parsed" group="copy"/>
          </parsed>
          <stx:text>
</stx:text> <!-- newline -->
        </revision>
      </page>
      <stx:text>
</stx:text> <!-- newline -->
    </stx:if>
  </stx:if>
</stx:template>

<!-- Template group used when replaying the "parsed" buffer: rebuilds every
     element, attribute and text node by name and value. -->
<stx:group name="copy">
  <!-- don't copy xmlns, please! -->  
  <!-- Elements are re-created from name(.) and then their attributes and
       children are processed by the templates of this group. -->
  <stx:template match="*">
    <stx:element name="{name(.)}">
      <stx:process-attributes/>
      <stx:process-children/>
    </stx:element>
  </stx:template>
  <!-- Attributes are re-created with the same name and value. -->
  <stx:template match="@*">
    <stx:attribute name="{name(.)}" select="."/>
  </stx:template>
  <!-- Text is written out unchanged. -->
  <stx:template match="text()">
    <stx:value-of select="."/>
  </stx:template>
</stx:group>

<!--
  Inspects the body of one "{{...}}" span and emits a <template> element
  if it is one of the two templates of interest.

  Parameter:
    content  (required) the text between "{{" and "}}".

  Content starting with 'PND' yields <template name="PND"> with a single
  unnamed <param> holding the whitespace-normalised text after the first
  '|'. Content starting with 'Personendaten' yields
  <template name="Personendaten"> whose parameters are produced by the
  "Personendaten" procedure. Any other content produces no output.

  Side effects: increments $pnd-count or $pd-count and sets
  $found-something to true on a hit.

  Note: the test is a plain starts-with(), so any template whose name
  merely begins with 'PND' or 'Personendaten' is matched as well.
-->
<stx:procedure name="template">
  <stx:param name="content" required="yes"/>
  <stx:if test="starts-with($content,'PND')">
    <stx:assign name="pnd-count" select="$pnd-count+1"/>
    <stx:assign name="found-something" select="true()"/>
    <template name="PND">
      <param>
        <stx:value-of select="normalize-space(substring-after($content,'|'))"/>
      </param>
    </template>
  </stx:if>
  <stx:else>
    <stx:if test="starts-with($content,'Personendaten')">
      <stx:assign name="pd-count" select="$pd-count+1"/>
      <stx:assign name="found-something" select="true()"/>
      <template name="Personendaten">  
        <!-- Everything after the first '|' is the parameter list. -->
        <stx:call-procedure name="Personendaten">
          <stx:with-param name="text" 
               select="normalize-space(substring-after($content,'|'))"/>
        </stx:call-procedure>  
      </template>
    </stx:if>
  </stx:else>
</stx:procedure>


<!--
  Parses template parameters in "{{Personendaten|...}}".

  Parameter:
    text  the parameter list, i.e. the template body after the first '|'.

  Output: one <param name="..."> element per named parameter, each holding
  the whitespace-normalised value. Tokens without a '=' are not emitted
  on their own; they are appended (together with the '|' that separated
  them) to the value of the preceding named parameter. Tokens that come
  before the first named parameter are dropped.
-->
<stx:procedure name="Personendaten">
  <stx:param name="text"/>
  <stx:variable name="tokens"/>

  <!-- tokenize ..."|"..."|"... -->
  <stx:while test="string-length($text) > 0">
    <!-- Default: no further '|', so the rest of the text is the token. -->
    <stx:variable name="before" select="$text"/>
    <!-- contains() is needed here: substring-before() alone returns ''
         both when there is no '|' and when the text starts with '|'
         (an empty token), and only the first case means "take the
         whole remainder". -->
    <stx:if test="contains($text,'|')">
      <stx:assign name="before" select="substring-before($text,'|')"/>
    </stx:if>
    <stx:assign name="tokens" select="($tokens, $before)"/>
    <!-- Yields '' when no '|' is left, which terminates the loop. -->
    <stx:assign name="text" select="substring-after($text,'|')"/>
  </stx:while>

  <!-- recognize named parameters (only!) -->
  <stx:variable name="parameter"/>
  <stx:variable name="value"/>
  <stx:for-each-item name="token" select="$tokens">
    <stx:variable name="name" select="normalize-space(substring-before($token,'='))"/>
    <stx:if test="$name">
      <!-- A new name starts here, so the parameter collected so far is
           complete and can be written out. -->
      <stx:if test="$parameter"> <!-- handle previous parameter -->
        <param name="{$parameter}">
          <stx:value-of select="normalize-space($value)"/>
        </param>
      </stx:if>
      <stx:assign name="parameter" select="$name"/>
      <stx:assign name="value" select="substring-after($token,'=')"/>
    </stx:if>
    <stx:else> <!-- token is part of the previous value -->
      <stx:assign name="value" select="concat($value,'|',$token)"/>
    </stx:else>
  </stx:for-each-item>
  <stx:if test="$parameter"> <!-- handle last parameter -->
    <param name="{$parameter}">
      <stx:value-of select="normalize-space($value)"/>
    </param>
  </stx:if>
</stx:procedure>

</stx:transform>