Processing MediaWiki XML with STX/Extract templates

From Meta, a Wikimedia project coordination wiki
Jump to navigation Jump to search
Blue Glass Arrow.svg MediaWiki logo.png
This page should be moved to MediaWiki.org.
Please do not move the page by hand. It will be imported by a MediaWiki.org administrator with the full edit history. In the meantime, you may continue to edit the page as normal.

Here is an STX script that extracts the Personendaten and PND templates from wiki pages of the German Wikipedia (tested with the Joost alpha build of 2005-05-21).

Output[edit]

This is what the output looks like:

<mediawiki>
  <page>
   <title>Title of the first page</title>
   <revision>
     <parsed>
       <template name="name_of_template"/>
       <template name="name_of_template_with_parameters">
         <param name="named_parameter">content</param>
         <param>content</param>  <!-- parameter without name -->
         ...
       </template>
       ...
     </parsed>
   </revision>
  </page>
  ...
</mediawiki>

STX Code[edit]

<?xml version="1.0"?>
<!--
    Extract Personendaten and PND-Numbers out of MediaWiki pages
    Version: 0.1

    When Joost implements regular expressions, life will be easier.
-->
<stx:transform version="1.0"
               xmlns:stx="http://stx.sourceforge.net/2002/ns"
               xmlns:m="http://www.mediawiki.org/xml/export-0.3/"
               pass-through="none"
               output-method="xml"
               exclude-result-prefixes="#all"
>

<!--
  Sequence of the <namespace> nodes seen in the dump header.
  The m:page template compares the prefix of every page title against
  the items of this sequence to decide whether the page is skipped.
-->
<stx:variable name="namespace-prefixes"/>

<!-- fill sequence of namespace-prefixes -->
<stx:template match="m:namespace">
  <!-- only namespaces whose key attribute is not 0 are recorded -->
  <stx:if test="@key!=0">
    <!-- append the current node to the sequence collected so far.
         NOTE(review): the later comparison uses string($p); this presumes
         the string value of the stored node is the namespace name. Confirm
         against the STX processor in use. -->
    <stx:assign name="namespace-prefixes" select="($namespace-prefixes, .)"/>
  </stx:if>
</stx:template>

<!--
  root element:
  wraps whatever the templates for the children emit in a literal
  <mediawiki> result element.
-->
<stx:template match="/m:mediawiki">
  <mediawiki>
    <stx:process-children />
  </mediawiki>    
</stx:template>

<!--
  <siteinfo>: this template writes nothing itself, it only hands the
  children on to the other templates.
-->
<stx:template match="m:siteinfo">    
  <stx:process-children />
  <!-- TODO: add siteinfo and time (using a parameter) -->
</stx:template>

<!--
  <namespaces>: writes nothing itself; the children are handed on so that
  the m:namespace template can collect them.
-->
<stx:template match="m:namespaces">
  <stx:process-children />
</stx:template>

<!--
  Store title and text of a page.
  Both variables are filled while the children of a <page> are processed
  and are read afterwards by the m:page template.
-->
<stx:variable name="page-title"/>
<stx:variable name="page-text"/>

<!-- remember the title; nothing is written to the result here -->
<stx:template match="m:title">
  <stx:assign name="page-title" select="string(.)"/>    
</stx:template>

<!-- remember the wiki text; nothing is written to the result here.
     NOTE(review): relies on string(.) delivering the element's text at
     this point of the stream. Confirm for the processor in use. -->
<stx:template match="m:text">
  <stx:assign name="page-text" select="string(.)"/>
</stx:template>

<!--
  Flag that is true until the first <revision> of a page has been seen.
  The m:page template sets it back to true at the start of every page.
-->
<stx:variable name="first-revision" select="true()"/>

<!-- Only the first <revision> of each page is descended into;
     any further revision of the same page is ignored. -->
<stx:template match="m:revision">
  <stx:if test="$first-revision">
    <!-- clear the flag first, then process the children of this revision -->
    <stx:assign name="first-revision" select="false()"/>  
    <stx:process-children/>
  </stx:if>  
</stx:template>

<!-- Buffer that the m:page template fills with the <template> elements
     found in a page and later replays through the "copy" group. -->
<stx:buffer name="parsed"/>
<!-- Running totals, incremented by the "template" procedure and reported
     by the m:page template in an stx:message:
     pd-count counts Personendaten templates, pnd-count counts PND templates. -->
<stx:variable name="pd-count" select="0"/>
<stx:variable name="pnd-count" select="0"/>
<!-- Set to false by m:page before a page is scanned and to true by the
     "template" procedure as soon as it recognizes a template. -->
<stx:variable name="found-something"/>

<!--
  Handle one <page>.

  1. Process the children so that the m:title / m:revision / m:text
     templates fill $page-title and $page-text.
  2. Skip the page if the part of its title before the first ":" equals
     one of the collected namespace prefixes (we only want namespace 0).
  3. Scan the page text for "{{...}}" and let the "template" procedure
     write what it recognizes into the "parsed" buffer.
  4. If anything was recognized, emit a <page> element holding the title
     and the buffered <template> elements.

  Changes compared with the earlier version:
  - the namespace prefix is no longer written into the result (that was a
    debugging leftover which polluted the output),
  - title and text are cleared before each page, so a page without its own
    title or text cannot pick up the values of the previous page,
  - an unused local variable in the scan loop was removed.
-->
<stx:template match="m:page">
  <!-- reset all per-page state before the children are processed -->
  <stx:assign name="first-revision" select="true()"/>
  <stx:assign name="page-title" select="''"/>
  <stx:assign name="page-text" select="''"/>
  <stx:process-children/>

  <!-- Test for namespace prefix. We only want namespace 0 -->
  <stx:variable name="prefix" select="substring-before($page-title,':')"/>
  <stx:variable name="skip" select="false()"/>
  <stx:if test="$prefix">
    <stx:for-each-item name="p" select="$namespace-prefixes">
      <stx:if test="string($p) = string($prefix)">
        <stx:assign name="skip" select="true()"/>
      </stx:if>
    </stx:for-each-item>
  </stx:if>

  <stx:if test="not($skip)">
    <stx:assign name="found-something" select="false()"/>
    <stx:result-buffer name="parsed" clear="yes">
      <!-- Parse "{{...}}". Does not work with nested templates (FIXME) -->
      <stx:variable name="text" select="$page-text"/>
      <stx:while test="string-length($text) > 0">
        <!-- drop everything up to and including the next "{{";
             if there is none, $text becomes empty and the loop ends
             after one call with empty content -->
        <stx:assign name="text" select="substring-after($text,'{{')"/>
        <stx:call-procedure name="template">
          <stx:with-param name="content" select="substring-before($text,'}}')"/>
        </stx:call-procedure>
        <!-- continue behind the closing "}}" -->
        <stx:assign name="text" select="substring-after($text,'}}')"/>
      </stx:while>
    </stx:result-buffer>

    <stx:if test="$found-something">
      <!-- progress report: Personendaten count "/" PND count so far -->
      <stx:message>
        <stx:value-of select="$pd-count"/>
        <stx:text>/</stx:text>
        <stx:value-of select="$pnd-count"/>
      </stx:message>
      <stx:text>&#10;</stx:text> <!-- newline -->
      <page>
        <title><stx:value-of select="$page-title"/></title>
        <stx:text>&#10;</stx:text> <!-- newline -->
        <revision>
          <!-- TODO: add last-modified etc. -->
          <parsed>
            <stx:process-buffer name="parsed" group="copy"/>
          </parsed>
          <stx:text>&#10;</stx:text> <!-- newline -->
        </revision>
      </page>
      <stx:text>&#10;</stx:text> <!-- newline -->
    </stx:if>
  </stx:if>
</stx:template>

<!--
  Group "copy": used by the m:page template when it replays the "parsed"
  buffer. Elements, attributes and text are re-created one by one instead
  of being copied as they are.
-->
<stx:group name="copy">
  <!-- don't copy xmlns, please! -->  
  <!-- re-create every element under its own name, then its attributes
       and children -->
  <stx:template match="*">
    <stx:element name="{name(.)}">
      <stx:process-attributes/>
      <stx:process-children/>
    </stx:element>
  </stx:template>
  <!-- re-create every attribute with the same name and value -->
  <stx:template match="@*">
    <stx:attribute name="{name(.)}" select="."/>
  </stx:template>
  <!-- text is written through unchanged -->
  <stx:template match="text()">
    <stx:value-of select="."/>
  </stx:template>
</stx:group>

<!--
  Procedure "template": looks at the text found between "{{" and "}}".

  content  the raw text of one template call (required)

  - Text starting with "PND" yields <template name="PND"> holding a single
    unnamed <param> with everything behind the first "|".
  - Text starting with "Personendaten" yields
    <template name="Personendaten">; its parameters are produced by the
    "Personendaten" procedure.
  - Anything else is ignored.

  In both recognized cases the matching counter is incremented and
  $found-something is set to true. Note that the test is a plain prefix
  test, so any template whose text merely begins with one of the two
  names is accepted as well.
-->
<stx:procedure name="template">
  <stx:param name="content" required="yes"/>
  <stx:choose>
    <stx:when test="starts-with($content,'PND')">
      <stx:assign name="pnd-count" select="$pnd-count + 1"/>
      <stx:assign name="found-something" select="true()"/>
      <template name="PND">
        <param>
          <stx:value-of select="normalize-space(substring-after($content,'|'))"/>
        </param>
      </template>
    </stx:when>
    <stx:when test="starts-with($content,'Personendaten')">
      <stx:assign name="pd-count" select="$pd-count + 1"/>
      <stx:assign name="found-something" select="true()"/>
      <template name="Personendaten">
        <stx:call-procedure name="Personendaten">
          <stx:with-param name="text"
               select="normalize-space(substring-after($content,'|'))"/>
        </stx:call-procedure>
      </template>
    </stx:when>
  </stx:choose>
</stx:procedure>


<!--
  Parses template parameters in "{{Personendaten|...}}".

  text  everything behind the first "|" of the template call

  The text is first split at every "|". A token that contains "=" starts
  a new named parameter; a token without "=" is taken to be a piece of
  the previous value that happened to contain a "|" (for example a piped
  wiki link) and is glued back on with the "|" restored. One
  <param name="..."> element is written per named parameter, with its
  value whitespace-normalized. Unnamed parameters are not emitted.

  Known limitation: a "=" inside a value piece that follows a "|" is
  mistaken for the start of a new parameter.

  Fix compared with the earlier version: an empty token (two pipes in a
  row, or a leading pipe) used to be mistaken for "no more pipes", which
  pushed the whole remaining text as one token and then tokenized it a
  second time, producing a bogus duplicate parameter. The split now asks
  contains() whether a pipe is left.
-->
<stx:procedure name="Personendaten">
  <stx:param name="text"/>
  <stx:variable name="tokens"/>

  <!-- tokenize ..."|"..."|"... -->
  <stx:while test="string-length($text) > 0">
    <!-- default: no pipe left, the remainder is the last token -->
    <stx:variable name="before" select="$text"/>
    <stx:if test="contains($text,'|')">
      <!-- may be the empty string when the text starts with a pipe -->
      <stx:assign name="before" select="substring-before($text,'|')"/>
    </stx:if>
    <stx:assign name="tokens" select="($tokens, $before)"/>
    <!-- yields the empty string when no pipe is left, which ends the loop -->
    <stx:assign name="text" select="substring-after($text,'|')"/>
  </stx:while>

  <!-- recognize named parameters (only!) -->
  <stx:variable name="parameter"/>
  <stx:variable name="value"/>
  <stx:for-each-item name="token" select="$tokens">
    <stx:variable name="name" select="normalize-space(substring-before($token,'='))"/>
    <stx:if test="$name">
      <stx:if test="$parameter"> <!-- handle previous parameter -->
        <param name="{$parameter}">
          <stx:value-of select="normalize-space($value)"/>
        </param>
      </stx:if>
      <stx:assign name="parameter" select="$name"/>
      <stx:assign name="value" select="substring-after($token,'=')"/>
    </stx:if>
    <stx:else> <!-- token is part of the previous value -->
      <stx:assign name="value" select="concat($value,'|',$token)"/>
    </stx:else>
  </stx:for-each-item>
  <stx:if test="$parameter"> <!-- handle last parameter -->
    <param name="{$parameter}">
      <stx:value-of select="normalize-space($value)"/>
    </param>
  </stx:if>
</stx:procedure>

</stx:transform>