Jump to content

User:Zeno Gantner/Perl hacks

From Meta, a Wikimedia project coordination wiki

I used these scripts to enhance an existing translation dictionary with Wikipedia interlink word pairs.

All you need is a working Perl installation and the current Wikipedia database dump in a MySQL database.

The programs are just quick hacks, so no documentation and no warranty whatsoever. Additionally licensed under GNU GPL, Version 2 or later, as published by the Free Software Foundation. --Zenogantner 17:23, 1 Dec 2004 (UTC)

I think these scripts won't work for newer MediaWiki versions like 1.5, as the database schema was changed. --zeno 12:21, 30 Jun 2005 (UTC)

extract_articles.pl

[edit]
#!/usr/bin/perl -w

# Extracts all real articles from the Wikipedia database
# Only saves the article ID, the title and the article's text
# into the new table.

use DBI;

my $user = 'db_user';
my $password = 'db_user_pw';

my $step = 1000;

my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);

my $query = $handle->prepare(<<SQL);
   SELECT MAX(cur_id) FROM cur
SQL
$query->execute;

my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";

my $drop = $handle->prepare(<<SQL);
  DROP TABLE articles
SQL
$drop->execute;

my $create = $handle->prepare(<<SQL);
   CREATE TABLE articles (
     counter   INT(8) UNSIGNED,
     cur_id    INT(8) UNSIGNED,
     cur_title VARCHAR(255) BINARY,
     cur_text  MEDIUMTEXT,
     PRIMARY KEY (counter))
SQL
$create->execute;



$query = $handle->prepare(<<SQL);
   SELECT cur_id, cur_title, cur_text
   FROM cur 
   WHERE cur_namespace = 0
     AND cur_is_redirect = 0
     AND cur_id >= ?
     AND cur_id < ?
SQL

$ins = $handle->prepare(<<SQL);
   INSERT INTO articles (counter, cur_id, cur_title, cur_text)
   VALUES (?, ?, ?, ?)
SQL


$max++;
# increment max to have the last row included (if max mod step == 0)

my $counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
    my $high = $low + $step;
    print "Query from id $low to $high: ";
    print 'Now ';
    $query->execute($low, $high);
    while (my @row = $query->fetchrow_array()) {
	my ($id, $title, $text) = @row;	
	$ins->execute($counter++, $id, $title, $text);
    }
    print "$counter entries in the database\n";
}

extract_commas.pl

[edit]
#!/usr/bin/perl -w

# extract articles that have commas in their name (mostly geographical names,
# book/movie/album titles, persons)
# This script will most likely make sense if you use English data, as
# English town names often have the form
# TOWN, STATE or TOWN, COUNTRY

my @target_languages = ('de', 'es', 'ca', 'fr');

use DBI;

my $user = 'root';
my $password = 'root';

my $step = 1000;

my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);


my $query = $handle->prepare(<<SQL);
   SELECT MAX(counter) FROM interlinks
SQL
$query->execute;

my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";


my $drop = $handle->prepare(<<SQL);
    DROP TABLE comma_articles
SQL
$drop->execute;

$drop = $handle->prepare(<<SQL);
    DROP TABLE noncomma_articles
SQL
$drop->execute;

$drop = $handle->prepare(<<SQL);
    DROP TABLE paranthese_articles
SQL
$drop->execute;

my $create = $handle->prepare(<<SQL);
   CREATE TABLE comma_articles (
     counter   INT(8) UNSIGNED,
     cur_id    INT(8) UNSIGNED,
     cur_title VARCHAR(255) BINARY,
     $target_languages[0] VARCHAR(255) BINARY,
     $target_languages[1] VARCHAR(255) BINARY,
     $target_languages[2] VARCHAR(255) BINARY,
     $target_languages[3] VARCHAR(255) BINARY,
   PRIMARY KEY (counter))
SQL
$create->execute or die;

$create = $handle->prepare(<<SQL);
   CREATE TABLE noncomma_articles (
     counter   INT(8) UNSIGNED,
     cur_id    INT(8) UNSIGNED,
     cur_title VARCHAR(255) BINARY,
     $target_languages[0] VARCHAR(255) BINARY,
     $target_languages[1] VARCHAR(255) BINARY,
     $target_languages[2] VARCHAR(255) BINARY,
     $target_languages[3] VARCHAR(255) BINARY,
   PRIMARY KEY (counter));
SQL
$create->execute;

$create = $handle->prepare(<<SQL);
   CREATE TABLE paranthese_articles (
     counter   INT(8) UNSIGNED,
     cur_id    INT(8) UNSIGNED,
     cur_title VARCHAR(255) BINARY,
     comment VARCHAR(255) BINARY,
     $target_languages[0] VARCHAR(255) BINARY,
     $target_languages[1] VARCHAR(255) BINARY,
     $target_languages[2] VARCHAR(255) BINARY,
     $target_languages[3] VARCHAR(255) BINARY,
   PRIMARY KEY (counter));
SQL
$create->execute;


$query = $handle->prepare(<<SQL);
   SELECT cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2]
   FROM interlinks
   WHERE counter >= ?
     AND counter < ?
SQL

my $ins1 = $handle->prepare(<<SQL);
   INSERT INTO comma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
   VALUES (?, ?, ?, ?, ?, ?, ?);
SQL

my $ins2 = $handle->prepare(<<SQL);
   INSERT INTO noncomma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
   VALUES (?, ?, ?, ?, ?, ?, ?);
SQL

my $ins3 = $handle->prepare(<<SQL);
INSERT INTO paranthese_articles (counter, cur_id, cur_title, comment, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
   VALUES (?, ?, ?, ?, ?, ?, ?, ?);
SQL

$max++;
# increment max to have the last row included (if max mod step == 0)


my $comma_counter = 0;
my $noncomma_counter = 0;
my $paranthese_counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
    my $high = $low + $step;
    print "Query from id $low to $high: ";
    $query->execute($low, $high) or die;

    while (my @row = $query->fetchrow_array()) {
	my ($id, $title, $l1, $l2, $l3, $l4) = @row;	

	if ($title =~ /,/) {
	    $ins1->execute($comma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die;
	 } else {
	    $ins2->execute($noncomma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die;
	    if ($title =~ /\((.+)\)/ ) {
		$ins3->execute($paranthese_counter++, $id, $title, $1, $l1, $l2, $l3, $l4) or die;
	    }
	}
    }
    print "$comma_counter article names with comma, $noncomma_counter without. $paranthese_counter article titles with comments in parantheses\n";
}

extract_interlinks.pl

[edit]
#!/usr/bin/perl -w

my $source_language = 'en';
my @target_languages = ('de', 'es', 'ca', 'fr');

use DBI;

my $user = 'root';
my $password = 'root';

my $step = 1000;

my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);


my $query = $handle->prepare(<<SQL);
   SELECT MAX(counter) FROM articles
SQL
$query->execute;

my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";



my $drop = $handle->prepare(<<SQL);
  DROP TABLE interlinks
SQL
$drop->execute;

my $create = $handle->prepare(<<SQL);
   CREATE TABLE interlinks (
     counter   INT(8) UNSIGNED,
     cur_id    INT(8) UNSIGNED,
     cur_title VARCHAR(255) BINARY,
     $target_languages[0] VARCHAR(255) BINARY,
     $target_languages[1] VARCHAR(255) BINARY,
     $target_languages[2] VARCHAR(255) BINARY,
     $target_languages[3] VARCHAR(255) BINARY,
     PRIMARY KEY (counter))
SQL
$create->execute;



$query = $handle->prepare(<<SQL);
   SELECT cur_id, cur_title, cur_text
   FROM articles
   WHERE counter >= ?
     AND counter < ?
SQL

$ins = $handle->prepare(<<SQL);
   INSERT INTO interlinks (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
   VALUES (?, ?, ?, ?, ?, ?, ?)
SQL


$max++;
# increment max to have the last row included (if max mod step == 0)

my $counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
    my $high = $low + $step;
    print "Query from id $low to $high: ";
    $query->execute($low, $high) or die;

    my $no_counter = 0;

    while (my @row = $query->fetchrow_array()) {
	my ($id, $title, $text) = @row;	

	$title =~ s/\_/ /; # change underscore to whitespace

	($l1, $l2, $l3, $l4) = @target_languages;

	my %trans;

	for my $l (@target_languages) {
	    if ( $text =~ /\[\[$l:([^\]^\[]+)\]\]/ ) {
		my $translation = $1;
		$translation =~ s/_/ /;
		$trans{$l} = $translation;
	    }
	}

	if (%trans) {
	    $ins->execute($counter++, $id, $title, $trans{$l1}, $trans{$l2}, $trans{$l3}, $trans{$l4}) or warn;
	 } else {
	    $no_counter++;
	}
    }
    print "Dropped $no_counter articles without translation, ";
    print "now $counter interwiki link sets in the database\n";
}