User:Zeno Gantner/Perl hacks
Appearance
(Redirected from User:Zenogantner/Perl hacks)
I used these scripts to enhance an existing translation dictionary with Wikipedia interlink word pairs.
All you need is a working Perl installation and the current Wikipedia database dump in a MySQL database.
The programs are just quick hacks, so no documentation and no warranty whatsoever. Additionally licensed under GNU GPL, Version 2 or later, as published by the Free Software Foundation. --Zenogantner 17:23, 1 Dec 2004 (UTC)
I think these scripts won't work for newer MediaWiki versions like 1.5, as the database schema was changed. --zeno 12:21, 30 Jun 2005 (UTC)
extract_articles.pl
[edit]#!/usr/bin/perl -w
# Extracts all real articles from the Wikipedia database
# Only saves the article ID, the title and the article's text
# into the new table.
use DBI;
my $user = 'db_user';
my $password = 'db_user_pw';
my $step = 1000;
my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);
my $query = $handle->prepare(<<SQL);
SELECT MAX(cur_id) FROM cur
SQL
$query->execute;
my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";
my $drop = $handle->prepare(<<SQL);
DROP TABLE articles
SQL
$drop->execute;
my $create = $handle->prepare(<<SQL);
CREATE TABLE articles (
counter INT(8) UNSIGNED,
cur_id INT(8) UNSIGNED,
cur_title VARCHAR(255) BINARY,
cur_text MEDIUMTEXT,
PRIMARY KEY (counter))
SQL
$create->execute;
$query = $handle->prepare(<<SQL);
SELECT cur_id, cur_title, cur_text
FROM cur
WHERE cur_namespace = 0
AND cur_is_redirect = 0
AND cur_id >= ?
AND cur_id < ?
SQL
$ins = $handle->prepare(<<SQL);
INSERT INTO articles (counter, cur_id, cur_title, cur_text)
VALUES (?, ?, ?, ?)
SQL
$max++;
# increment max to have the last row included (if max mod step == 0)
my $counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
my $high = $low + $step;
print "Query from id $low to $high: ";
print 'Now ';
$query->execute($low, $high);
while (my @row = $query->fetchrow_array()) {
my ($id, $title, $text) = @row;
$ins->execute($counter++, $id, $title, $text);
}
print "$counter entries in the database\n";
}
extract_commas.pl
[edit]#!/usr/bin/perl -w
# extract articles that have commas in their name (mostly geographical names,
# book/movie/album titles, persons)
# This script will most likely make sense if you use English data, as
# English town names often have the form
# TOWN, STATE or TOWN, COUNTRY
my @target_languages = ('de', 'es', 'ca', 'fr');
use DBI;
my $user = 'root';
my $password = 'root';
my $step = 1000;
my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);
my $query = $handle->prepare(<<SQL);
SELECT MAX(counter) FROM interlinks
SQL
$query->execute;
my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";
my $drop = $handle->prepare(<<SQL);
DROP TABLE comma_articles
SQL
$drop->execute;
$drop = $handle->prepare(<<SQL);
DROP TABLE noncomma_articles
SQL
$drop->execute;
$drop = $handle->prepare(<<SQL);
DROP TABLE paranthese_articles
SQL
$drop->execute;
my $create = $handle->prepare(<<SQL);
CREATE TABLE comma_articles (
counter INT(8) UNSIGNED,
cur_id INT(8) UNSIGNED,
cur_title VARCHAR(255) BINARY,
$target_languages[0] VARCHAR(255) BINARY,
$target_languages[1] VARCHAR(255) BINARY,
$target_languages[2] VARCHAR(255) BINARY,
$target_languages[3] VARCHAR(255) BINARY,
PRIMARY KEY (counter))
SQL
$create->execute or die;
$create = $handle->prepare(<<SQL);
CREATE TABLE noncomma_articles (
counter INT(8) UNSIGNED,
cur_id INT(8) UNSIGNED,
cur_title VARCHAR(255) BINARY,
$target_languages[0] VARCHAR(255) BINARY,
$target_languages[1] VARCHAR(255) BINARY,
$target_languages[2] VARCHAR(255) BINARY,
$target_languages[3] VARCHAR(255) BINARY,
PRIMARY KEY (counter));
SQL
$create->execute;
$create = $handle->prepare(<<SQL);
CREATE TABLE paranthese_articles (
counter INT(8) UNSIGNED,
cur_id INT(8) UNSIGNED,
cur_title VARCHAR(255) BINARY,
comment VARCHAR(255) BINARY,
$target_languages[0] VARCHAR(255) BINARY,
$target_languages[1] VARCHAR(255) BINARY,
$target_languages[2] VARCHAR(255) BINARY,
$target_languages[3] VARCHAR(255) BINARY,
PRIMARY KEY (counter));
SQL
$create->execute;
$query = $handle->prepare(<<SQL);
SELECT cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2]
FROM interlinks
WHERE counter >= ?
AND counter < ?
SQL
my $ins1 = $handle->prepare(<<SQL);
INSERT INTO comma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
VALUES (?, ?, ?, ?, ?, ?, ?);
SQL
my $ins2 = $handle->prepare(<<SQL);
INSERT INTO noncomma_articles (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
VALUES (?, ?, ?, ?, ?, ?, ?);
SQL
my $ins3 = $handle->prepare(<<SQL);
INSERT INTO paranthese_articles (counter, cur_id, cur_title, comment, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
VALUES (?, ?, ?, ?, ?, ?, ?, ?);
SQL
$max++;
# increment max to have the last row included (if max mod step == 0)
my $comma_counter = 0;
my $noncomma_counter = 0;
my $paranthese_counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
my $high = $low + $step;
print "Query from id $low to $high: ";
$query->execute($low, $high) or die;
while (my @row = $query->fetchrow_array()) {
my ($id, $title, $l1, $l2, $l3, $l4) = @row;
if ($title =~ /,/) {
$ins1->execute($comma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die;
} else {
$ins2->execute($noncomma_counter++, $id, $title, $l1, $l2, $l3, $l4) or die;
if ($title =~ /\((.+)\)/ ) {
$ins3->execute($paranthese_counter++, $id, $title, $1, $l1, $l2, $l3, $l4) or die;
}
}
}
print "$comma_counter article names with comma, $noncomma_counter without. $paranthese_counter article titles with comments in parantheses\n";
}
extract_interlinks.pl
[edit]#!/usr/bin/perl -w
my $source_language = 'en';
my @target_languages = ('de', 'es', 'ca', 'fr');
use DBI;
my $user = 'root';
my $password = 'root';
my $step = 1000;
my $handle = DBI->connect("dbi:mysql:wikipedia_en", $user, $password);
my $query = $handle->prepare(<<SQL);
SELECT MAX(counter) FROM articles
SQL
$query->execute;
my ($max) = $query->fetchrow_array();
print "Highest article id is $max\n";
my $drop = $handle->prepare(<<SQL);
DROP TABLE interlinks
SQL
$drop->execute;
my $create = $handle->prepare(<<SQL);
CREATE TABLE interlinks (
counter INT(8) UNSIGNED,
cur_id INT(8) UNSIGNED,
cur_title VARCHAR(255) BINARY,
$target_languages[0] VARCHAR(255) BINARY,
$target_languages[1] VARCHAR(255) BINARY,
$target_languages[2] VARCHAR(255) BINARY,
$target_languages[3] VARCHAR(255) BINARY,
PRIMARY KEY (counter))
SQL
$create->execute;
$query = $handle->prepare(<<SQL);
SELECT cur_id, cur_title, cur_text
FROM articles
WHERE counter >= ?
AND counter < ?
SQL
$ins = $handle->prepare(<<SQL);
INSERT INTO interlinks (counter, cur_id, cur_title, $target_languages[0], $target_languages[1], $target_languages[2], $target_languages[3])
VALUES (?, ?, ?, ?, ?, ?, ?)
SQL
$max++;
# increment max to have the last row included (if max mod step == 0)
my $counter = 0;
for (my $low = 0; $low < $max; $low += $step) {
my $high = $low + $step;
print "Query from id $low to $high: ";
$query->execute($low, $high) or die;
my $no_counter = 0;
while (my @row = $query->fetchrow_array()) {
my ($id, $title, $text) = @row;
$title =~ s/\_/ /; # change underscore to whitespace
($l1, $l2, $l3, $l4) = @target_languages;
my %trans;
for my $l (@target_languages) {
if ( $text =~ /\[\[$l:([^\]^\[]+)\]\]/ ) {
my $translation = $1;
$translation =~ s/_/ /;
$trans{$l} = $translation;
}
}
if (%trans) {
$ins->execute($counter++, $id, $title, $trans{$l1}, $trans{$l2}, $trans{$l3}, $trans{$l4}) or warn;
} else {
$no_counter++;
}
}
print "Dropped $no_counter articles without translation, ";
print "now $counter interwiki link sets in the database\n";
}