Useless Perl scripts

I was asked to post the Perl scripts I wrote to do Wikipedia stuff somewhere, so here they are. I hereby place these in the public domain.

I am not responsible for any bugs or errors in these. Use these at your own risk.

required.pl

The next two scripts (whoami.pl and global_replace.pl) need this one.

#!/usr/bin/perl

use strict;

package Req;

use warnings;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request;
use HTTP::Request::Common;

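# HTML entities that turn up in downloaded wikitext, mapped to the characters they stand for.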
our $htmlentityhash = { 'lt' => '<', 'gt' => '>', 'amp' => '&', 'quot' => '"',

                        'aacute' => 'á', 'agrave' => 'à', 'acirc' => 'â', 'auml' => 'ä',
                        'eacute' => 'é', 'egrave' => 'è', 'ecirc' => 'ê', 'euml' => 'ë',
                        'iacute' => 'í', 'igrave' => 'ì', 'icirc' => 'î', 'iuml' => 'ï',
                        'oacute' => 'ó', 'ograve' => 'ò', 'ocirc' => 'ô', 'ouml' => 'ö',
                        'uacute' => 'ú', 'ugrave' => 'ù', 'ucirc' => 'û', 'uuml' => 'ü',
                        'yacute' => 'ý', 'atilde' => 'ã', 'otilde'=> 'õ', 'oslash' => 'ø',
                        'ntilde' => 'ñ', 'ccedil' => 'ç', 'aring' => 'å', 'yuml' => 'ÿ',
                        'aelig' => 'æ',  'oelig' => 'œ',  'szlig' => 'ß',

                        'Aacute' => 'Á', 'Agrave' => 'À', 'Acirc' => 'Â', 'Auml' => 'Ä',
                        'Eacute' => 'É', 'Egrave' => 'È', 'Ecirc' => 'Ê', 'Euml' => 'Ë',
                        'Iacute' => 'Í', 'Igrave' => 'Ì', 'Icirc' => 'Î', 'Iuml' => 'Ï',
                        'Oacute' => 'Ó', 'Ograve' => 'Ò', 'Ocirc' => 'Ô', 'Ouml' => 'Ö',
                        'Uacute' => 'Ú', 'Ugrave' => 'Ù', 'Ucirc' => 'Û', 'Uuml' => 'Ü',
                        'Yacute' => 'Ý', 'Atilde' => 'Ã', 'Otilde'=> 'Õ', 'Oslash' => 'Ø',
                        'Ntilde' => 'Ñ', 'Ccedil' => 'Ç', 'Aring' => 'Å',
                        'AElig' => 'Æ',  'OElig' => 'Œ',

                        'thorn' => 'þ', 'Thorn' => 'Þ', 'THORN' => 'Þ',
                        'eth' => 'ð', 'Eth' => 'Ð', 'ETH' => 'Ð',

                        'scaron' => 'š', 'zcaron' => 'ž', 'Scaron' => 'Š', 'Zcaron' => 'Ž', 
                        'sup2' => '²', 'sup3' => '³', 'times' => '×', 'div' => '÷',
                    };

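# Fetch a URL (GET) and return the body, or an HTML error message on failure.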
sub download
{
    my $url = shift;
    my $ua = LWP::UserAgent->new ('agent' => 'Timwiscript');
    $ua->cookie_jar (HTTP::Cookies->new (file => "cookies.txt", autosave => 1));
    my $request = HTTP::Request->new ('GET' => $url);
    my $response = $ua->request ($request);
    if ($response->is_success) {
        return $response->content;
    } else {
        return $response->error_as_HTML;
    }
}

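# Download the wikitext of a page via action=edit; returns { text, edittime }, or undef on failure.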
sub downloadsource
{
    my $pagename = esc(shift);
    my $opts = shift;
    my $domain = $opts->{'domain'} || "en.wikipedia.org";
    my $page = download ("http://$domain/w/wiki.phtml?title=$pagename&action=edit");
    if ($page =~ /<textarea .*? name="wpTextbox1" .*?>(.*?)<\/textarea>/s) {
        my $text = $1;
        $text =~ s/&(\w+);/ $htmlentityhash->{$1} || "&$1;" /eg;   # decode known entities, leave the rest alone
        if ($page =~ /<input type=hidden value="(.*?)" name="wpEdittime">/) {
            my $edittime = $1;
            return { text => $text, edittime => $edittime };
        }
    }
    return undef;
}

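# POST a form (arrayref of name => value pairs) to a URL and return the response body.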
sub postform
{
    my $url = shift;
    my $formdata = shift;  # arrayref
    my $ua = LWP::UserAgent->new ('agent' => 'Timwiscript');
    $ua->cookie_jar (HTTP::Cookies->new (file => "cookies.txt", autosave => 1));
    my $response = $ua->request(POST $url, $formdata);
    my $cont = $response->content;
    return $cont;
}

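# Decode a page name: %XX escapes, HTML entities, underscores to spaces, first letter uppercased.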
sub dec
{
    my $a = $_[0];
    $a =~ s/%([a-fA-F0-9][a-fA-F0-9])/ chr (hex $1) /eg;
    $a =~ s/&(\w+);/ $htmlentityhash->{$1} or "&$1;" /seg;
    $a =~ s/&#(\d+);/ chr $1 /seg;
    $a =~ s/&#x([a-fA-F0-9]+);/ chr (hex $1) /seg;
    $a =~ tr/_/ /;
    $a =~ s/^(.)/ uc $1 /e;
    return $a;
}

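# Escape a page name for use in a URL: first letter uppercased, spaces to underscores, %XX-escape the rest.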
sub esc
{
    my $a = $_[0];
    $a =~ s/^(.)/ uc $1 /e;
    my @hex = ("0","1","2","3","4","5","6","7","8","9","A","B","C","D","E","F");
    $a =~ tr/ /_/;
    $a =~ s/([^a-zA-Z0-9_-])/'%'.$hex[(ord($1)>>4)%16].$hex[ord($1)%16]/eg;
    return $a;
}

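# Download a page's edit form to get wpEdittime, then submit $newtext with the given summary.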
sub submitedit
{
    my $title = shift;
    my $newtext = shift;
    my $summary = shift;
    my $minor = (shift) ? 1 : 0;
    my $etitle = esc ($title);

    print qq~Downloading "$title"...\n~;
    my $tpage = Req::download ("http://en.wikipedia.org/w/wiki.phtml?title=$etitle&action=edit");
    if ($tpage =~ /<input type=hidden value="(.*?)" name="wpEdittime">/)
    {
        my $edittime = $1;
        my $posturl = "http://en.wikipedia.org/w/wiki.phtml?title=$etitle&action=submit";
        my $attr = [ 'wpTextbox1'   => qq~$newtext~,
                     'wpMinoredit'  => $minor,
                     'wpSave'       => "Save page",
                     'wpSummary'    => $summary,
                     'wpEdittime'   => $edittime,
                   ];
        print qq~Submitting edit to "$title"...\n~;
        Req::postform ($posturl, $attr);
    }
}

1;
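
In case you want to call this from a script of your own, something like the following should do. This is only a sketch: "Wikipedia:Sandbox" stands in for whatever page you actually want to touch, it assumes required.pl sits in the current directory, and it assumes you have already logged in with whoami.pl so that cookies.txt exists.

#!/usr/bin/perl
use strict;
require "required.pl";

# Fetch the current wikitext of a page; downloadsource returns undef on failure.
my $page = Req::downloadsource ("Wikipedia:Sandbox", { 'domain' => 'en.wikipedia.org' });
die "Could not download the page.\n" unless $page;
print "Got " . length ($page->{'text'}) . " characters of wikitext.\n";

# Append a line and save the page back, marked as a minor edit.
Req::submitedit ("Wikipedia:Sandbox", $page->{'text'} . "\n\nTesting.", "test edit", 1);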

whoami.pl

Checks if you're logged in, and if not, lets you do so.

Command-line syntax: just "whoami.pl" will do. You can also say "whoami.pl login" if you want to log in without checking first, and "whoami.pl site de" (etc.) to use a Wikipedia other than the English one.
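For example, "whoami.pl site de" checks who you are on the German Wikipedia, and "whoami.pl site de login" logs you in there straight away.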

#!/usr/bin/perl

use strict;
require "required.pl";

use Term::ReadKey;

my $command = 'whoami';
my $site = 'en';
while (my $cmd = shift)
{
    if ($cmd eq 'site') {
        $site = shift;
    } else {
        $command = $cmd;
    }
}

my $baseurl = "http://${site}.wikipedia.org";
my $loginpage = "Special:Userlogin";

my $do_login = sub
{
    print "Username: ";
    my $username = ReadLine (0);
    chomp $username;
    ReadMode('noecho');
    print "Password: ";
    my $password = ReadLine(0);
    ReadMode('restore');
    chomp $password;
    print "\nLogging in...\n";
    my $loginurl = "$baseurl/w/wiki.phtml?title=$loginpage&action=submit";
    my $attr = [ 'wpName' => $username, 'wpPassword' => $password,
                 'wpLoginattempt' => "Anmelden", 'wpRemember' => '1' ];   # "Anmelden" is German for "Log in"
    my $result = Req::postform ($loginurl, $attr);
    print "${result}Done.\n";
};

if ($command eq 'whoami')
{
    print "Downloading main page...\n";
    my $mainpage = Req::download ("$baseurl/");
    if ($mainpage =~ /valign='top' align='right' nowrap><a href="(http:\/\/\w+\.wikipedia\.org)?\/wiki\/User:(.*?)" class='internal'/s) {
        print "You are currently logged in as $2.\n";
    } else {
        print "$mainpage\n\n\nYou are not logged in. Do you want to log in now? ";
        my $ans = ReadLine (0);
        $do_login->() if lc (substr ($ans, 0, 1)) eq 'y';
    }
}
elsif ($command eq 'login')
{
    $do_login->();
}
else
{
    print "Unknown command: $command\n";
}

global_replace.pl

Replaces instances of one word with another in a given set of Wikipedia pages.

Command line syntax:

  • global_replace.pl 'word x' 'word y' url 'http://...etc...' - downloads the given URL, harvests Wikipedia links from it, and modifies the pages linked to by replacing 'word x' with 'word y'
  • global_replace.pl 'word x' 'word y' wlh 'pagename' - same as above, but for the URL it uses the "What links here" of the page 'pagename' (i.e. en.wikipedia.org/w/wiki.phtml?title=Special:Whatlinkshere&target=pagename)
  • global_replace.pl 'word x' 'word y' search 'searchterm' - same as above, but for the URL it uses en.wikipedia.org/w/wiki.phtml?search=pagename. Probably no longer works with search disabled and all
  • global_replace.pl 'word x' 'word y' page 'pagename' - modifies just the page pagename itself.
  • As an optional fifth parameter, you can specify an edit summary. By default it uses "$wordx" => "$wordy". The script marks all its edits as minor; comment out the 'wpMinoredit' => 1 line to prevent this. Remove the # from the commented-out system (qq~start ...~) line near the end if you want it to open the history page of every modified page, so you can check that the changes worked (Windows only).
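
For example, global_replace.pl 'Foo Barr' 'Foo Bar' wlh 'Foo Barr' would fetch the "What links here" list for Foo Barr and change every occurrence of "Foo Barr" to "Foo Bar" in the pages that link to it (the page names here are made up, of course).
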
#!/usr/bin/perl

use strict;
use warnings;

require "required.pl";

$| = 1;
my $baseurl = 'http://en.wikipedia.org';

my $count = 0;

my $search = shift;
my $replace = shift;
my $frominfo = shift;
my $fromstuff = shift;
my $summary = shift || qq~"$search" => "$replace"~;

my $harvestfrom;
my $frompage;
if ($frominfo eq 'url') {
    $harvestfrom = $fromstuff;
} elsif ($frominfo eq 'wlh') {
    $harvestfrom = "$baseurl/w/wiki.phtml?title=Special:Whatlinkshere&target=" . Req::esc ($fromstuff);
} elsif ($frominfo eq 'search') {
    $harvestfrom = "$baseurl/w/wiki.phtml?search=" . Req::esc ($fromstuff);
} elsif ($frominfo eq 'page') {
    $frompage = $fromstuff;
}

print qq~Using parameters:
    search  = $search
    replace = $replace
    summary = $summary
    ~;
print $harvestfrom ? "harvestfrom = $harvestfrom\n" : "frompage = $frompage\n";

my $searchresults;
if ($frompage) {
    $searchresults = qq~<a href="/wiki/~ . Req::esc ($frompage) . qq~"~;   # escape the page name like the other modes do
} else {
    print qq~Downloading $harvestfrom...\n~;
    $searchresults = Req::download ($harvestfrom);
    $searchresults =~ s/^.*?<div\s*id='article'>//s;
    $searchresults =~ s!</div>.*$!!s;
}

my $alreadydone = {};

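# Pull the linked page names out of the harvested HTML, one per iteration.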
while ($searchresults =~ s!<a[^>]*?href="?(http://(en|www).wikipedia.org)?/w(iki/|/wiki\.phtml\?title=)(.*?)(&amp;|"|\s|>)!!s) {
    my $pagename = $4;
    next if $pagename =~ /^(User|Wikipedia|Wikipedia_talk|User_talk|MediaWiki_talk):/;
    next if $pagename eq 'Main_Page';
    next if $alreadydone->{$pagename};
    $alreadydone->{$pagename} = 1;
    print qq~Downloading "$pagename"... ~;
    my $tpage = Req::download ("$baseurl/w/wiki.phtml?title=$pagename&action=edit");
    if ($tpage =~ /<textarea .*? name="wpTextbox1" .*?>(.*?)<\/textarea>/s) {
        my $text = $1;
        $text =~ s/&(\w+?);/ $Req::htmlentityhash->{$1} || die "HTML entity unknown: $1" /ge;

        if ($text =~ /\#REDIRECT\s*:?\s*\[\[(.*?)\]\]/is)
        {
            my $redirtod = Req::dec ($1);
            my $redirto = Req::esc ($redirtod);
            print qq~Redirecting to "$redirtod".\n~;
            $searchresults .= qq~<a href="http://en.wikipedia.org/wiki/$redirto">~;
            next;
        }

        my $prevtext = $text;

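        # Replace $search with $replace both in the plain text (between a "]" and the next "[")
        # and inside [[...]] links, tidying piped links whose label just repeats the target.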
        my $didreplace = sub {
            my $replaces = 0;
            my $doreplace = sub {
                my $str = shift;
                $str =~ s/&(\w+?);/$Req::htmlentityhash->{$1} || die "HTML entity unknown: $1" /ge;
                $str =~ tr/_/ /;
                $replaces++ if $str =~ s/\Q$search\E/$replace/g;
                if ($str =~ /^(.*)\|(.*)$/) {
                    my ($a1, $a2) = ($1, $2);
                    return "[[$a1]]" if $a1 eq $a2;
                    return "[[$a1]]$1" if $a2 =~ /^\Q$a1\E(.*)$/;
                }
                return "[[$str]]";
            };
            my $ra = 0;
            while ($text =~ s/((\]|^).*?)\Q$search\E(.*?(\[|$))/$1$replace$3/gs) { $ra = 1; }
            $text =~ s/\[\[(.*?)\]\]/$doreplace->($1)/eg;
            return $ra || $replaces;
        };

        if ($didreplace->()) {
            if ($tpage =~ /<input type=hidden value="(.*?)" name="wpEdittime">/) {
                my $edittime = $1;
                my $posturl = "$baseurl/w/wiki.phtml?title=$pagename&action=submit";
                my $attr = [ 'wpTextbox1'   => $text,
                             'wpMinoredit'  => 1,
                             'wpSave'       => "Save page",
                             'wpSummary'    => $summary,
                             'wpEdittime'   => $edittime,
                           ];
                print qq~Editing "$pagename"...\n~;
                Req::postform ($posturl, $attr);
#               system (qq~start "" "http://www.wikipedia.org/w/wiki.phtml?title=$pagename&action=history"~);
            }
        } else {
            print qq~Nothing changed.\n~;
        }
    }
}

converttables.pl

Converts HTML tables to wiki-syntax tables. Does not upload or download anything; uses STDIN and STDOUT.

Not tested particularly thoroughly. Use with care.
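
For example, a table like

<table><tr><th>Name</th><th>Value</th></tr><tr><td>foo</td><td>1</td></tr></table>

should come out roughly as

{|
! Name
! Value
|-
| foo
| 1
|}

Attributes on the HTML tags (e.g. <table border=1> or <td align=right>) are carried over into the wiki syntax.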

#!/usr/bin/perl

use strict;

$/ = undef;
my $content = <STDIN>;

my $onerow = 0;

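# $conv converts a single <table>...</table> block to wiki table syntax.
# "{{{{{NEWLINE}}}}}" is used as a placeholder for line breaks and only turned into
# real newlines at the very end, after all substitutions are done.
# Set $onerow to 1 if you want all cells of a row on one line (using || and !!).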
my $conv = sub
{
    my $input = shift;
    my $inp = sub
    {
        my $inpu = shift;
#        1 while ($inpu =~ s#(\w*)=(\"|\')?(.*?)\2#$1=$3#gi);
        return $inpu;
    };
    $input =~ s/[\r\n]//sgi;
    $input =~ s!<tr><td></td></tr>!!sgi;
    $input =~ s!<table(.*?)>! $inp->("{{{{{NEWLINE}}}}}{|$1") !sieg;
    $input =~ s!<tr(.*?)>! $inp->("{{{{{NEWLINE}}}}}|-$1")."{{{{{NEWLINE}}}}}" !sieg;
    $input =~ s!{{{{{NEWLINE}}}}}\s*<td>!{{{{{NEWLINE}}}}}| !gim;
    $input =~ s|{{{{{NEWLINE}}}}}\s*<th>|{{{{{NEWLINE}}}}}! |gim;
    $input =~ s#{{{{{NEWLINE}}}}}\s*<caption>#{{{{{NEWLINE}}}}}|+ #gim;
    $input =~ s!{{{{{NEWLINE}}}}}\s*<td\s*(.*?)>! $inp->("{{{{{NEWLINE}}}}}|$1")."| " !eigm;
    $input =~ s#{{{{{NEWLINE}}}}}\s*<th\s*(.*?)># $inp->("{{{{{NEWLINE}}}}}!$1")."| " #eigm;
    $input =~ s#{{{{{NEWLINE}}}}}\s*<caption\s*(.*?)># $inp->("{{{{{NEWLINE}}}}}|+$1")."| " #eigm;
    if ($onerow)
    {
        $input =~ s!<td>! || !gsi;
        $input =~ s#<th># !! #gsi;
    }
    else
    {
        $input =~ s!<td>!{{{{{NEWLINE}}}}}| !gsi;
        $input =~ s#<th>#{{{{{NEWLINE}}}}}! #gsi;
    }
    $input =~ s#<caption>#{{{{{NEWLINE}}}}}|+ #gis;
    $input =~ s!<td\s*(.*?)>! $inp->(" ||$1")."| " !egis;
    $input =~ s#<th\s*(.*?)># $inp->(" !!$1")."| " #egis;
    $input =~ s#<caption\s*(.*?)># $inp->("{{{{{NEWLINE}}}}}|+$1")."| " #egis;
    $input =~ s!</table>!{{{{{NEWLINE}}}}}|}{{{{{NEWLINE}}}}}!sgi;
    $input =~ s!</(t[dhr]|caption)>!!sgi;
    $input =~ s!(\{\|(.*?){{{{{NEWLINE}}}}})\|-{{{{{NEWLINE}}}}}!
        my $catch = $&;
        ($2 =~ /{{{{{NEWLINE}}}}}/) ? $catch : $1
    !sieg;
    return $input;
};

1 while $content =~ s!<table.*?</table>! $_=$&; s|(.*)(<table.*</table>)| $1.$conv->($2) |sieg; $_ !sieg;
$content =~ s!\{\{\{\{\{NEWLINE\}\}\}\}\}!\n!gs;
1 while $content =~ s/\n\n\n/\n\n/sgi;
$content =~ s/\|\}\n\n\|/|}\n|/sgi;
$content =~ s/^\n+//si;
$content =~ s/  +/ /mg;
print $content;

import.pl

Can be used to import a database dump into your database, with a few extra options: skipping lines, stopping at a given line, and a dry-run mode.

The script uses STDIN to read the dump. Notice that it will only work if each line in the dump is a single SQL statement; the "cur" dump has a CREATE TABLE statement at the top which spans several lines, so you'll have to skip that.

While going through the dump, the script will display the first 80 characters of each line. This is useful because it shows if the line is an INSERT statement or not. It will also show the line number; if you run wc -l on your dump first to find out how many lines there are in the file, this gives you a primitive progress meter.

Notice that you have to change the DBI->connect line (line 12 of the script) to match your database name, user, and password.

Command-line syntax: import.pl skip limit notactually

  • skip - if specified, the first skip-1 lines are skipped. (Notice the -1! This means it starts on line skip. If you specify 1, it will not skip anything.)
  • limit - if specified and non-zero, will process only up to line limit. (If limit is less than skip, this means it will not do anything.)
  • notactually - if specified and non-zero, will not actually send anything to the DB, but only display stuff. This is useful to check how many lines you need to skip.
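
For example, import.pl 58 0 0 < cur_table.sql would skip the first 57 lines and import everything from line 58 onwards, while import.pl 1 100 1 < cur_table.sql would merely display lines 1 to 100 without touching the database (the file name and line counts are, of course, only examples).
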
#!/usr/bin/perl

use strict;
use DBI;

$| = 1;

my $skip = (shift) + 0;
my $limit = (shift) + 0;
my $notactually = (shift) + 0;

my $dbh = DBI->connect ("DBI:mysql:databasename", "databaseuser", "databasepassword");

if ($skip > 0)
{
    print qq~Skipping... ~;
    for (1..($skip-1))
    {
        if ($_ % 20 == 0) { print "$_ "; }
        my $skipline = <STDIN>;
    }
    print "\n";
}

my $count = $skip-1;

while (my $line = <STDIN>)
{
    chomp $line;
    $count++;
    print STDERR ("$count: " . (substr ($line, 0, 80)) . "... \n");
    unless ($notactually)
    {
        my $sth = $dbh->do ($line);
        if (my $e = $dbh->errstr) { die $e; }
    }
    exit if ($count >= $limit) && ($limit > 0);
}

print STDERR "Done.\n";

That's all.
