Data dumps/Perl importing script
From Meta, a Wikimedia project coordination wiki
This is a script Tbsmith made to import only pages in certain categories. It works for MediaWiki 1.5.
It is not written up as a tutorial, but it should give you a good idea of how to find the pages you want and import them. It does not import images, although that should be easy to add.
Warning
- This will put pages on your site, but it does not update the links tables or anything else, so it's not perfect. There are scripts in /maintenance/ to rebuild the links tables and search index.
/maintenance/tables.sql is where I got most of my information to build this.
#!/usr/bin/perl
use Parse::MediaWikiDump;
use DBI;
use DBD::mysql;
# Database connection settings -- edit these for your wiki.
my $server   = "localhost";
my $name     = "dbname";
my $user     = "admin";
my $password = "password";
my $dsn      = "DBI:mysql:database=$name;host=$server;";

# $dbh and $pages are deliberately package globals: the rest of the script
# refers to them by these names.
#
# RaiseError makes a failed connect (and every later failed statement on this
# handle) die with the driver's error message, instead of leaving an undefined
# handle or a half-imported page behind.  PrintError is switched off so the
# same error is not reported twice.
our $dbh = DBI->connect($dsn, $user, $password,
    { RaiseError => 1, PrintError => 0 });

# Path of the XML dump to read pages from.
my $dump_file = '/home/pages_articles.xml';
our $pages = Parse::MediaWikiDump::Pages->new($dump_file);
print "Done parsing.\n";
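# Reference list of the accessors mentioned for Parse::MediaWikiDump objects
# (left over from a commented-out debugging "print qq{ ... }").
#
# information about the dump as a whole
#   $pages->sitename
#   $pages->base
#   $pages->generator
#   $pages->case
#   $pages->namespaces
#
# information about a page record
#   $page->redirect
#   $page->categories
#   $page->title
#   $page->id
#   $page->revision_id
#   $page->timestamp
#   $page->username
#   $page->userid
#   $page->minor
#   $page->text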
# good_categories is a file with a list of categories from which you'd like
# to import pages, one category name per line.  Every name read is stored as
# a key of the script-level hash %cat (value 1) for the import loop to look up.
{
    use strict;
    use warnings;

    our %cat;

    my $category_file = 'good_categories';
    open my $fh, '<', $category_file
        or die "Cannot open $category_file: $!\n";

    # Test the read with defined() rather than chomp's return value: chomp
    # returns 0 for a final line that has no trailing newline, which would end
    # the loop early and silently drop that last category.
    while (defined(my $line = <$fh>)) {
        chomp $line;
        next if $line eq '';    # a blank line is not a category name
        $cat{$line} = 1;
    }

    close $fh;
}
# Import loop: stream the pages of the dump and, for every non-redirect page
# that is in at least one wanted category, insert a text row, a page row and a
# revision row, then point the page at that revision.
#
# Uses the script-level globals $dbh (DBI handle), $pages (dump reader) and
# %cat (wanted category names).  Every value is handed to the database through
# placeholders, so page text and titles need no hand-written quote escaping
# and page_len is the length of the text exactly as it is stored.
{
    use strict;
    use warnings;

    our ($dbh, $pages, %cat);

    die "No database connection available\n" unless $dbh;

    # Stop at the first failed statement instead of carrying on with a stale
    # insert id.  The previous setting is restored when this block is left.
    local $dbh->{RaiseError} = 1;

    # 0 is the main namespace.
    my $namespace = 0;

    # This will be the author of the imported pages.
    my $auth   = 'Admin';
    my $authid = 1;

    # Prepared once, executed for every imported page.
    my $count_title = $dbh->prepare(
        'select count(*) from page where page_title = ?');
    my $insert_text = $dbh->prepare(
        'insert into text (old_text) values (?)');
    my $insert_page = $dbh->prepare(
        'insert into page (page_namespace, page_title, page_is_new,'
      . ' page_random, page_latest, page_len)'
      . ' values (?, ?, 1, rand(), 0, ?)');
    my $insert_revision = $dbh->prepare(
        'insert into revision (rev_page, rev_text_id, rev_user, rev_user_text)'
      . ' values (?, ?, ?, ?)');
    my $set_latest = $dbh->prepare(
        'update page set page_latest = ? where page_id = ?');

  PAGE:
    while (defined(my $page = $pages->page)) {
        next PAGE if $page->redirect;

        # This part sees if it matches your category.
        # NOTE(review): categories() is assumed to return an array reference,
        # possibly undef for an uncategorised page -- confirm against the
        # Parse::MediaWikiDump documentation.
        my @page_cats = @{ $page->categories || [] };
        next PAGE unless grep { $cat{$_} } @page_cats;

        # text() hands back a reference to the wikitext.
        my $text_ref = $page->text;
        my $text     = defined $text_ref ? $$text_ref : '';
        my $size     = length $text;

        # Spaces become underscores and every remaining non-word character is
        # dropped.
        # NOTE(review): this also strips punctuation that is legal in titles;
        # it is kept unchanged so the duplicate check below keeps matching
        # pages imported by earlier runs of this script.
        my $title = $page->title;
        $title = '' unless defined $title;
        $title =~ s/ /_/g;
        $title =~ s/\W//g;

        # won't insert a page with the same title as an existing page
        $count_title->execute($title);
        my ($existing) = $count_title->fetchrow_array;
        $count_title->finish;
        next PAGE if $existing > 0;

        $insert_text->execute($text);
        my $text_old_id = $dbh->{'mysql_insertid'};

        # page_latest starts at 0 and is filled in once the revision exists.
        $insert_page->execute($namespace, $title, $size);
        my $page_id = $dbh->{'mysql_insertid'};

        $insert_revision->execute($page_id, $text_old_id, $authid, $auth);
        my $rev_id = $dbh->{'mysql_insertid'};

        $set_latest->execute($rev_id, $page_id);

        print "Page_id($page_id) -- Old_id($text_old_id) -- Rev_id($rev_id)\n",
              "Importing Title: ($title)\n",
              "-----------------------------------------\n";
    }
}