User:El/History compression
From Meta, a Wikimedia project coordination wiki
< User:El
History compression proposal: All revision texts are split into sections (the delimiter is "\n=="). Unchanged sections are stored only once. Sections are sorted by their headings. Then everything is compressed with deflate().
columns in the following table:
cd = concatenate-deflate (megabytes, old method)
smd = split-merge-deflate (megabytes, proposed method)
| namespace | cd | smd | smd/cd |
|---|---|---|---|
| 0 | 2599 | 1044 | 0.402 |
| 1 | 1126 | 164 | 0.146 |
| 2 | 140 | 59 | 0.421 |
| 3 | 771 | 102 | 0.132 |
| 4 | 5214 | 472 | 0.091 |
| 5 | 391 | 39 | 0.100 |
| 6 | 10 | 12 | 1.2 |
| 7 | 1 | 1 | 1 |
| 8 | 0 | 0 | |
| 9 | 0 | 0 | |
| 10 | 4 | 4 | 1 |
| 11 | 14 | 2 | 0.14 |
| 12 | 2 | 0 | 0 |
| 13 | 0 | 0 | |
| 14 | 2 | 2 | 1 |
| 15 | 7 | 1 | 0.14 |
| 10281 | 1902 | 0.185 |
dumps used: German Wikipedia, 2005-04-06
programs used:
extraction of data from the sql dumps: de:Bild:Mkzsf.ogg (set $ar=1)
compression (both methods):
#!/usr/bin/perl
use Compress::Zlib;
use Digest::MD5 qw(md5_base64);
use bytes;
%unesc = ("\\" => "\\", "n" => "\n", "r" => "\r", "0" => "\0", "Z" => "\x1A", "\"" => "\"", "'" => "'");
exit unless -f $ARGV[0];
open IN, "zcat $ARGV[0]|";
open SMD, ">$ARGV[0].smd";
open CD, ">$ARGV[0].cd";
$nstitle_last="";
$blobnr=0;
while(<IN>) {
chop;
/^(\d+ \S+) \d+ \S+ \S+ \d \d \d+ \d (.*)$/;
$nstitle=$1; $txt = $2;
if(($nstitle ne $nstitle_last && $nRevisions>0) || $nRevisions==20) {
$nSections = $section_nr;
writeBlob() if $blobnr%10==0;
$blobnr++;
$section_nr=0; $heading_nr=0; $concat="";
%section = (); %heading_nr = (); %section_nr = ();
@sections_md5 = ();
$nRevisions=0; $index = "";
$rev_nr=0;
}
$nstitle_last = $nstitle;
$nRevisions++;
next if $blobnr%10>0; # only testing, no need to save every blob
$txt =~ s/\\(.)/$unesc{$1}/g;
$concat .= $txt;
@index = ();
@rev_sections = split(/\n==/, $txt);
for($i=0; $i<@rev_sections; $i++) {
if($i==0) {
$section = $rev_sections[$i];
$heading = "";
} else {
$section = "\n==".$rev_sections[$i];
$section =~ /^\n==[^\n]*?/s;
$heading = $&;
}
$section = ($i==0?"":"\n==").$rev_sections[$i];
if($section =~ /^\n==[^\n]*?/s) {
$heading = $&;
} else {
$heading = "";
}
$section_md5 = md5_base64($section);
$section{$section_md5} = $section;
$heading_nr{$heading} = $heading_nr++ unless defined $heading_nr{$heading};
unless (defined $section_nr{$section_md5}) {
$section_nr{$section_md5} = $section_nr++;
push @{$sections_md5[$heading_nr{$heading}]}, $section_md5;
}
push @index, $section_nr{$section_md5};
}
$index .= join(" ", @index) . "\n";
}
close IN; close SMD; close CD;
sub writeBlob {
my $sections = ""; my $pos=0;
my %poslen;
foreach my $sections_md5 (@sections_md5) {
foreach my $section_md5 (@$sections_md5) {
my $len = length $section{$section_md5};
$sections .= $section{$section_md5};
$poslen[$section_nr{$section_md5}] = "$pos $len\n";
$pos += $len;
}
}
my $poslen = "";
for(my $section_nr=0; $section_nr<$nSections; $section_nr++) {
$poslen .= $poslen[$section_nr];
}
my $index_len = length($index);
my $poslen_len = length($poslen);
my $blob = sprintf "%08d %08d %08d %08d %08d\n", $nRevisions, $nSections, $index_len, $poslen_len, 1;
$blob .= "$index$poslen$sections";
my ($refinf, $status) = deflateInit();
my ($blobgz_a, $status) = $refinf->deflate($blob);
my ($blobgz_b, $status) = $refinf->flush();
print SMD $blobgz_a.$blobgz_b;
my ($cd_a, $cd_b);
($refinf, $status) = deflateInit();
($cd_a, $status) = $refinf->deflate($concat);
($cd_b, $status) = $refinf->flush();
print CD $cd_a.$cd_b;
}
decompression (This program decompresses exactly one Blob.):
#!/usr/bin/perl
use bytes;
use Compress::Zlib;
open IN, $ARGV[0];
while(<IN>) {
$buf .= $_;
}
close IN;
($refinf, $status) = inflateInit();
($buf, $status) = $refinf->inflate($buf);
#print $buf;
($nRevisions, $nSections, $index_len, $poslen_len, $defaultRevision) = $buf =~ /^(\d{8}) (\d{8}) (\d{8}) (\d{8}) (\d{8})\n/s;
@index = split(/\n/, substr($buf, 45, $index_len));
@poslen = split(/\n/, substr($buf, 45+$index_len, $poslen_len));
$pos0 = 45+$index_len+$poslen_len;
open RAW, ">$ARGV[0].raw";
for($revision=0; $revision<$nRevisions; $revision++) {
@section_nr = split(/ /, $index[$revision]);
$txt = "";
foreach $section_nr (@section_nr) {
($pos, $len) = split(/ /, $poslen[$section_nr]);
$txt .= substr($buf, $pos0+$pos, $len);
}
print RAW $txt;
}
close RAW;