User:El~metawiki/History compression

From Meta, a Wikimedia project coordination wiki

History compression proposal: All revision texts are split into sections (the delimiter is "\n=="). Unchanged sections are stored only once. Sections are sorted by their headings. Then everything is compressed with deflate().

columns in the following table:

cd = concatenate-deflate (megabytes, old method)

smd = split-merge-deflate (megabytes, proposed method)

namespace cd smd smd/cd
0 2599 1044 0.402
1 1126 164 0.146
2 140 59 0.421
3 771 102 0.132
4 5214 472 0.091
5 391 39 0.100
6 10 12 1.2
7 1 1 1
8 0 0
9 0 0
10 4 4 1
11 14 2 0.14
12 2 0 0
13 0 0
14 2 2 1
15 7 1 0.14
10281 1902 0.185


dumps used: German Wikipedia, 2005-04-06

programs used:

extraction of data from the sql dumps: de:Bild:Mkzsf.ogg (set $ar=1)

compression (both methods):

#!/usr/bin/perl

use Compress::Zlib;
use Digest::MD5 qw(md5_base64);
use bytes;

%unesc = ("\\" => "\\", "n"  => "\n", "r"  => "\r", "0"  => "\0", "Z"  => "\x1A", "\"" => "\"", "'"  => "'");

exit unless -f $ARGV[0];
open IN, "zcat $ARGV[0]|";
open SMD, ">$ARGV[0].smd";
open CD, ">$ARGV[0].cd";

$nstitle_last="";
$blobnr=0;
while(<IN>) {
  chop;
  /^(\d+ \S+) \d+ \S+ \S+ \d \d \d+ \d (.*)$/;
  $nstitle=$1; $txt = $2;
  if(($nstitle ne $nstitle_last && $nRevisions>0) || $nRevisions==20) {
    $nSections = $section_nr;
    writeBlob() if $blobnr%10==0;
    $blobnr++;
    $section_nr=0; $heading_nr=0; $concat="";
    %section = (); %heading_nr = (); %section_nr = ();
    @sections_md5 = ();
    $nRevisions=0; $index = "";
    $rev_nr=0;
  }
  $nstitle_last = $nstitle;
  $nRevisions++;
  next if $blobnr%10>0; # only testing, no need to save every blob
  $txt =~  s/\\(.)/$unesc{$1}/g;
  $concat .= $txt;
  @index = ();
  @rev_sections = split(/\n==/, $txt);
  for($i=0; $i<@rev_sections; $i++) {
    if($i==0) {
      $section = $rev_sections[$i];
      $heading = "";
    } else {
      $section = "\n==".$rev_sections[$i];
      $section =~ /^\n==[^\n]*?/s;
      $heading = $&;
    }
    $section = ($i==0?"":"\n==").$rev_sections[$i];
    if($section =~ /^\n==[^\n]*?/s) {
      $heading = $&;
    } else {
      $heading = "";
    }
    $section_md5 = md5_base64($section);
    $section{$section_md5} = $section;
    $heading_nr{$heading} = $heading_nr++ unless defined $heading_nr{$heading};
    unless (defined $section_nr{$section_md5}) {
      $section_nr{$section_md5} = $section_nr++;
      push @{$sections_md5[$heading_nr{$heading}]}, $section_md5;
    }
    push @index, $section_nr{$section_md5};
  }
  $index .= join(" ", @index) . "\n";
}
close IN; close SMD; close CD;

sub writeBlob {
  my $sections = ""; my $pos=0;
  my %poslen;
  foreach my $sections_md5 (@sections_md5) {
    foreach my $section_md5 (@$sections_md5) {
      my $len = length $section{$section_md5};
      $sections .= $section{$section_md5};
      $poslen[$section_nr{$section_md5}] = "$pos $len\n";
      $pos += $len;
    }
  }
  
  my $poslen = "";
  for(my $section_nr=0; $section_nr<$nSections; $section_nr++) {
    $poslen .= $poslen[$section_nr];
  }
  
  my $index_len  = length($index);
  my $poslen_len = length($poslen);
  my $blob = sprintf "%08d %08d %08d %08d %08d\n", $nRevisions, $nSections, $index_len, $poslen_len, 1;
  $blob .= "$index$poslen$sections";
  my ($refinf, $status) = deflateInit();
  my ($blobgz_a, $status) = $refinf->deflate($blob);
  my ($blobgz_b, $status) = $refinf->flush();
  print SMD $blobgz_a.$blobgz_b;
  my ($cd_a, $cd_b);
  ($refinf, $status) = deflateInit();
  ($cd_a,  $status) = $refinf->deflate($concat);
  ($cd_b,  $status) = $refinf->flush();
  print CD $cd_a.$cd_b;
}

decompression (This program decompresses exactly one Blob.):

#!/usr/bin/perl

use bytes;
use Compress::Zlib;

open IN, $ARGV[0];
while(<IN>) {
  $buf .= $_;
}
close IN;
($refinf, $status) = inflateInit();
($buf, $status)    = $refinf->inflate($buf);
#print $buf;

($nRevisions, $nSections, $index_len, $poslen_len, $defaultRevision) = $buf =~ /^(\d{8}) (\d{8}) (\d{8}) (\d{8}) (\d{8})\n/s;
@index  = split(/\n/, substr($buf, 45, $index_len));
@poslen = split(/\n/, substr($buf, 45+$index_len, $poslen_len));
$pos0 = 45+$index_len+$poslen_len;

open RAW, ">$ARGV[0].raw";
for($revision=0; $revision<$nRevisions; $revision++) {
  @section_nr = split(/ /, $index[$revision]);
  $txt = "";
  foreach $section_nr (@section_nr) {
    ($pos, $len) = split(/ /, $poslen[$section_nr]);
    $txt .= substr($buf, $pos0+$pos, $len);
  }
  print RAW $txt;
}
close RAW;