#!/usr/bin/perl -w

# This was written by RevRagnarok (I'm on Wikipedia).
#
# I was having problems with all the split files on a FAT32 partition. I assume
# it is because there were so many of them, plus two extra directory entries
# for each (LFNs).
#
# This simply combines all the rec* files again into larger chunks of N files
# each. I used N = 5, but you can change that below with $combine.
#
# Verification info is at the bottom of this file.
#
# Lastly, I needed to modify the Makefile and remove the "split" from the
# "wikipedia" target.

use strict;

# ---- Configuration ---------------------------------------------------------

# Using: rec13778enwiki-20070802-pages-articles.xml.bz2
my $last   = 13778;           # Number of the last rec* input file
my $lastd  = length $last;    # Digits in the above; derived so the two cannot drift apart
my $date   = 20070802;
my $suffix = "enwiki-${date}-pages-articles.xml.bz2";

my $combine = 5;              # This will combine every 5 into a group
# (If this number makes > 4 digit results, it will not sort nicely)

my $outputdir = '/data/wikipedia/';    # Don't make it the same place...

# ---- Sanity checks ---------------------------------------------------------

# Fail once, up front, rather than once per group inside the shell.
die "\$combine must be a positive integer, not '$combine'\n"
    unless $combine =~ /\A[1-9][0-9]*\z/;
die "Output directory '$outputdir' does not exist\n"
    unless -d $outputdir;

# ---- Main loop -------------------------------------------------------------

my @batch;              # Input file names collected for the current group
my $group_index = 0;    # Number of the next output file

for my $n (1 .. $last) {
    # $suffix goes through %s so that it is never interpreted as a format.
    push @batch, sprintf("rec%0${lastd}d%s", $n, $suffix);
    next if @batch < $combine;

    # The & call form is kept deliberately: catthem() is defined further down
    # in this file, and & sidesteps any prototype check (and the "called too
    # early to check prototype" warning that -w would otherwise raise).
    &catthem(join(' ', @batch), $group_index++);
    @batch = ();
}

# Whatever is left over when $last is not a multiple of $combine.
&catthem(join(' ', @batch), $group_index++) if @batch;

print "All done!\n";

# Concatenate one group of input files into a single numbered output file.
#
#   $_[0]  Whitespace-separated list of input file names. It is handed to the
#          shell as-is, so the names must not need quoting.
#   $_[1]  Group number; becomes the NNNN in "recNNNN.bz2".
#
# The output file is created in $outputdir. Dies if /bin/cat cannot be started
# or does not exit cleanly, instead of silently leaving a short or empty file
# behind. Prints one "." per group as a progress indicator.
#
# No ($$) prototype: the calls in this file use the &catthem(...) form, which
# ignores prototypes anyway.
sub catthem {
    my ($inputs, $group) = @_;

    my $ofile  = sprintf "rec%04d.bz2", $group;
    my $target = "${outputdir}${ofile}";

    # Still lazy (there are more Perl-ish ways), but system() instead of
    # backticks: there is no output to capture, and the status gets checked.
    my $status = system("/bin/cat $inputs >$target");

    if ($status == -1) {
        die "Could not run /bin/cat for $target: $!\n";
    }
    if ($status != 0) {
        my $why = ($status & 127)
            ? sprintf('was killed by signal %d', $status & 127)
            : sprintf('exited with status %d', $status >> 8);
        die "/bin/cat $why while writing $target\n";
    }

    print ".";
    return;
}

__DATA__

To make sure they were all taken in, you can do this:

bash$ bzip2 -tvv *bz2 2>&1 | grep -v ok | grep -v bz2 | wc -l
13778

...which equals the number of start blocks (13778, the same as $last above),
so I know nothing is missing now.