User:Bp/How to get a database dump
From Memory Alpha, the free Star Trek reference
< User:Bp
Wikia provides database dumps of MA at pages_current.xml.gz and pages_full.xml.gz, but they're usually way out of date. Here is a Perl script to get one using the MediaWiki API and Special:Export. Yes, I know that I'm parsing XML with a regex, but I don't care.
If any of the XML files ends with "</html>" (check using tail), then you'll need to lower the value of $pages_per_xml and run the script again.
See also: Memory Alpha:Bots#Taking care of broken XML export.
#!/usr/bin/perl
use utf8;
use warnings;
use strict;
# --- Tunable settings -------------------------------------------------------
# Batch size for the page-list query; sent to the API as the aplimit parameter.
my $aplimit = 500; # passed to the API, 500 for anon, 5000 for logged in bot
# Maximum number of page titles posted to Special:Export per request; every
# request is saved as one numbered XML part file.
my $pages_per_xml = 10000;
# When true, curonly is sent to Special:Export and the part files are named
# "current"; when false, curonly is omitted and they are named "full".
my $current_only = 1; # 1 = pages_current, 0 = pages_full
use Time::HiRes qw[time];
use LWP::UserAgent;
use LWP::ConnCache;
use HTTP::Request::Common;
use HTTP::Cookies;
use URI::Escape qw[uri_escape_utf8];
use HTML::Entities qw[decode_entities];
# Start of the run (Time::HiRes::time), used for the elapsed-time report that
# is printed once everything has been fetched.
my $stm = time();

# A single user agent is shared by every request.  It announces itself as
# ma_dump/1.0, keeps connections open through a connection cache, and stores
# cookies (including discardable ones) in ma_cookies.txt, saving them
# automatically.
my $br = LWP::UserAgent->new(
    agent      => "ma_dump/1.0",
    conn_cache => LWP::ConnCache->new(),
    cookie_jar => HTTP::Cookies->new(
        file           => "ma_cookies.txt",
        autosave       => 1,
        ignore_discard => 1,
    ),
);

# Namespace numbers whose pages are listed and exported.
my @namespaces = ( 0 .. 15, 100 .. 103, 110, 111 ); # probably should fetch this list from somewhere

# Page titles gathered from the API, consumed later in batches by the export.
my @pages;
# Collect the titles of all pages in every configured namespace by walking the
# API's list=allpages query, one batch of $aplimit titles at a time.  Titles
# are pushed onto @pages exactly as they appear in the XML response (still
# entity-encoded); they are decoded later, right before being posted to
# Special:Export.  Dies on any HTTP failure.
print "Getting page list...\n";
foreach my $ns (@namespaces) {
    my $apfrom;    # continuation title reported by the previous batch, if any
    do {
        my $url = "http://memory-alpha.org/en/api.php?action=query&list=allpages&apnamespace=$ns&aplimit=$aplimit&format=xml";

        # The continuation value was scraped from an XML attribute, so it is
        # entity-encoded (&amp;, &quot;, ...) and may contain characters that
        # are special inside a query string (&, #, +, ...).  Decode it first,
        # then percent-encode it.  The test is defined() rather than plain
        # truth so that a continuation title of "0" is not silently dropped,
        # which would restart the namespace from the top and never finish.
        $url .= '&apfrom=' . uri_escape_utf8( decode_entities($apfrom) )
            if defined $apfrom;

        my $res = $br->get($url);
        die $res->status_line . " on $url" unless $res->is_success;

        # Decode the body once and reuse it for both matches.
        my $xml = $res->decoded_content;
        die "Could not decode the response body of $url" unless defined $xml;

        push @pages, $xml =~ m#<p pageid="\d+" ns="\d+" title="(.*?)" />#g;

        # When the response carries no continuation marker the match yields
        # an empty list, $apfrom becomes undef and the loop ends.
        ($apfrom) = $xml =~ m#<allpages apfrom="(.*?)" />#;
    } while defined $apfrom;
    print "Done with ns-$ns, now have ", scalar @pages, " page(s).\n";
}
# Announce the workload: total titles, batch size, and how many part files
# that implies (titles divided by batch size, rounded up to a whole number).
printf "%d page(s) to fetch, %d at a time, %d part(s) expected...\n",
    scalar @pages,
    $pages_per_xml,
    do {
        my $ratio = @pages / $pages_per_xml;
        my $whole = int $ratio;
        $whole == $ratio ? $whole : $whole + 1;
    };
# Export the collected pages through Special:Export, $pages_per_xml titles per
# request.  Each response body is streamed by LWP straight into a numbered
# part file (pages_current_hard_partNNN.xml or pages_full_hard_partNNN.xml).
# Dies on an HTTP failure or when the part file could not be written; warns
# when a part file does not look like a complete export.
my %export_parms = (
    action  => 'submit',
    curonly => 1,
);
delete $export_parms{curonly} unless $current_only;
my $part = 0;
while (@pages) {
    $part++;

    # Take the next batch off the front of @pages.  The titles were scraped
    # from XML, so turn their entities back into characters before posting.
    $export_parms{pages} = join "\n",
        map { decode_entities($_) } splice @pages, 0, $pages_per_xml;

    my $req = HTTP::Request->new( POST => 'http://memory-alpha.org/en/wiki/Special:Export' );
    $req->content_type('application/x-www-form-urlencoded');

    # Build the form body by hand so that every value is UTF-8 percent-encoded.
    # Keys are sorted only to make the request body deterministic.
    $req->content(
        join '&',
        map { sprintf '%s=%s', $_, uri_escape_utf8( $export_parms{$_} ) }
            sort keys %export_parms
    );

    my $xml_file = sprintf 'pages_%s_hard_part%03d.xml',
        ( $export_parms{curonly} ? 'current' : 'full' ), $part;

    # Passing a file name makes LWP write the response body to that file
    # instead of keeping it in memory.
    my $res = $br->request( $req, $xml_file );
    if ( !$res->is_success ) {
        die $res->decoded_content, "\n*** ", $res->status_line, "\n";
    }

    # A failure while saving the body (for example an unwritable file) does
    # not change the HTTP status; LWP reports it through the X-Died header.
    my $died = $res->header('X-Died');
    die "Saving $xml_file failed: $died\n" if defined $died;

    print "OK. $xml_file ", -s $xml_file, " bytes.\n";

    # A complete export ends with the closing </mediawiki> tag.  If the tail
    # of the file shows something else (typically an HTML page ending in
    # </html>), the batch was too large for the server; only warn, so that
    # the remaining parts are still fetched.
    if ( open my $fh, '<', $xml_file ) {
        # Whence 2 is SEEK_END; fall back to the start for very short files.
        seek( $fh, -64, 2 ) or seek( $fh, 0, 0 );
        my $tail = do { local $/; <$fh> };
        close $fh;
        warn "*** $xml_file does not end with </mediawiki>; "
            . "try lowering \$pages_per_xml\n"
            unless defined $tail && $tail =~ m{</mediawiki>\s*\z};
    }
    else {
        warn "*** cannot reopen $xml_file to verify it: $!\n";
    }
}
# Final report: elapsed wall-clock time and the number of part files written.
printf "%s second(s). %s parts.\n", time() - $stm, $part;
