User:Bp/How to get a database dump

Wikia provides database dumps of MA at pages_current.xml.gz and pages_full.xml.gz, but they're usually way out of date. Here is a Perl script to build a fresh one using the MediaWiki API and Special:Export. Yes, I know that I'm parsing XML with a regex, but I don't care.
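If you want to see what Special:Export returns before running the whole thing, you can fetch a single page the same way (an illustrative one-liner, not part of the script; substitute any page title for "Spock"):

perl -MLWP::Simple -e 'getprint "http://memory-alpha.org/wiki/Special:Export/Spock"'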

If any of the xml files come back truncated (a complete export ends with a closing </mediawiki> tag; check using tail), or the request crashes with a 500 error, then you'll need to lower the value of $pages_per_xml.
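For example, a quick Perl check over all the part files (a sketch; assumes the parts sit in the current directory and are at least 64 bytes long):

foreach my $f (glob '*_hard_part*.xml') {
    open my $fh, '<', $f or next;
    seek $fh, -64, 2;            # look at the last 64 bytes only
    read $fh, my $tail, 64;
    print "$f looks truncated\n" unless $tail =~ m#</mediawiki>\s*$#;
}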

See also: Memory Alpha:Bots.


#!/usr/bin/perl
use utf8;
use warnings;
use strict;

my $wiki = 'memory-alpha.org';
#my $wiki = 'farscape.wikia.com';  # other wiki
#my $wiki = $ARGV[0];              # use the first command line parameter as the wiki, `./get_dump.pl memory-alpha.org` for example
my $api_url = 'http://'.$wiki.'/api.php';
my $export_url = 'http://'.$wiki.'/wiki/Special:Export';
my $aplimit = 500;          # number of page names in one API request; passed to the API; 500 for anon, 1000 for logged in bot
my $pages_per_xml = 10000;  # number of pages in one Special:Export request
my $current_only = 1;       # 1 = pages_current, 0 = pages_full
my @namespaces = ();        # leave empty for all
#my @namespaces = (0);            # just the main namespace
#my @namespaces = (0,4,6,10,14);  # main, ma, file, template, category

use Time::HiRes qw[time];
use LWP::UserAgent;
use LWP::ConnCache;
use HTTP::Request::Common;
use HTTP::Cookies;
use URI::Escape qw[uri_escape_utf8];
use HTML::Entities qw[decode_entities];
use Digest::MD5 qw[md5_hex];
use Encode qw[encode_utf8];

# takes a page title and returns the location of the image
sub wikia_image {
    my $page_title = shift;
    my ($fn) = $page_title =~ /^(?:Image|File):(.*)/;
    $fn = ucfirst $fn;
    $fn =~ s/ /_/g;
    my ($h1,$h2) = ( lc md5_hex encode_utf8($fn) ) =~ /^([a-z0-9])([a-z0-9])/i;
    return "$h1/$h1$h2/". uri_escape_utf8($fn);
}
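For example (any image page title works the same way):

print wikia_image('File:Example.jpg'), "\n";   # prints "x/xy/Example.jpg", where x and y are the first two hex digits of the MD5 of the underscored file name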

my $stm = time;

my $wiki_fn = $wiki;
$wiki_fn =~ s~[^\w\.\-]~_~g;    # make the wiki name safe to use in file names

my $br = LWP::UserAgent->new;
$br->conn_cache(LWP::ConnCache->new);
$br->agent("ma_dump/1.2");
$br->cookie_jar(HTTP::Cookies->new(file => $wiki_fn.'_cookies.txt', autosave => 1, ignore_discard => 1));

my %namespace_info = (0 => '');
print "Getting namespace list...\n";
my $res = $br->get( $api_url . '?action=query&meta=siteinfo&siprop=namespaces&format=xml' );
if ($res->is_success) {
    # each namespace comes back as <ns id="..">Name</ns>; the main namespace (id 0)
    # has no name, is self-closing, and is already seeded above
    my @nsi = $res->decoded_content =~ m#(<ns id="-?\d+"[^>]*>[^<]*</ns>)#g;
    foreach (@nsi) {
        my ($nsid, $nst) = m#<ns id="(-?\d+)"[^>]*>([^<]*)</ns>#;
        $namespace_info{$nsid} = $nst;
    }
} else {
    die $res->status_line." on fetch namespaces";
}
@namespaces = sort {$a <=> $b} keys %namespace_info unless @namespaces;

my @pages;
print "Getting page list...\n";
foreach my $ns (@namespaces) {
    next if $ns < 0;    # skip virtual namespaces (Media, Special)
    my $apfrom;
    do {
        my $url = $api_url
            . "?action=query&list=allpages&apnamespace=$ns&aplimit=$aplimit&format=xml"
            . ( $apfrom ? '&apfrom='.uri_escape_utf8(decode_entities $apfrom) : '' );
        undef $apfrom;
        my $res = $br->get($url);
        if ($res->is_success) {
            # each page comes back as <p pageid=".." ns=".." title=".." />
            push @pages, $res->decoded_content =~ m#<p [^>]*title="([^"]*)"#g;
            # the continuation (if any) is returned as <allpages apfrom=".." /> inside <query-continue>
            ($apfrom) = $res->decoded_content =~ m#<allpages apfrom="([^"]*)"#;
        } else {
            die $res->status_line." on $url";
        }
    } while defined $apfrom;
    print "Done with ns-$ns ($namespace_info{$ns}), now have ",scalar @pages," page(s).\n";
}
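With the defaults above, the first request for the main namespace looks like this (later requests just add &apfrom=...):

http://memory-alpha.org/api.php?action=query&list=allpages&apnamespace=0&aplimit=500&format=xml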

# cheap way to pick out the image pages and write their hashed locations to a list file
my @images = grep /^(File|Image):/, @pages;
if ( @images && open ILST, '>', sprintf('%s_image_list.txt', $wiki_fn) ) {
    print ILST wikia_image(decode_entities $_)."\n" foreach @images;
    close ILST;
}

printf "%d page(s) to fetch, %d at a time, %d part(s) expected...\n", scalar @pages, $pages_per_xml, map( int( /^\d+$/ ? $_ : $_+1 ), @pages / $pages_per_xml );

my %export_parms = (
    action  => 'submit',
    curonly => 1,
);
delete $export_parms{curonly} unless $current_only;

# POST the page titles to Special:Export in chunks of $pages_per_xml,
# saving each response straight to a numbered part file
my $part = 0;
while (@pages) {
    $part++;
    $export_parms{pages} = join("\n", map(decode_entities($_), splice(@pages, 0, $pages_per_xml) ) );
    my $req = new HTTP::Request POST => $export_url;
    $req->content_type('application/x-www-form-urlencoded');
    $req->content( join('&', map(sprintf('%s=%s', $_, uri_escape_utf8($export_parms{$_}) ), keys %export_parms) ) );
    my $xml_file = sprintf '%s_pages_%s_hard_part%03d.xml',
        $wiki_fn, $export_parms{curonly} ? 'current' : 'full', $part;
    my $res = $br->request($req, $xml_file);   # the second argument makes LWP write the response body to the file
    if ($res->is_success) {
        print "OK. $xml_file ",-s $xml_file," bytes.\n";
    } else {
        die $res->decoded_content, "\n*** ", $res->status_line, "\n";
    }
}

print time-$stm," second(s). $part parts.\n";
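Save the script as get_dump.pl (the name used in the commented-out $ARGV[0] line) and run it with no arguments; with the defaults it writes one file per 10000 pages, named like memory-alpha.org_pages_current_hard_part001.xml, plus memory-alpha.org_image_list.txt:

perl get_dump.pl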

Downloading Images
The script above will create an image list file that can be used with something like puf to fetch all the images. For example:

puf -P ./ma-images -B http://images1.wikia.nocookie.net/memoryalpha/en/images/ -i memory-alpha.org_image_list.txt
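If puf isn't handy, roughly the same thing can be done with LWP (a sketch, not part of the script above; it reuses the base URL, target directory and list file name from the puf example):

#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use File::Basename qw[dirname];
use File::Path qw[make_path];

my $base = 'http://images1.wikia.nocookie.net/memoryalpha/en/images/';
my $dir  = './ma-images';
my $br   = LWP::UserAgent->new;

open my $list, '<', 'memory-alpha.org_image_list.txt' or die $!;
while (my $rel = <$list>) {
    chomp $rel;
    next unless length $rel;
    my $out = "$dir/$rel";
    make_path(dirname $out);                                # create the hashed subdirectories
    my $res = $br->get($base.$rel, ':content_file' => $out);
    print $res->is_success ? "OK  $rel\n" : "ERR $rel (".$res->status_line.")\n";
}
close $list;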