use strict;
use Storable;
use LWP::UserAgent;
use HTTP::Request::Common;
use XML::Simple;
use URI::Escape;
use Data::Dumper;
use Algorithm::Diff qw(diff);
my $ua = LWP::UserAgent->new('agent' => 'HBC archive builderbot v0.1 - developing (Operated by User:HighInBC)');
my $nowiki = ('nowiki'); # So it doesn't screw up the display of the source code on wiki
my $page = 'Wikipedia:Requests for comment/User names';
my $shortcut;
$shortcut = 'WP:RFCN';
$shortcut ||= $page;
my %revisions = get_complete_history($page);
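# %revisions is keyed by revision id; each value is a hashref holding that
# revision's data (text, timestamp, comment, contributor) from the Special:Export XML.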
my(@old_content);
my($old_key);
my $day = ''; # Start empty so the first date comparison below has something to compare against
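# Walk the revisions oldest to newest and diff each one against the previous
# revision; section headings that disappear between two revisions are taken to
# be archived (closed) discussions.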
KEY: foreach my $key (sort {$a <=> $b} keys(%revisions))
{
my(@content) = split("\n",${$revisions{$key}}{'text'}{'content'});
my $timestamp = ${$revisions{$key}}{'timestamp'};
my $summary = ${$revisions{$key}}{'comment'} || ''; # Edit summary may be absent
$summary =~ s|/\*.*\*/\s*||; # Strip the /* section */ autocomment from section edits
my $user = ${$revisions{$key}}{'contributor'}{'username'} || ${$revisions{$key}}{'contributor'}{'ip'} || ''; # Anonymous edits record an IP instead of a username
my (@headings);
if (scalar(@content) && scalar(@old_content))
{
my @diffs = diff(\@old_content, \@content);
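# Each hunk from Algorithm::Diff::diff() is a list of [flag, position, text]
# triples; only '-' lines that look like == section headings == matter here.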
foreach my $ra_hunk (@diffs)
{
foreach my $ra_diff (@{$ra_hunk})
{
my($action,$content) = @{$ra_diff}[0,2];
if (($content =~ m|==\s?([^=]*)\s?==|) && ($action eq '-'))
{
my $heading = $1;
($heading =~ s|(\{\{.*:.*\}\})|<$nowiki>$1</$nowiki>|) if ($heading =~ m|\{\{.*:.*\}\}|);
push(@headings,$heading);
}
}
}
}
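# Report each revision that removed one or more headings, grouped under a bold
# date line; output is written with warn (i.e. to STDERR) in wiki markup.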
if (scalar(@headings))
{
my($date,$time) = ($timestamp =~ m|(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z|);
if ($date ne $day)
{
$day = $date;
warn "'''$day'''\n";
}
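# Link to the previous revision ($old_key), the last one that still contained the removed section(s)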
my $archive_link = "'''[{{fullurl:$shortcut|oldid=$old_key}} Archive link]'''";
if (scalar(@headings) > 1)
{
warn "* '''$time''': $archive_link - ($summary ([[User:$user|$user]])) - (".scalar(@headings)." entries)\n";
foreach my $heading (@headings)
{
warn "** $heading\n";
}
}
elsif (scalar(@headings) == 1)
{
warn "* '''$time''': $archive_link - $headings[0] - ($summary ([[User:$user|$user]]))\n";
}
}
@old_content = @content;
$old_key = $key;
}
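# Fetch the full revision history of a page through Special:Export, 100
# revisions at a time, and cache it with Storable so later runs only download
# revisions newer than the cache.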
sub get_complete_history # Add Gzip, 100 times smaller, gee where did that ratio come from??
{
mkdir('cache') unless (-d('cache'));
my $page = shift;
my(%revisions);
my $count;
my $offset;
my $fname = 'cache/'.uri_escape($page);
if (-f($fname))
{
warn "Found '$page' in cache, loading...\n";
%revisions = %{retrieve($fname)};
my(@keys) = sort {$a <=> $b} keys(%revisions);
$offset = ($revisions{$keys[scalar(@keys)-1]}{'timestamp'}); # Get timestamp of most recent revision
warn (scalar(keys(%revisions))." loaded from cache.\n");
}
else
{
warn "No cache, starting fresh.\n";
$offset = '0';
}
my $total;
GETMORE:
warn "\nDownloading as many as 100 revisions starting at ".($offset || 'the start')."\n";
my $index = 'http://en.wikipedia.org/w/index.php';
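# Ask Special:Export for up to 100 revisions of the page, starting at $offset
# (a timestamp; '0' means the very start of the history).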
my $res = $ua->request
(
POST $index."?title=Special:Export",
Content_Type => 'application/x-www-form-urlencoded',
Content => [(
'pages' => $page,
'action' => 'submit',
'submit' => 'Export',
'limit' => 100,
'offset' => $offset
)]
);
my $current = $res->content();
unless ($current =~ m|^<mediawiki|)
{
warn "Failed somehow, trying again.\n";
goto GETMORE;
}
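# The export lists revisions in chronological order, so the newest timestamp in
# this batch (the last <timestamp> in the XML) becomes the offset for the next request.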
my $ts_pos = rindex($current, '<timestamp>'); # Don't reuse $index, which already holds the index.php URL above
my $string = substr($current,$ts_pos,43); # '<timestamp>YYYY-MM-DDTHH:MM:SSZ</timestamp>' is 43 characters long
$string =~ m|<timestamp>(.+?)</timestamp>|;
$offset = $1;
my $xml_data = XMLin($current);
$count = 0;
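# XML::Simple folds multiple <revision> elements into a hash keyed by revision
# id, but leaves a single revision as a plain hashref with an 'id' field;
# both shapes are handled below.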
if (!scalar(keys(%{${$xml_data}{page}{revision}}))) {} # do nothing
elsif (${$xml_data}{'page'}{'revision'}{'id'})
{
unless ($revisions{${$xml_data}{'page'}{'revision'}{'id'}}) {$count++;$total++;}
$revisions{${$xml_data}{'page'}{'revision'}{'id'}} = ${$xml_data}{'page'}{'revision'};
}
else
{
foreach my $revision (sort {$a <=> $b} keys(%{${$xml_data}{'page'}{'revision'}}))
{
unless ($revisions{$revision}) {$count++;$total++;}
$revisions{$revision} = ${$xml_data}{'page'}{'revision'}{$revision};
}
warn Dumper($xml_data) unless ($total);
}
warn "Got $count revisions\n";
if ($count == 100)
{
warn "Still more.\n";
goto GETMORE;
}
if ($total > 0)
{
warn "Saving cache...\n";
store(\%revisions, $fname);
warn "done.\n";
}
return %revisions;
}