User:FairuseBot/Pearle.pm

From Wikipedia, the free encyclopedia

### IMPORTANT ###

# This code is released into the public domain.

### RECENT CHANGES ###

# 30 Nov 2005: Created, based off of the  12 Nov 2005 version of Pearle Wisebot
# 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
#              Simplified "limit" code, modified to take an optional parameter
#              Added "config" function as a clean interface to change internal parameters
#              Modified Wiki-access functions for use with the new "retry" function
#              Cleanup of boolean config vars to use standard Perl boolean conventions
# 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
#              Added support for proxy servers
#  8 Mar 2006: Added support for getting a user's contributions
#              Added support for retrieving logs
#              Separated out some common regex parts into variables
# 29 Mar 2006: Added protection against Unicode in URLs
#              Made thrown exceptions consistent
#              Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
# 17 May 2005: Improved log retrieval
# 12 Jul 2007: Started support for api.php
#              Log retrieval converted to api.php, added timestamps to retrieval
#              Modified to work with any wiki
#              Modified to use index.php rather than wiki.phtml
#              Converted GetLogArticles to use named parameters
# 14 Jul 2007: Modified logging to use message levels.  Removed "print" and "myPrint" functions
#  6 Aug 2007: Added the "WikiPage" class
#              Modified getPage and putPage to only work with WikiPage objects
#              Renamed to "Pearle.pm"
#              Made a proper module
# 10 Aug 2007: Changed the default XML parser, reduced memory usage when parsing
# 17 Oct 2007: Removed nullEdit() -- MediaWiki hasn't required them in some time.
#              Modified getCategoryArticles, getCategoryImages, and getSubcategories to use api.php
# 21 Oct 2007: Proper Unicode support
# 29 Oct 2007: Made edit summaries mandatory
# 23 Mar 2008: Changed "minor" flag from text to boolean
# 29 Mar 2008: Improved UTF-8 support

# Errors thrown by this package always begin with a three-digit number
#     4xx: HTTP client errors
#     505: Server error: HTTP version not supported
#     509: Server error: Bandwidth exceeded
#
#     900: Unspecified internal error.
#     901: Library not initialized.  You didn't call Pearle::init() before calling this function.
#     902: Parameter error.  You made a function call, but forgot a mandatory parameter, or provided an invalid one.
#
#     920: Unexpected response.  The MediaWiki site returned something unexpected.
#     921: Unexpected logout.  The MediaWiki site logged us out unexpectedly.
#     922: Edit conflict.  Someone edited the article while we were.
#     923: Deleted article conflict.  Someone deleted the article while we were editing.
#     924: Spam filter.  A link in the page tripped the spam filter.
#     925: Protected page.  The page is protected, and the bot doesn't have the rights to edit it.

package Pearle;

use strict;
use warnings;

use Time::HiRes;
use Encode;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;
use XML::Simple;
use Data::Dumper;               # For debugging
use URI::Escape;

use Pearle::WikiPage;

# Standard regex parts
$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)';                           # Match and capture a Wikipedia timestamp
$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d';                                          # Match a Wikipedia timestamp
$Pearle::regex_timestamp_ISO = '(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)';     # Match and capture a timestamp of the form 2007-07-13T04:21:39Z
$Pearle::regex_timestamp_ISO_nc = '\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d';              # Match a timestamp of the form 2007-07-13T04:21:39Z

#<a href="/w/index.php?title=User:Angel_dunn&action=edit" class="new" title="User:Angel dunn">
#<a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">
$Pearle::regex_pagelink = '<a href="[^"]*"(?: class="new"|) title="([^"]*)">';    # Match and capture any page
$Pearle::regex_redpagelink = '<a href="[^"]*" class="new" title="([^"]*)">';      # Match and capture nonexistant pages only
$Pearle::regex_bluepagelink = '<a href="[^"]*" title="([^"]*)">';                           # Match and capture existing pages only
$Pearle::regex_pagelink_nc = '<a href="[^"]*"(?: class="new"|) title="[^"]*">';   # Match any page
$Pearle::regex_redpagelink_nc = '<a href="[^"]*" class="new" title="[^"]*">';     # Match nonexistant pages only
$Pearle::regex_bluepagelink_nc = '<a href="[^"]*" title="[^"]*">';                          # Match existing pages only

# Standard MediaWiki namespaces
@Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk");

$Pearle::logfile = "";
$Pearle::_inited = 0;
$Pearle::username = "";
$Pearle::password = "";
$Pearle::speedLimit = 10;       # Seconds to wait by default when limit() is called
$Pearle::_speedMult = 1;        # Multiplier for default wait time if the wiki is being slow
$Pearle::roughMode = 0;         # Ignore most errors
$Pearle::nullOK = 0;            # Permit editing non-existent pages
$Pearle::sanityCheck = 0;       # Sanity checking on edits
$Pearle::loglevel = 2;          # Level of message to write to file
$Pearle::printlevel = 3;        # Level of message to print to stdout
$Pearle::logoutOK = 0;          # Permit editing while logged out
$Pearle::proxy = undef;         # Proxy to use
$Pearle::wiki = 'http://en.wikipedia.org/w/';   # URL of the directory containing index.php and api.php
$Pearle::pagebase = 'http://en.wikipedia.org/wiki/';    # Base URL for accessing pages.  May change depending on mod_rewrite rules
$XML::Simple::PREFERRED_PARSER = "XML::Parser";               # Much faster than the default XML::SAX parser
$Pearle::xml_parser = XML::Simple->new();


########## Accessors #########################################################

sub getXMLParser
{
        return $Pearle::xml_parser;
}


########## Other functions ###################################################

# This must be the first function from the library called
sub init
{
        $Pearle::username = $_[0] or die("902 No username provided!\n");
        $Pearle::password = $_[1] or die("902 No password provided!\n");
        $Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");
        $Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");
        $Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2";
        

        $Pearle::ua = LWP::UserAgent->new(timeout => 300);
        $Pearle::ua->agent($Pearle::useragent);
        $Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));
        $Pearle::ua->cookie_jar->load();

        $Pearle::roughMode = "no";

        # Hot pipes
        $| = 1;
        
        $Pearle::_inited = 1;
}

sub config
{
        my %params = @_;
        $Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));
        $Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));
        $Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));
        $Pearle::loglevel = $params{loglevel} if(defined($params{loglevel}));
        $Pearle::printlevel = $params{printlevel} if(defined($params{printlevel}));
        $Pearle::logfile = $params{logfile} if(defined($params{logfile}));
        $Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));
        $Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));
        if(defined($params{wiki}) and $params{wiki} ne $Pearle::wiki)
        {
                $params{wiki} .= '/' if($params{wiki} !~ /\/$/);   # Add a trailing slash if needed
                $Pearle::wiki = $params{wiki};
        }
        $Pearle::pagebase = $params{pagebase} if(defined($params{pagebase}));
        
        if(exists($params{proxy}))
        {
                if(defined($params{proxy}))
                {
                        myLog(3, "Proxying: $params{proxy}\n");
                        $Pearle::ua->proxy('http', $params{proxy});
                        $Pearle::proxy = $params{proxy};
                }
                else
                {
                        myLog(3, "Not proxying\n");
                        $Pearle::ua->no_proxy();
                        $Pearle::proxy = undef;
                }
        }
}

# Logging levels:
# 0: Immediately fatal errors.  Call to myLog will be followed by a call to die()
# 1: Non-fatal errors.  The library can recover, turn the function call into a no-op, and return an error indicator
# 2: Serious warning.  The library can still complete the action
# 3: Status messages.  Messages useful for tracing library execution.
# 4: Debugging messages.
sub myLog
{
        my $level = shift;
        my @message = @_;

        if($level <= $Pearle::loglevel)
        {
                die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

                open (LOG, ">>:utf8", $Pearle::logfile) || die "900 Could not append to log!";
                print LOG $_[0];
                close (LOG);
        }
        
        if($level <= $Pearle::printlevel)
        {
                print @message;
        }
}

# Rate-limiting.  Can be sensibly run even if libPearle isn't initialized
sub limit
{
        my ($i);
        $i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));
        $i = 10 if($i < 10);

        # Rate-limiting to avoid hosing the wiki server
        # Min 30 sec unmarked
        # Min 10 sec marked
        # May be raised by retry() if load is heavy

        ### ATTENTION ###
        # Increasing the speed of the bot to faster than 1 edit every 10
        # seconds violates English Wikipedia rules as of April, 2005, and
        # will cause your bot to be banned.  So don't change $normalDelay
        # unless you know what you are doing.  Other sites may have
        # similar policies, and you are advised to check before using your
        # bot at the default speed.
        #################

        while ($i >= 0)
        {
                sleep (1);
                myLog(3, "Sleeping $i seconds...\r");
                $i--;
        }
        myLog(4, "                                   \r");
}

sub login 
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my $res = $Pearle::ua->post(
                "${Pearle::wiki}index.php?title=Special:Userlogin&action=submitlogin",
                Content => [
                        wpName         => $Pearle::username,
                        wpPassword     => $Pearle::password,
                        wpRemember     => 1,
                        wpLoginAttempt => 1
                ]
        );

        if( 302 == $res->code )
        {
                myLog(3, "Logged in as $Pearle::username\n");
                # This may or may not actually work
                $Pearle::ua->cookie_jar->save();
                return 1;
        }
        else
        {
                myLog(3, "Login failed\n");
                myLog(3, "Code: ".$res->code."\n");
                return 0;
        }
}

sub logout {
        my $res = $Pearle::ua->post(
                "${Pearle::wiki}index.php?title=Special:Userlogout",
        );

        return 1;
}

sub checkLogin
{
        my ($reply_text);
        $reply_text = $_[0];
        
        if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))
        {
                # We've lost our identity.
                myLog(0, "Wiki server is not recognizing me (1).\n---\n${reply_text}\n---\n");
                die ("921 Wiki server is not recognizing me (1).\n");
        }
}

# Make an HTTP request, performing basic error checking and handling.  Suitable for use with the "retry" function
sub httpRequest
{
        my ($request, $response, $attemptStartTime, $attemptEndTime);
        $request = $_[0];
        
        $response = $Pearle::ua->request($request);

        # Monitor wiki server responsiveness
        $attemptStartTime = Time::HiRes::time();

        if ($response->is_success or $response->is_redirect)
        {
                return $response
        } 
        else 
        {
                # 50X HTTP errors mean there is a problem connecting to the wiki server.  Can be remedied by waiting and trying again
                if (500 <= $response->code and 504 >= $response->code)
                {
                        myLog(2, "HTTP ERR (".$response->status_line.")\n");
                        die("retry:".$response->status_line);
                }
                else
                {
                        # Unhandled HTTP response.  Waiting probably won't fix it
                        myLog(0, "HTTP ERR (".$response->status_line.")\n".$response->decoded_content."\n");
                        die($response->status_line."\n");
                }
        }
        # Monitor wiki server responsiveness
        $attemptEndTime = Time::HiRes::time();

        if($request->method() eq "POST")
        {
                if (($attemptEndTime - $attemptStartTime) > 20)
                {
                        $Pearle::_speedMult = 60;

                        myLog(3, "Wiki is very slow.  Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
                }

                # If the response time is between 10 and 20 seconds...
                elsif (($attemptEndTime - $attemptStartTime) > 10)
                {
                        $Pearle::_speedMult = 6;

                        myLog(3, "Wiki is somewhat slow.  Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
                }

                # If the response time is less than 10 seconds...
                else
                {
                        if ($Pearle::_speedMult != 1)
                        {
                                $Pearle::_speedMult = 1;

                                myLog(3, "Returning to normal minimum wait time.\n");
                        }
                }
        }
}

# Check out a page for editing.
sub getPage
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my ($target, $request, $response, $reply, $text, $text2,
        $editTime, $startTime, $attemptStartTime, $attemptFinishTime,
        $token, $targetSafe);

        $target = $_[0];

        if ($target =~ m/^\s*$/)
        {
                myLog(0, "getPage: Null target.");
                die("902 getPage: Null target.");
        }

#       print join ",", (map {ord($_);} split( //, $target));
        $targetSafe = URI::Escape::uri_escape_utf8($target);

        # Create a request-object
        myLog(3, "GET ${Pearle::wiki}index.php?title=${targetSafe}&action=edit\n");
        $request = HTTP::Request->new(GET => "${Pearle::wiki}index.php?title=${targetSafe}&action=edit");
        $response = startRetry(\&httpRequest, $request);

        $reply = $response->decoded_content;

        # This detects whether or not we're logged in.
        checkLogin($reply);

        # Check for blocking
        if($reply =~ /<h1 class="firstHeading">User is blocked<\/h1>/)
        {
                myLog(0, "Blocked\n");
                die("900 Blocked");
        }
        
        $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
        $text = $1;

        $reply =~ m/value="(\d+)" name="wpEdittime"/;
        $editTime = $1;

        # Added 22 Aug 2005 to correctly handle articles that have
        # been undeleted
        $reply =~ m/value="(\d+)" name="wpStarttime"/;
        $startTime = $1;

        # Added 9 Mar 2005 after recent software change.
        $reply =~ m/value="([^"]+)" name="wpEditToken"/;
        $token = $1;
        ###

        if (($text =~ m/^\s*$/) and !$Pearle::nullOK)
        {
                myLog (1, "getPage($target): Null text!\n");
                myLog (1, "\n---\n$reply\n---\n");
                if ($Pearle::roughMode)
                {
                        return;
                }
                else
                {
                        die ("920 getPage($target): Null text!\n");
                }
        }

        if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK)
        {
                myLog(0, "getPage($target): Null time!\n");
                myLog(0, "\n---\n$reply\n---\n");
                die ("920 getPage($target): Null time!\n");
        }

        if (($text =~ m/>/) or ($text =~ m/</))
        {
                myLog(4, $text);
                myLog(0, "\n---\n$text\n---\n");
                myLog(0, "getPage($target): Bad text suck!\n");
                die ("920 getPage($target): Bad text suck!\n");
        }

        # Change ( " -> " ) etc
        # This function is from HTML::Entities.
        decode_entities($text);

        # This may or may not actually work
        $Pearle::ua->cookie_jar->save();

        return Pearle::WikiPage->new(text => $text, editTime => $editTime, startTime => $startTime, editToken => $token, title => $target);
}


sub postPage
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my ($request, $response, $page, $pageName, $textToPost, $summaryEntry,
        $editTime, $startTime, $actual, $expected, $date, $editToken,
        $minor, $pageNameSafe);

        $page = $_[0];

        $pageName = $page->getTitle();
        $editTime = $page->getEditTime();
        $startTime = $page->getStartTime();
        $editToken = $page->getEditToken();
        $textToPost = $page->getWikiText();

        $summaryEntry = $_[1]; # Max 200 chars!
        $minor = $_[2];

        $summaryEntry = substr($summaryEntry, 0, 200);
        
        if(!defined($minor))
        {
                myLog(0, "postPage(): Not enough parameters.\n");
                die "902 postPage(): Not enough parameters!\n";
        }
        
        if(!$page->isa("Pearle::WikiPage"))
        {
                myLog(0, "postPage(): First parameter is not a WikiPage object\n");
                die "902 postPage(): First parameter is not a WikiPage object\n";
        }

        if ($summaryEntry eq "")
        {
                myLog(0, "postPage(): No edit summary provided\n");
                die "902 postPage(): No edit summary provided\n";
        }

        $pageNameSafe = URI::Escape::uri_escape_utf8($pageName);

        if ($minor)
        {
                $request = POST "${Pearle::wiki}index.php?title=${pageNameSafe}&action=submit",
                [wpTextbox1 => encode("utf8", $textToPost),
                wpSummary => encode("utf8", $summaryEntry),
                wpSave => "Save page",
                wpMinoredit => "on",
                wpEditToken => $editToken,
                wpStarttime => $startTime,
                wpEdittime => $editTime];
                # Optional: wpWatchthis
        }
        else
        {
                $request = POST "${Pearle::wiki}index.php?title=${pageNameSafe}&action=submit",
                [wpTextbox1 => encode("utf8", $textToPost),
                wpSummary => encode("utf8", $summaryEntry),
                wpSave => "Save page",
                wpEditToken => $editToken,
                wpStarttime => $startTime,
                wpEdittime => $editTime];
                # Optional: wpWatchthis, wpMinoredit
        }

        # ---
        ## If posts are failing, you can uncomment the below to see what
        ## HTTP request is being made.
        # myLog($request->as_string());
        # print $request->as_string();       $::speedLimit = 60 * 10;
        # print $::ua->request($request)->as_string;
        # ---

        myLog(3, "POSTing...");
        # Pass request to the user agent and get a response back
        $response = startRetry(\&httpRequest, $request);
        myLog(3, "POSTed.\n");


        if ($response->decoded_content =~ m/Please confirm that really want to recreate this article./)
        {
                myLog(0, $response->decoded_content."\n");
                die ("923 Deleted article conflict! See log!");
        }


        # Check the outcome of the response
        $response->code;
        if ($response->code != 302 and $response->code  != 200)
        {
                myLog(0, "postPage(${pageName}, $editTime)#1 - expected =! actual\n");
                myLog(0, $request->as_string());
                myLog(0, "EXPECTED: 302'\n");
                myLog(0, "  ACTUAL: '" . $response->status_line . "'\n");

                if ($Pearle::roughMode eq "yes")
                {
                        return;
                }
                else
                {
                        die ("920 postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
                }
        }

        $expected = "${Pearle::pagebase}${pageName}";

        # Convert the returned address to canonical form
        $actual = $response->headers->header("Location");
        if(!defined($actual))
        {
                my $content = $response->decoded_content;
                if($content =~ /<p>The following text is what triggered our spam filter: (.*)/)
                {
                        my $badurl = $1;
                        myLog(0, "Spam filter blocked edit: $badurl\n");
                        die("924 Spam filter: $badurl\n");
                }
                else
                {
                        myLog(1, "Return problem:\n\n");
                        myLog(1, $content);
                        myLog(1, "\n\n");
                }
        }
        else
        {
                $actual = uri_unescape($actual);
                $actual = decode("utf8", $actual);
                $actual =~ s/_/ /g;
        }
        
        $expected =~ s/_/ /g;

        if (($expected ne $actual) and ($Pearle::roughMode ne "yes") 
            and !(($actual eq "") and ($response->code == 200)))
        {
                myLog(0, "postPage(${pageName}, $editTime)#2 - expected =! actual\n");
                myLog(0, "EXPECTED: '${expected}'\n");
                myLog(0, "  ACTUAL: '${actual}'\n");
                die ("920 postPage(${pageName}, ${actual})#2 - expected =! actual - see log\n");
        }


        if ($response->decoded_content =~ m/<h1 class="firstHeading">Edit conflict/)
        {
                myLog(0, "Edit conflict on '$pageName' at '$editTime'!\n");
                die ("922 Edit conflict on '$pageName' at '$editTime'!\n");
        }
        
        if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i) # Check for accidental creation of a talkpage without a mainpage.  Only works with bots using the "monobook" skin.
        {
                # Monobook:<li id="ca-nstab-main" class="new"><a href="/w/index.php?title=Kjsahfjrf&action=edit">Article</a></li> 
                # Classic: <br /><a href="/w/index.php?title=Kjsahfjrf&action=edit" class="new" title="Kjsahfjrf">View article</a>
                if($response->decoded_content =~ /<li id="ca-nstab-[^"]" class="new">/)
                {
                        myLog(0, "postPage(${pageName}) - Talkpage without article!\n");
                        die ("920 postPage(${pageName}) - Talkpage without article!\n");
                }
        }
        
        if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/)        # Check for user talkpage for non-existant user
        {
                if($response->decoded_content !~ /User contributions/)
                {
                        myLog(0, "postPge(${pageName}) - User talkpage for non-existant user!\n");
                        die ("920 postPge(${pageName}) - User talkpage for non-existant user!\n");
                }
        }

        $Pearle::ua->cookie_jar->save();
        return ($response->decoded_content);
}


# Get a list of the contents in a given category, filtered by namespace
sub getCategoryContents
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my ($target, $category_offset, @articles, $xml, $parsed_xml, %query_params,
            $numberOfArticles, @namespaces);


        $target = shift;
        
        @namespaces = @_;

        # Category: prefix is mandatory
        if($target !~ /^[Cc]ategory:/)
        {
                $target = "Category:" . $target;
        }
        
        $query_params{list} = 'categorymembers';
        $query_params{cmprop} = 'title';
        $query_params{cmtitle} = $target;
        $query_params{cmlimit} = 500;   # If you're a flagged bot, this could be 5000, but we default to 500 for compatibility and to keep memory usage down
        
        foreach my $namespace (@namespaces)
        {
                $query_params{cmnamespace} .= "${namespace}|";
        }
        if(exists($query_params{cmnamespace}) and defined($query_params{cmnamespace}))
        {
                chop $query_params{cmnamespace};
        }
                
        do
        {
                $xml = APIQuery(%query_params);

                if(!defined($xml))
                {
                        myLog(0, "Unknown error accessing log\n");
                        die "900 Unknown error accessing log";
                }

                $parsed_xml = $Pearle::xml_parser->XMLin($xml);

                myLog(4, Dumper($parsed_xml));
                $xml = undef;

                if(exists($parsed_xml->{query}->{categorymembers}->{cm}) and defined($parsed_xml->{query}->{categorymembers}->{cm}))
                {
                        if(ref($parsed_xml->{query}->{categorymembers}->{cm}) eq 'ARRAY')# Is an array
                        {
                                foreach my $item (@{$parsed_xml->{query}->{categorymembers}->{cm}})
                                {
                                        push @articles, $item->{title};
                                }
                        }
                        else
                        {
                                # Only one item.  The XML parser won't convert it into an array.
                                my $item  = $parsed_xml->{query}->{categorymembers}->{cm};
                                push @articles, $item->{title};
                        }
                }
                
                print Dumper($parsed_xml->{'query-continue'});
                print "\n";
                if(exists($parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue}))
                {
                        $category_offset = $parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue};
                        $category_offset =~ s/&/%26/;
                        $query_params{cmcontinue} = $category_offset;
                }
                else
                {
                        $category_offset = undef;
                }

                sleep (1); # Throttle GETs
        }
        while(defined($category_offset));

        $numberOfArticles = scalar(@articles);
        myLog(4, "Got $numberOfArticles articles.\n");

        return @articles;
}

sub getCategoryArticles
{
        return getCategoryContents($_[0], 0);   # Namespace 0: Articles
}

sub getCategoryImages
{
        return getCategoryContents($_[0], 6);   #Namespace 6: Images
}

sub getSubcategories
{
        return getCategoryContents($_[0], 14);  # Namespace 14: Categories
}

# Get up to $max most recent articles edited by a user
sub getUserArticles
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my ($url, $request, $response, $reply, @contribs,
            $target, $namespace, $max, $offset);
        
        $target = $_[0];
        $max = $_[1];
        $offset = $_[2];
        $namespace = namespaceToNumber($_[3]);

        # Create a request-object
        if(defined($namespace))
        {
                $url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";
        }
        else
        {
                $url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";
        }

        myLog(3, "GET $url\n");
        $request = HTTP::Request->new(GET => "$url");
        $response = startRetry(\&httpRequest, $request);
        $reply = $response->decoded_content;

        # This detects whether or not we're logged in.
        checkLogin($reply);
                
        # Extract the contributions
        # <li>23:18, 6 March 2006 (<a href="/w/index.php?title=User_talk:OrphanBot&action=history" title="User talk:OrphanBot">
        while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g)
        {
                push @contribs, $1;
        }
        
        # Remove duplicates     
#       @contribs = uniquify(@contribs);
        return @contribs;
}

# Gets a list of (articles, actor, summary, timestamp) tuples from the specified log (upload, delete, move, protect).  The list is sorted by timestamp
# with the newest entry first
#
# Takes the following named parameters:
#       user: Filter "actor" to include only actions by this user
#       log: Filter to include only actions in this log (upload, delete, move, protect).
#       limit: Include this many items.  Defaults to 50 items.
#       time: Start checking the log at this time.  Timestamp in ISO 8601 format.
#       dir: Check the log in this direction (newer or older) from the timestamp
sub getLogArticles
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

        my %params = @_;
        my %query_params = (list => 'logevents', lelimit => 50);
        my @articles = ();

        foreach my $key (keys(%params))
        {
                if($key eq 'user')
                {
                        $query_params{leuser} = $params{user};
                        $query_params{leuser} =~ s/^User://i;   # Strip namespace prefix, if it's there.
                }
                elsif($key eq 'log')
                {
                        $query_params{letype} = $params{log};
                }
                elsif($key eq 'limit')
                {
                        $query_params{lelimit} = $params{limit};
                }
                elsif($key eq 'time')
                {
                        $query_params{lestart} = $params{time};
                }
                elsif($key eq 'dir')
                {
                        $query_params{ledir} = $params{dir};
                }
                else
                {
                        myLog(2, "Error: Unknown parameter $key in getLogArticles\n");
                }
        }

        my $xml = APIQuery(%query_params);
        
        if(!defined($xml))
        {
                myLog(0, "Unknown error accessing log\n");
                die "920 Unknown error accessing log";
        }
        
        my $parsed_xml;
#       eval
#       {
                $parsed_xml = $Pearle::xml_parser->XMLin($xml);
#       }
#       if($@ =~ /not well-formed/)
#       {
#               # Retry with the XML::SAX parser
#       }

        myLog(4, Dumper($parsed_xml));
        $xml = undef;

        if(exists($parsed_xml->{query}->{logevents}->{item}) and defined($parsed_xml->{query}->{logevents}->{item}))
        {
                if(ref($parsed_xml->{query}->{logevents}->{item}) eq 'ARRAY')# Is an array
                {
                        foreach my $item (@{$parsed_xml->{query}->{logevents}->{item}})
                        {
                                push @articles, [$item->{title}, $item->{user}, $item->{comment}, $item->{timestamp}];
                        }
                        $parsed_xml = undef;
                }
                else
                {
                        # Only one item.  The XML parser won't convert it into an array.
                        my $item  = $parsed_xml->{query}->{logevents}->{item};
                        push @articles, [$item->{title}, $item->{user}, $item->{comment}, $item->{timestamp}];
                }
                
                @articles = uniquify_ref(0, @articles);
                @articles = sort {$b->[3] cmp $a->[3]} @articles;
        }
        return @articles;
}

# Use the api.php interface to query the wiki
#
# Takes a hash of parameter,value pairs
#
# Returns raw the XML blob from the wiki, or undef on error
sub APIQuery
{
        my %params = @_;
        my $url = "${Pearle::wiki}api.php?action=query&format=xml";
        my $reply = undef;
        
        foreach my $key (keys(%params))
        {
                if(ref($params{$key}) eq 'ARRAY')       # We've got a list of values
                {
                        my $val_list = join '|', @{$params{$key}};
                        $val_list =~ s/&/%26/g;
                        $key =~ s/&/%26/g;
                        $url .= "&${key}=$val_list";
                }
                else
                {
                        my $val = $params{$key};
                        $key =~ s/&/%26/g;
                        $val =~ s/&/%26/g;
                        $url .= "&${key}=${val}";
                }
        }
        
        myLog(3, "API query: $url\n");
        
        $url = encode("utf8", $url);

        my $request = HTTP::Request->new(GET => "$url");
        my $response = startRetry(\&httpRequest, $request);
        if($response->is_success)
        {
                $reply = $response->decoded_content;
                if(!defined($reply))
                {
                        myLog(1, "Failed to decode response\n");
                        $reply = decode("utf8", $response->content);
                }
                if($reply =~ /<error code="([^"]*)"/)
                {
                        # Format error
                        myLog(1, "Error $1 querying server\n");
                        $reply = undef;
                }
        }
        else
        {
                myLog(1, "HTTP error accessing server\n");
                $reply = undef;
        }
        
        return $reply;
}

# Use the Special:Export interface to get the wikitext of one or more articles
sub Export
{
        my ($request, $response, $reply, $articles);
        
        $articles = join "\n", @_;
        
        $request = POST "http://en.wikipedia.org/w/index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];
        $response = startRetry(\&httpRequest, $request);
        $reply = $response->decoded_content;

        return $reply;
}

# Get the history of an article and parse the first 500 entries into a list of [link day month year] lists
sub parseHistory
{
    my ($pageName, $html, @lines, $line, $date, $hour, $minute, $day, $month, $year,
        $htmlCopy, $link, @result);

    $pageName = $_[0];
        $pageName = URI::Escape::uri_escape_utf8($pageName);
        
    $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500");

    $htmlCopy = $html;

    $html =~ s%^.*?<ul id="pagehistory">%%s;
    $html =~ s%(.*?)</ul>.*$%$1%s;

    @lines = split ("</li>", $html);
    foreach $line (@lines)
    {
                $line =~ s/\n/ /g;

                if ($line =~ m/^\s*$/)
                {
                next;
                }
                $line =~ s/<span class='user'>.*?$//;
                $line =~ s/^.*?Select a newer version for comparison//;
                $line =~ s/^.*?Select a older version for comparison//;
                $line =~ s/^.*?name="diff" \/>//;
                
                $line =~ m%<a href="(.*?)" title="(.*?)">$Pearle::regex_timestamp</a>%;
                $link = $1;
                $hour = $3;
                $minute = $4;
                $day = $5;
                $month = $6;
                $year = $7;

                push @result, [$link, $day, $month, $year];
    }
    
    return (@result);
}


sub getURL #($target)
{
        die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
    # Read throttle!
    sleep (1);

    my ($request, $response, $reply, $url);
    
    $url = $_[0];

    # Create a request-object
    myLog(3, "GET ${url}\n");
    $request = HTTP::Request->new(GET => "${url}");
    $response = startRetry(\&httpRequest, $request);

        $reply = $response->decoded_content;
        
        # This may or may not actually work
        $Pearle::ua->cookie_jar->save();

        return ($reply);
}


# Retries a given function repeatedly, with an exponential backoff rate
# The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried
sub startRetry
{
        my ($call_fn, @args) = @_;
        return retry($Pearle::speedLimit, $call_fn, @args);
}

sub retry
{
        my ($call_fn, @args, $delay, @result, $result);
        
        ($delay, $call_fn, @args) = @_;
        
        if(wantarray())
        {
                @result = eval{ $call_fn->(@args) };
                if($@ =~ /^retry:/i)
                {
                        limit($delay);
                        @result = retry($delay * 2, $call_fn, @args);
                }
                elsif($@)
                {
                        die;
                }
                return @result;
        }
        else
        {
                $result = eval{ &{$call_fn}(@args) };
                if($@ =~ /^retry:/i)
                {
                        limit($delay);
                        $result = retry($delay * 2, $call_fn, @args);
                }
                elsif($@)
                {
                        die;
                }
                return $result;
        }
}


sub namespaceToNumber
{
        my $namespace = $_[0];
        my $i = 0;
        my $name;
        if(defined($namespace))
        {
                foreach $name (@Pearle::namespaces)
                {
                        return $i if(lc($name) eq lc($namespace));
                        $i++;
                }
        }
        else
        {
                return undef;
        }
}

sub numberToNamespace
{
        my $i = shift;
        if(defined($i))
        {
                return $Pearle::namespaces[$i];
        }
        else
        {
                return undef;
        }
}


# Translate from HTTP URL encoding to the native character set.
sub urlDecode
{
        my ($input);

        $input = $_[0];

        $input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

        return ($input);
}

sub decodeArray
{
        return map {urlDecode($_)} @_;
}

# Remove duplicates from a list
sub uniquify
{
        my @list = @_;
        @list = sort @list;
        my $last = undef;
        my @new_list;
        my $item;
        
        foreach $item (@list)
        {
                push @new_list, $item if(!defined($last) or ($item ne $last));
                $last = $item;
        }
        return @new_list;
}

# Remove duplicates from a list of array references, grouping on the specified subelement
sub uniquify_ref
{
        my $element = shift;
        my @list = @_;
        @list = sort {$a->[$element] cmp $b->[$element]} @list;
        my $last = undef;
        my @new_list;
        my $item;

        foreach $item (@list)
        {
                push @new_list, $item if(!defined($last) or ($item->[$element] ne $last));
                $last = $item->[$element];
        }
        return @new_list;
}

#1;