From Wikipedia, the free encyclopedia
#!/usr/bin/perl
# libBot: A Perl module of useful routines for running a bot
package libBot;
use strict;
use warnings;
use Pearle;
use Data::Dumper;
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(config usernotify wikilog botwarnlog notelog LoadInfoboxPatterns FixupLinks MakeWikiRegex DoIHaveMessages GetPageCategories GetPageLinks GetPageText GetPageList GetFullPageList SaveImage UpdateLink RemoveImageFromPage IsNotified isDated getDate getUploadDates getLastEditDate GetImageUploader loadNotificationList saveNotificationList);
our $VERSION = 1.00;
my $test_only = 0;
my $username = "";
my @infobox_patterns = ();
sub config
{
my %params = @_;
$test_only = $params{test_only} if(defined($params{test_only}));
$username = $params{username} if(defined($params{username}));
}
# Log a warning on a user's talkpage, using an existing edit session
sub usernotify
{
my ($wikipage, $text, $user, $summary);
$wikipage = $_[1];
$summary = $_[2];
$summary = "Logging warning message" if(!defined($summary));
# We've been handed an editing session
Pearle::myLog(4, "Warning with existing edit session\n");
if($test_only)
{
print STDERR $_[0];
return;
}
if($wikipage->getWikiText() =~ /^#redirect/i)
{
botwarnlog("*User talk page [[User talk:$user]] is a redirect\n");
return;
}
$text = $wikipage->getEditableText();
$text .= $_[0];
$wikipage->setEditableText($text);
Pearle::postPage($wikipage, $summary, 0);
print STDERR $_[0];
}
# General-purpose on-Wiki logging routine
sub wikilog
{
my($target, $text, $wikipage, $summary);
$target = $_[0];
$summary = $_[2] || "Logging note";
$wikipage = Pearle::getPage($target);
if($test_only)
{
print STDERR $_[1];
return;
}
$text = $wikipage->getEditableText();
$text .= $_[1];
$wikipage->setEditableText($text);
Pearle::postPage($wikipage, $summary, 0);
print STDERR $_[1];
}
# Log a warning on the talk page of the bot
sub botwarnlog
{
my ($page, $text, $summary);
$text = $_[0];
$summary = $_[1];
$summary = "Logging warning message" if(!defined($summary));
$page = "User talk:${username}/log";
wikilog($page, $text, $summary);
}
# Log a notification message to the console
sub notelog
{
print STDERR @_;
}
# Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
my $link = shift;
$link =~ s/\[\[(Category|Image)/[[:$1/g;
return $link;
}
# Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
my $string = shift;
my @chars = split //, $string;
my $result = '';
foreach my $char (@chars)
{
# Escape metacharacters, and add percent-encoding for certain characters
if($char eq '\\') {$result .= '\\\\';}
elsif($char eq '.') {$result .= '\.';}
elsif($char eq '(') {$result .= '(?:\(|%28)';}
elsif($char eq ')') {$result .= '(?:\)|%29)';}
elsif($char eq '[') {$result .= '\[';}
elsif($char eq ']') {$result .= '\]';}
elsif($char eq '+') {$result .= '\+';}
elsif($char eq '*') {$result .= '\*';}
elsif($char eq '?') {$result .= '(?:\?|%3F)';}
elsif($char eq '^') {$result .= '\^';}
elsif($char eq '$') {$result .= '\$';}
elsif($char eq '&') {$result .= '(?:&|%26)';}
elsif($char eq '!') {$result .= '(?:!|%21)';}
elsif($char eq '~') {$result .= '(?:~|%7E)';}
elsif($char eq "'") {$result .= "(?:'|%27)";}
elsif($char eq '"') {$result .= '(?:"|%22)';}
elsif($char eq ',') {$result .= '(?:,|%2C)';}
else {$result .= $char;}
}
# Process the string to match both with spaces and with underscores
$result =~ s/[ _]/[ _]+/g;
# Process the string to match both upcase and lowercase first characters
if($result =~ /^[A-Za-z]/)
{
$result =~ s/^(.)/"[$1".lc($1)."]"/e;
}
return $result;
}
sub HTMLEncode
{
my $char = shift;
return sprintf("&X%X;", ord($char));
}
# Make a string into something that can match most image name formats
sub MakeFancyRegex
{
my $string = shift;
my @chars = split //, $string;
my $result;
foreach my $char (@chars)
{
if($char eq '\\')
{
$result .= "(\\\\|%5C|%5c|&x5C;)";
}
elsif($char eq '.')
{
}
elsif($char eq '(')
{
}
elsif($char eq ')')
{
}
else
{
$result .= "($char|" . uri_escape_utf8($char) . "|" . lc(uri_escape_utf8($char)) . "|" . HTMLEncode($char) . "|" . lc(HTMLEncode($char)) . ")";
}
}
return $result;
}
# Check for new talk page messages
sub DoIHaveMessages
{
my $xml = shift;
my $parsed_xml = Pearle::getXMLParser()->XMLin($xml);
if(exists($parsed_xml->{query}->{userinfo}->{messages}) and defined($parsed_xml->{query}->{userinfo}->{messages}))
{
return 1;
}
else
{
return 0;
}
}
sub GetPageCategories
{
my $image_data = shift;
my @pages = ();
if(defined($image_data))
{
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) and defined($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}) eq 'ARRAY')
{
my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{categories}->{cl}};
@pages = map {$_->{title}} @all_pages;
}
else
{
@pages = ($parsed_xml->{query}->{pages}->{page}->{categories}->{cl}->{title});
}
}
}
return @pages;
}
sub GetPageLinks
{
my $image_data = shift;
my @pages = ();
if(defined($image_data))
{
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) and defined($parsed_xml->{query}->{pages}->{page}->{links}->{pl}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{links}->{pl}) eq 'ARRAY')
{
my @all_pages = @{$parsed_xml->{query}->{pages}->{page}->{links}->{pl}};
@pages = map {$_->{title}} @all_pages;
}
else
{
@pages = ($parsed_xml->{query}->{pages}->{page}->{links}->{pl}->{title});
}
}
}
return @pages;
}
sub GetPageText
{
my $image_data = shift;
my $text = undef;
if(defined($image_data))
{
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}))
{
$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev};
}
}
return $text;
}
sub GetPageList
{
my $image_data = shift;
my $image;
my @pages = ();
if(defined($image_data))
{
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data);
my $image = $parsed_xml->{query}->{pages}->{page}->{title};
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu}))
{
if(ref($parsed_xml->{query}->{imageusage}->{iu}) eq 'ARRAY')
{
my @bad_pages = grep {$_->{ns} == 10 or $_->{ns} == 12} @{$parsed_xml->{query}->{imageusage}->{iu}};
my @good_pages = grep {$_->{ns} != 10 and $_->{ns} != 12} @{$parsed_xml->{query}->{imageusage}->{iu}};
@pages = map {$_->{title}} @good_pages;
if(scalar(@bad_pages) > 0 and defined($image)) # If "image" is undefined, we're probably doing a pure usage check, rather than one in preparation for removal
{
my $notice;
foreach my $page (@bad_pages)
{
$notice .= "*Found image [[:$image]] in [[$page->{title}]]\n";
}
botwarnlog($notice);
}
}
else
{
if($parsed_xml->{query}->{imageusage}->{iu}->{ns} != 10 and $parsed_xml->{query}->{imageusage}->{iu}->{ns} != 12)
{
@pages = $parsed_xml->{query}->{imageusage}->{iu}->{title};
}
else
{
if(defined($image))
{
botwarnlog("*Found image [[:$image]] in [[$parsed_xml->{query}->{imageusage}->{iu}->{title}]]\n");
}
}
}
}
}
return @pages;
}
# Get all pages. Don't filter for bad namespaces.
sub GetFullPageList
{
my $image = shift;
my @pages = ();
my $xml = Pearle::APIQuery(list => 'imageusage', iutitle => $image);
if(defined($xml))
{
my $parsed_xml = Pearle::getXMLParser()->XMLin($xml);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{imageusage}->{iu}) and defined($parsed_xml->{query}->{imageusage}->{iu}))
{
if(ref() eq 'ARRAY')
{
@pages = map {$_->{title}} @{$parsed_xml->{query}->{imageusage}->{iu}};
}
else
{
@pages = $parsed_xml->{query}->{imageusage}->{iu}->{title};
}
}
}
return @pages;
}
sub UpdateLink
{
my $page = shift;
my $from = shift;
my $to = shift;
die "No page to edit" if(!defined($page));
die "No link to change" if(!defined($from));
die "No new link" if(!defined($to));
Pearle::myLog(3, "Updating link from $from to $to\n");
my $wikipage = Pearle::getPage($page);
$wikipage->canonicalizeLinks();
my $text = $wikipage->getEditableText();
my $link_regex = MakeWikiRegex($from);
my $matches = $text =~ s/\x01($link_regex)\x02/\x01${to}|${1}\x02/gi;
$matches += $text =~ s/\x01$link_regex\|/\x01${to}|/gi;
$wikipage->setEditableText($text);
Pearle::postPage( $wikipage, "Updating link to bypass a redirect or disambiguation page", 0);
return $matches;
}
sub RemoveImageFromPage
{
my $image = shift;
my $page = shift;
my $image_regex = shift;
my $removal_prefix = shift;
my $removal_comment = shift;
my $wikipage;
my $text;
my ($match1, $match2);
my $old_length;
my $new_length;
my $change_len;
my $match_len;
tryagain:
# Fetch an article page
$wikipage = Pearle::getPage($page);
$wikipage->canonicalizeLinks();
$text = $wikipage->getEditableText();
if(!defined($text))
{
Pearle::myLog(1, "Error: Bad edit page [[$page]]\n");
botwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n"));
sleep(300);
return 0;
}
if($text =~ /^\s*$/)
{
# Might be protected instead of empty
Pearle::myLog(1, "Error: Empty or protected page [[$page]]\n");
botwarnlog(FixupLinks("*Error: Empty or protected page [[$page]]\n"));
sleep(300);
return 0;
}
if($text =~ /^#redirect/i)
{
Pearle::myLog(1, "Redirect found for page [[$page]] (image [[:$image]])\n");
botwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n"));
print $text;
return 0;
}
# Remove the image
my $regex3 = "(\x01${image_regex}[^\x01]*?(\x01[^\x02]*?\x02[^\x01]*?|)+\x02[ \\t]*)"; # Regex to match images
#my $regex3 = "(
# \x01 # Open double-bracket for the image
# ${image_regex} # The image itself
# [^\x01]*? # Anything up to the first link in the caption, or a closing double bracket (minimal match)
# (\x01 # Open double-bracket for a link in the caption
# [^\x02]*? # Anything but a closing double-bracket
# \x02 # The closing double-bracket for the link
# [^\x01]*?|) # Any non-link text, or nothing
# + # Matches one or more times
# \x02 # The closing double-bracket for the image
# [ \\t]*) # Any trailing whitespace
# ";
my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images
my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images
my ($raw_image) = $image =~ /Image:(.*)/;
my $regex4m = "\x01[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\x02"; # Regex to match inline Media: links
Pearle::myLog(3, "Regex 3: $regex3\n");
notelog("Regex 3: $regex3\n");
notelog("Regex 3 extended: $regex3ex\n");
notelog("Regex 3 gallery: $regex3g\n");
Pearle::myLog(3, "Raw regex: $raw_image\n");
notelog("Regex 4 Media: $regex4m\n");
if($text =~ /$regex3ex/)
{
Pearle::myLog(1, "Possible inline image in [[$page]]\n");
botwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n"));
return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
}
if($text =~ /-->/) # With the new editing markup, any close-comment means that somebody fucked up their wikimarkup
{
Pearle::myLog(3, "Fractional comment in page [[$page]]\n");
botwarnlog(FixupLinks("*Fractional comment in page [[$page]]\n"));
}
$text =~ /$regex3/;
$match_len = length($1);
$match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;
$new_length = length($text);
print "Num: $match2 Len: $match_len\n";
if($match2)
{
# If a whole lot of text was removed, log a warning
if($match_len > (500 + length($image)))
{
botwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n"));
if($match_len > (1000 + length($image)))
{
Pearle::myLog(2, "Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n");
print $text, "\n";
# exit;
return 0;
}
}
if($match_len < (2 + length($image)))
{
Pearle::myLog(0, "Short replacement of $match_len bytes (min " . (length($image) + 2) . ") in [[$page]] ($match2 matches). Exiting.\n");
Pearle::myLog(0, "Text:\n$text\n");
exit;
}
# If many matches, log a warning
if($match2 > 2)
{
Pearle::myLog(3, "More than one match ($match2) in page [[$page]]\n");
# botwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n"));
}
if($match2 > 100)
{
Pearle::myLog(1, "Too many matches ($match2) in page [[$page]]. Skipping.\n");
botwarnlog("*Too many matches ($match2) in page [[$page]]. Skipping.\n");
return 0;
}
}
# Put the text back and get it again in order to fold any comments resulting from removing non-gallery images.
# This is because gallery image matching will also match commented images.
$wikipage->setEditableText($text);
$text = $wikipage->getEditableText();
if($text =~ /<gallery/i)
{
Pearle::myLog(3, "*Possible image gallery in page [[$page]]\n");
if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/g)
{
# if($match2 != 0)
# {
# botwarnlog("*Both a gallery and a non-gallery in [[$page]]\n");
# }
$match2 += 1;
}
}
if($match2 > 0)
{
if($text =~ /\[\[(?: |)<!--/)
{
Pearle::myLog(2, "Possible multiline image in page [[$page]]\n");
botwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
}
}
# Improved infobox removal
my $infobox_regex = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*" . "[ _]*" . MakeWikiRegex($raw_image) . "[ _]*";
my $infobox_regex_full = "([-A-Za-z0-9_]+[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*=)[\\p{IsSpace}\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]*" . '[Ii]mage[ _]*:[ _]*' . MakeWikiRegex($raw_image);
if($text =~ /$infobox_regex/)
{
Pearle::myLog(3, "Matched on infobox regex: $infobox_regex\n");
Pearle::myLog(3, "Infobox parameter: $1\n");
if($& =~ /puic/)
{
botwarnlog(FixupLinks("*PUIC in page [[$page]]\n"));
}
else
{
my $sub = $1;
$text =~ s/$infobox_regex/$sub/g;
$match2 += 1;
}
}
if($text =~ /$infobox_regex_full/)
{
Pearle::myLog(3, "Matched on infobox regex: $infobox_regex_full\n");
Pearle::myLog(3, "Infobox parameter: $1\n");
if($& =~ /puic/)
{
botwarnlog(FixupLinks("*PUIC in page [[$page]]\n"));
}
else
{
my $sub = $1;
$text =~ s/$infobox_regex_full/$sub/g;
$match2 += 1;
}
}
if($match2) # No need to null-edit articles anymore
{
if($test_only)
{
notelog("Test removal from page succeeded\n");
}
else
{
# Submit the changes
$wikipage->setEditableText($text);
eval
{
Pearle::postPage($wikipage, $removal_comment, 0);
};
if($@)
{
if($@ =~ /^924 Spam filter: (.*)$/)
{
botwarnlog("*Spam filter on page [[$page]], url <nowiki>$1\n");
$match2 = 0; # We weren't able to remove it
}
elsif($@ =~ /^922/)
{
# Edit conflict. Try editing the page again.
botwarnlog("*Edit conflict on page [[$page]]\n");
goto tryagain;
}
else
{
die;
}
}
}
}
return ($match2)
}
# Returns 1 if the user has been notified, or 0 if they haven't
sub IsNotified
{
my $uploader = shift;
my $image_regex = shift;
my $image_name = shift;
my $notes_ref = shift;
my $donts_ref = shift;
# Check notification list
if($notes_ref->{"$uploader,$image_name"})
{
Pearle::myLog(3, "Already notified for this image\n");
return 1;
}
if($donts_ref->{$uploader})
{
Pearle::myLog(3, "On exception list: $uploader\n");
return 1;
}
# # Check uploader's talkpage
my $page_data = Pearle::APIQuery(titles => "User talk:$uploader", prop => 'links', plnamespace => 6);
if($page_data =~ /$image_regex/)
{
Pearle::myLog(3, "Has a link from userpage\n");
return 1;
}
# my $wikipage = Pearle::getPage("User talk:$uploader");
# my $text = $wikipage->getWikiText();
# if($text =~ /$image_regex/)
# {
# Pearle::myLog(3, "Already notified by someone else\n");
# $donts_ref->{"$uploader,$image_name"} = 1;
# return 1;
# }
# else
# {
# Pearle::myLog(3, "Not already notified\n");
# return $wikipage;
# }
return 0;
}
sub isDated
{
my $image_text = shift;
if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) # Dated template
{
myLog(4, "Dated tag $1 $2 $3\n");
return 1;
}
# as of 6 October 2006">
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
{
myLog(4, "Template borked; category $1 $2 $3\n");
return 1;
}
elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/) # Generic template
{
myLog(4, "Generic tag\n");
return 0;
}
else
{
myLog(4, "No tag match\n");
return 0;
}
}
# Return the tag date if there is one, the upload date if not
# Returns in (day, month, year) format
sub getDate
{
my $image_text = shift;
if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)
{
myLog(4, "Template date $1-$2-$3\n");
return ($1, $2, $3);
}
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
{
myLog(4, "Category date $1-$2-$3\n");
return ($1, $2, $3);
}
elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</)
{
myLog(4, "Upload date $1-$2-$3\n");
# return ($1, $2, $3);
# For now, be conservative:
my ($year, $month, $day) = Today();
return ($day, Month_to_Text($month), $year);
}
else
{
myLog(4, "No date\n");
return (1, "January", 2007);
}
}
# Return a list of upload dates
sub getUploadDates
{
my @dates;
my $image_text = shift;
while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g)
{
push @dates, [$1, $2, $3];
}
return @dates;
}
sub getLastEditDate
{
my ($day, $month, $year);
my $image = shift;
my @history = Pearle::parseHistory($image);
(undef, $day, $month, $year) = @{$history[0]};
return ($day, $month, $year);
}
# Find the most recent non-vandal, non-revert uploader
sub GetImageUploader
{
my $image_data = shift;
my ($uploader, $sha1, $comment);
my @uploaders;
my $uploader_data;
my $i = 0;
my $count = 0;
my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data);
Pearle::myLog(4, Dumper($parsed_xml));
if(exists($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) and defined($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}))
{
if(ref($parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}) eq 'ARRAY')
{
@uploaders = @{$parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}};
}
else
{
return $parsed_xml->{query}->{pages}->{page}->{imageinfo}->{ii}->{user};
}
}
else
{
return undef;
}
$uploader = $uploaders[0]->{user};
$sha1 = $uploaders[0]->{sha1};
$comment = $uploaders[0]->{comment} || "";
my $done = 0;
while(!$done)
{
if($comment =~ /^Reverted/)
{
Pearle::myLog(4, "Revert found\n");
$i += 1;
while($uploaders[$i]->{sha1} ne $sha1)
{
$i = $i + 1;
}
}
elsif($comment =~ /optimi(z|s)ed|adjust|tweak|scale|crop|change|resize|remove/i)
{
Pearle::myLog(4, "Tweak found\n");
$i = $i + 1;
}
elsif(!defined($uploader))
{
Pearle::myLog(4, "Something went wrong with finding the uploader\n");
$done = 1;
}
elsif($count > 500)
{
Pearle::myLog(4, "Took too long finding the uploader\n");
$uploader = undef;
$done = 1;
}
else
{
$done = 1;
}
$uploader = $uploaders[$i]->{user};
$sha1 = $uploaders[$i]->{sha1};
$comment = $uploaders[$i]->{comment} || "";
$count = $count + 1;
}
if(defined($uploader))
{
Pearle::myLog(4, "Uploader: $uploader\n");
return $uploader;
}
else
{
return undef;
}
}
sub loadNotificationList
{
my $file = shift;
my %notelist;
my $i = 0;
notelog("File: $file\n");
open INFILE, "<:utf8", $file;
while(<INFILE>)
{
$_ =~ s/\s*#.*$//g;
chomp;
$notelist{$_} = 1;
$i++;
}
close INFILE;
notelog("$i notifications loaded\n");
return %notelist;
}
sub saveNotificationList
{
return if($test_only);
my $file = shift;
my %notelist = @_;
my $key;
open OUTFILE, ">:utf8", $file;
foreach $key (keys(%notelist))
{
print OUTFILE "$key\n";
}
close OUTFILE;
}
1;
</nowiki>