From Wikipedia, the free encyclopedia
#!/usr/bin/perl
# libBot: A library of useful routines for running a bot
use strict;
use warnings;
require "libPearle2.pl";
my $test_only = 0;
my $username = "";
sub config
{
my %params = @_;
$test_only = $params{test_only} if(defined($params{test_only}));
$username = $params{username} if(defined($params{username}));
}
# Log a warning on the talk page of the bot
sub userwarnlog
{
my ($text, $editTime, $startTime, $token, $user, $summary, $session);
$user = $_[1];
$user = $username if(!defined($user));
$summary = $_[2];
$summary = "Logging warning message" if(!defined($summary));
$session = $_[3];
if(defined($session))
{
# We've been handed an editing session
($text, $editTime, $startTime, $token) = @{$session};
Pearle::myLog("Warning with existing edit session\n");
}
else
{
($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$user");
}
if($test_only)
{
print STDERR $_[0];
return;
}
if($text =~ /^#redirect/i)
{
userwarnlog("*User talk page [[User talk:$user]] is a redirect\n");
return;
}
$text .= $_[0];
Pearle::postPage("User talk:$user", $editTime, $startTime, $token, $text, $summary, "no");
print STDERR $_[0];
}
# Log a notification message to the console
sub notelog
{
print STDERR @_;
}
# Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
my $link = shift;
$link =~ s/\[\[(Category|Image)/[[:$1/g;
return $link;
}
# Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
my $string = shift;
# Escape metacharacters
$string =~ s/\\/\\\\/g;
$string =~ s/\./\\\./g;
$string =~ s/\(/\\\(/g;
$string =~ s/\)/\\\)/g;
$string =~ s/\[/\\\[/g;
$string =~ s/\]/\\\]/g;
$string =~ s/\+/\\\+/g;
$string =~ s/\*/\\\*/g;
$string =~ s/\?/\\\?/g;
$string =~ s/\^/\\\^/g;
$string =~ s/\$/\\\$/g;
# Process the string to match both with spaces and with underscores
$string =~ s/[ _]/[ _]+/g;
# Process the string to match both upcase and lowercase first characters
if($string =~ /^[A-Za-z]/)
{
$string =~ s/^(.)/"[$1".lc($1)."]"/e;
}
return $string;
}
# Check for new talk page messages
sub DoIHaveMessages
{
my $text = shift;
if($text =~ /<div class="usermessage">You have/)
{
return 1;
}
else
{
return 0;
}
}
sub GetPageList
{
my $image = shift;
my $image_text = shift;
my @pages = ();
# Extract the page links
# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
# </ul>
while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
{
my $title;
$title = $2;
# Unescape any HTML entities in the title
$title =~ s/</</g;
$title =~ s/>/>/g;
$title =~ s/"/"/g;
$title =~ s/&/&/g;
notelog("Matched article $title\n");
# Filter out bad namespaces
if($title =~ /^(User:|Talk:|User talk:|Template talk:|Image:|Image talk:|Category talk:|Wikipedia:|Wikipedia talk:|Portal talk:)/) # Leave these alone
{
notelog("Ignoring [[$title]] due to namespace\n");
}
elsif($title =~ /^Special:/)
{
# Ignore Special: pages completely
}
elsif($title =~ /^(MediaWiki:|MediaWiki talk:|Template:|Help:|Help talk:)/) # Log a warning about these, but otherwise leave them alone
{
userwarnlog("*Found image [[:$image]] in [[$title]]\n");
}
else # Good namespaces: article, Category:, Portal:
{
push @pages, $title;
}
}
return @pages;
}
# Get all pages. Don't filter for bad namespaces.
sub GetFullPageList
{
my $image = shift;
my $image_text = shift;
my @pages = ();
# Extract the page links
# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
# </ul>
while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
{
my $title;
$title = $2;
# Unescape any HTML entities in the title
$title =~ s/</</g;
$title =~ s/>/>/g;
$title =~ s/"/"/g;
$title =~ s/&/&/g;
notelog("Matched article $title\n");
push @pages, $title;
}
return @pages;
}
sub SaveImage
{
my $image = shift;
my $image_text = shift;
my $image_path = shift;
my $image_url;
($image_url) = $image_text =~ /<a href="(http:\/\/upload\.wikimedia\.org\/wikipedia\/en\/[^"]+)"/;
if(defined($image_url))
{
my $filename;
my $image_data;
notelog("Fetching image $image_url\n");
($filename) = $image_url =~ /(\/[^\/]+)$/;
$filename = $image_path . $filename;
if(! -e $filename)
{
if($test_only)
{
notelog("Would save to $filename...");
}
else
{
$image_url = Pearle::urlDecode($image_url);
$image_data = Pearle::getURL($image_url);
notelog("Saving to $filename...");
if(defined($filename) and $filename)
{
open OUTFILE, ">", $filename;
print OUTFILE $image_data;
close OUTFILE;
notelog("Image saved\n");
Pearle::myLog("Image $image saved as $filename\n");
}
else
{
notelog("Failed\n");
}
}
}
else
{
notelog("File already exists\n");
}
}
}
sub RemoveImageFromPage
{
my $image = shift;
my $page = shift;
my $image_regex = shift;
my $removal_prefix = shift;
my $removal_comment = shift;
my ($text, $editTime, $startTime, $token);
my ($match1, $match2);
my $old_length;
my $new_length;
my $change_len;
my $match_len;
# Fetch an article page
($text, $editTime, $startTime, $token) = Pearle::getPage($page);
if(!defined($text))
{
Pearle::myLog("Error: Bad edit page [[$page]]\n");
userwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n"));
sleep(300);
return 0;
}
if($text =~ /^\s*$/)
{
# Might be protected instead of empty
Pearle::myLog("Error: Empty page [[$page]]\n");
userwarnlog(FixupLinks("*Error: Empty page [[$page]]\n"));
sleep(300);
return 0;
}
if($text =~ /^#redirect/i)
{
Pearle::myLog("Redirect found for page [[$page]] (image [[:$image]])\n");
userwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n"));
return 0;
}
# Remove the image
my $regex3 = "(\\[\\[${image_regex}.*?(\\[\\[.*?\\]\\].*?|)+\\]\\][ \\t]*)"; # Regex to match images
my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w"; # Regex to try to spot inline images
my $regex3c = "<!--.*${regex3}.*-->"; # Regex to spot images in comments
my $regex3g = "(${image_regex}.*)"; # Regex to match gallery images
my $regex3gc = "<!--.*${regex3g}-->"; # Regex to spot gallery images in comments
my ($raw_image) = $image =~ /Image:(.*)/;
my $regex4a = "([Cc]over\\s*=\\s*)" . MakeWikiRegex($raw_image);
my $regex4b = "(image_skyline\\s*=\\s*)" . MakeWikiRegex($raw_image);
my $regex4i = "(image\\s*=\\s*)" . MakeWikiRegex($raw_image); # Regex to match "image = " sections in infoboxes
my $regex4p = "(picture\\s*=\\s*)" . MakeWikiRegex($raw_image); # Regex to match "picture = " sections in infoboxes
my $regex4m = "\\[\\[[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\\]\\]"; # Regex to match inline Media: links
my $regex4g = "(img\\s*=\\s*)" . MakeWikiRegex($raw_image); # Regex to match "img = " sections in infoboxes
Pearle::myLog("Regex 3: $regex3\n");
notelog("Regex 3: $regex3\n");
notelog("Regex 3 extended: $regex3ex\n");
notelog("Regex 3 gallery: $regex3g\n");
Pearle::myLog("Raw regex: $raw_image\n");
notelog("Regex 4 Album: $regex4a\n");
notelog("Regex 4 City: $regex4b\n");
notelog("Regex 4 Image: $regex4i\n");
notelog("Regex 4 Media: $regex4m\n");
notelog("Regex 4 Picture: $regex4p\n");
notelog("Regex 4 Img: $regex4g\n");
if($text =~ /$regex3ex/)
{
Pearle::myLog("Possible inline image in [[$page]]\n");
userwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n"));
return 0; # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
}
if($text =~ /$regex3c/ or $text =~ /$regex3gc/)
{
Pearle::myLog("Image in comment in [[$page]]\n");
# userwarnlog(FixupLinks("*Image in comment in [[$page]]\n"));
return 0; # Can't do gallery matching because that also matches regular images
}
$text =~ /$regex3/;
$match_len = length($1);
$match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;
$new_length = length($text);
print "Num: $match2 Len: $match_len\n";
if($match2)
{
# If a whole lot of text was removed, log a warning
if($match_len > (500 + length($image)))
{
userwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n"));
if($match_len > (1000 + length($image)))
{
notelog("Unusually long caption found. Exiting.\n");
Pearle::myLog("Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n");
exit;
}
}
if($match_len < (4 + length($image)))
{
notelog("*Short replacement of $match_len bytes in [[$page]]\n");
Pearle::myLog("Short replacement of $match_len bytes (min " . (length($image) + 4) . ") in [[$page]] ($match2 matches). Exiting.\n");
Pearle::myLog("Text:\n$text\n");
exit;
}
# If many matches, log a warning
if($match2 > 2)
{
Pearle::myLog("More than one match ($match2) in page [[$page]]\n");
# userwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n"));
}
if($match2 > 100)
{
Pearle::myLog("Too many matches ($match2) in page [[$page]]. Skipping.\n");
userwarnlog("Too many matches ($match2) in page [[$page]]. Skipping.\n");
return 0;
}
# If there might be a reference, log a warning
# if($text =~ /(?:see (?:image|picture|graph|diagram|right|left)|\(left\)|\(right\)|\(below\)|\(above\))/)
# {
# Pearle::myLog("Possible image reference in page [[$page]]\n");
# userwarnlog("*Possible image reference in page [[$page]]\n");
# }
if($text =~ /-->\]/)
{
Pearle::myLog("Possible bracket mixup in page [[$page]]\n");
userwarnlog(FixupLinks("*Possible bracket mixup in page [[$page]]\n"));
}
# if($text =~ /\[\[(?: |)<!--/)
# {
# Pearle::myLog("Possible multiline image in page [[$page]]\n");
# userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
# }
}
elsif($text =~ /<gallery/)
{
Pearle::myLog("*Possible image gallery in page [[$page]]\n");
if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/)
{
$match2 += 1;
}
}
if($match2 > 0)
{
if($text =~ /\[\[(?: |)<!--/)
{
Pearle::myLog("Possible multiline image in page [[$page]]\n");
userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
}
}
# Infobox removal
if($text =~ /{{Album[ _]infobox|{{Infobox[ _]Album/i)
{
if($text =~ s/$regex4a/$1/)
{
Pearle::myLog("*Album infobox in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox[ _]City/i)
{
if($text =~ s/$regex4b/$1/)
{
Pearle::myLog("*City infobox in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Taxobox/i)
{
if($text =~ s/$regex4i/$1/)
{
Pearle::myLog("*Taxobox in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{NFL[ _]player/i)
{
if($text =~ s/$regex4i/$1/i)
{
Pearle::myLog("*NFL Playerbox in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox[ _]President/i)
{
if($text =~ s/$regex4i/$1/i)
{
Pearle::myLog("*Presidentbox in page [[$page]]\n");
# userwarnlog("*Presidentbox in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox[ _]Cricketer/i)
{
if($text =~ s/$regex4p/picture = cricket no pic.png/i)
{
Pearle::myLog("*Cricketer in page [[$page]]\n");
# userwarnlog("*Cricketer in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox[ _]Celebrity/)
{
if($text =~ s/$regex4i/$1/i)
{
Pearle::myLog("*Celebrity in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox[ _]Wrestler/)
{
if($text =~ s/$regex4i/$1/i)
{
Pearle::myLog("*Wrestler in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox musical artist 2/)
{
if($text =~ s/$regex4g/$1/i)
{
Pearle::myLog("*InfoMusArt2 in page [[$page]]\n");
$match2 += 1;
}
}
if($text =~ /{{Infobox Model/)
{
if($text =~ s/$regex4i/$1/i)
{
Pearle::myLog("*Model in page [[$page]]\n");
$match2 += 1;
}
}
if($match2) # No need to null-edit articles anymore
{
if($test_only)
{
notelog("Test removal from page succeeded\n");
}
else
{
# Submit the changes
Pearle::postPage($page, $editTime, $startTime, $token, $text, $removal_comment, "no");
}
}
return ($match2)
}
# Returns 1 if the user has been notified, or a reference to the userpage edit session if they haven't
sub isNotified
{
my $image_text = shift;
my $uploader = shift;
my $image_regex = shift;
my $image_name = shift;
my $notes_ref = shift;
my $donts_ref = shift;
# Check notification list
if($notes_ref->{"$uploader,$image_name"})
{
notelog("Already notified for this image\n");
return 1;
}
if($donts_ref->{$uploader})
{
notelog("On exception list\n");
Pearle::myLog("On exception list: $uploader\n");
return 1;
}
# Check uploader's talkpage
my ($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$uploader");
if($text =~ /$image_regex/)
{
notelog("Already notified by someone else\n");
$donts_ref->{"$uploader,$image_name"} = 1;
return 1;
}
else
{
print "Not already notified\n";
return [$text, $editTime, $startTime, $token];
}
}
sub isDated
{
my $image_text = shift;
if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/) # Dated template
{
print "Dated tag $1 $2 $3\n";
return 1;
}
# as of 6 October 2006">
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
{
print "Template borked; category $1 $2 $3\n";
return 1;
}
elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/) # Generic template
{
print "Generic tag\n";
return 0;
}
else
{
print "No tag match\n";
return 0;
}
}
# Return the tag date if there is one, the upload date if not
# Returns in (day, month, year) format
sub getDate
{
my $image_text = shift;
if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)
{
print "Template date $1-$2-$3\n";
return ($1, $2, $3);
}
elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
{
print "Category date $1-$2-$3\n";
return ($1, $2, $3);
}
elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</)
{
print "Upload date $1-$2-$3\n";
# return ($1, $2, $3);
# For now, be conservative:
my ($year, $month, $day) = Today();
return ($day, Month_to_Text($month), $year);
}
else
{
print "No date\n";
return (1, "January", 2006);
}
}
# Return a list of upload dates
sub getUploadDates
{
my @dates;
my $image_text = shift;
while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g)
{
push @dates, [$1, $2, $3];
}
return @dates;
}
sub getLastEditDate
{
my ($day, $month, $year);
my $image = shift;
my @history = Pearle::parseHistory($image);
(undef, $day, $month, $year) = @{$history[0]};
return ($day, $month, $year);
}
# Find the most recent non-vandal, non-revert uploader
sub getUploader
{
my $image_text = shift;
my ($uploader, $dims, $bytes, $comment);
my @uploaders;
my $uploader_data;
my $i = 0;
# title="User:Jamie100">Jamie100</a> (<a href="/wiki/User_talk:Jamie100" title="User talk:Jamie100">Talk</a>) . . 424x216 (25800 bytes) <span class='comment'>(Reverted to earlier revision)</span></li>
# while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a>\) \. \. (\d+x\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a> \| <a href="[^"]*" title="[^"]*">contribs<\/a>\) \. \. (\d+.+?\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
{
($uploader, $dims, $bytes, $comment) = ($1, $2, $3, $4);
$bytes =~ s/,//g; # Remove commas to turn into a real number
$comment = "" if(!defined($comment)); # Reduce warnings
push @uploaders, [$uploader, $dims, $bytes, $comment];
notelog("Uploader found: $uploader, $dims, $bytes, $comment\n");
$i++;
die "Too many uploaders: $i\n" if($i > 100);
}
my $max = scalar(@uploaders);
print $max, "\n";
for($i = 0; $i < $max; $i++)
{
$uploader = $uploaders[$i][0];
if($uploaders[$i][3] =~ /Reverted/)
{
$dims = $uploaders[$i][1];
$bytes = $uploaders[$i][2];
notelog("Revert found: $uploader, $dims, $bytes\n");
$i++;
while(($dims ne $uploaders[$i][1] or $bytes ne $uploaders[$i][2]) and $i < $max)
{
notelog("Reversion data: $uploaders[$i][1], $uploaders[$i][2], $i\n");
$uploader = $uploaders[$i][0];
$i++;
}
}
elsif($uploaders[$i][3] =~ /optimi(z|s)|adjust|tweak|scale|crop|change|resize/i)
{
notelog("Optimize found. Skipping.\n");
}
else
{
notelog("Uploader: $uploader ($i)\n");
last;
}
}
$uploader = undef if($i >= $max);
print "Uploader: $uploader\n";
return $uploader;
}
# See if the specified category exists, and if not, create it
sub checkImageCategory
{
my $cat;
my ($text, $editTime, $startTime, $token);
$cat = "Category:Images with unknown source as of $_[0] $_[1] $_[2]";
($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
if($text !~ /\[\[[Cc]ategory:[Ii]mages with unknown source/)
{
$text .= "\n[[Category:Images with unknown source| ]]\n";
if($test_only)
{
notelog("Would create category [[:$cat]]\n");
}
else
{
Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
userwarnlog("*Created category [[:$cat]]\n");
}
}
}
sub loadNotificationList
{
my $file = shift;
my %notelist;
my $i = 0;
notelog("File: $file\n");
open INFILE, "<", $file;
while(<INFILE>)
{
$_ =~ s/\s*#.*$//g;
chomp;
$notelist{$_} = 1;
$i++;
}
close INFILE;
notelog("$i notifications loaded\n");
return %notelist;
}
sub saveNotificationList
{
return if($test_only);
my $file = shift;
my %notelist = @_;
my $key;
open OUTFILE, ">", $file;
foreach $key (keys(%notelist))
{
print OUTFILE "$key\n";
}
close OUTFILE;
}
1;