User:OrphanBot/libBot.pl

From Wikipedia, the free encyclopedia


#!/usr/bin/perl

# libBot: A library of useful routines for running a bot

use strict;
use warnings;

require "libPearle2.pl";

my $test_only = 0;
my $username = "";

sub config
{
        my %params = @_;
        
        $test_only = $params{test_only} if(defined($params{test_only}));
        $username = $params{username} if(defined($params{username}));
}

# Log a warning on the talk page of the bot
sub userwarnlog
{
        my ($text, $editTime, $startTime, $token, $user, $summary, $session);
        $user = $_[1];
        $user = $username if(!defined($user));
        $summary = $_[2];
        $summary = "Logging warning message" if(!defined($summary));
        $session = $_[3];
        
        if(defined($session))
        {
                # We've been handed an editing session
                ($text, $editTime, $startTime, $token) = @{$session};
                Pearle::myLog("Warning with existing edit session\n");
        }
        else
        {
                ($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$user");
        }
        
        if($test_only)
        {
                print STDERR $_[0];
                return;
        }
        
        if($text =~ /^#redirect/i)
        {
                userwarnlog("*User talk page [[User talk:$user]] is a redirect\n");
                return;
        }
        $text .= $_[0];
        Pearle::postPage("User talk:$user", $editTime, $startTime, $token, $text, $summary, "no");
        print STDERR $_[0];
}

# Log a notification message to the console
sub notelog
{
        print STDERR @_;
}

# Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
        my $link = shift;
        $link =~ s/\[\[(Category|Image)/[[:$1/g;
        return $link;
}

# Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
        my $string = shift;
        # Escape metacharacters
        $string =~ s/\\/\\\\/g;
        $string =~ s/\./\\\./g;
        $string =~ s/\(/\\\(/g;
        $string =~ s/\)/\\\)/g;
        $string =~ s/\[/\\\[/g;
        $string =~ s/\]/\\\]/g;
        $string =~ s/\+/\\\+/g;
        $string =~ s/\*/\\\*/g;
        $string =~ s/\?/\\\?/g;
        $string =~ s/\^/\\\^/g;
        $string =~ s/\$/\\\$/g;
        # Process the string to match both with spaces and with underscores
        $string =~ s/[ _]/[ _]+/g;

        # Process the string to match both upcase and lowercase first characters
        if($string =~ /^[A-Za-z]/)
        {       
                $string =~ s/^(.)/"[$1".lc($1)."]"/e;
        }
        return $string;
}

# Check for new talk page messages
sub DoIHaveMessages
{
        my $text = shift;
        if($text =~ /<div class="usermessage">You have/)
        {
                return 1;
        }
        else
        {
                return 0;
        }
}


sub GetPageList
{
        my $image = shift;
        my $image_text = shift;
        my @pages = ();
        # Extract the page links
        # <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
        # <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
        # </ul>
        while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
        {
                my $title;
                $title = $2;
                # Unescape any HTML entities in the title
                $title =~ s/</</g;
                $title =~ s/>/>/g;
                $title =~ s/"/"/g;
                $title =~ s/&/&/g;

                notelog("Matched article $title\n");

                # Filter out bad namespaces
                if($title =~ /^(User:|Talk:|User talk:|Template talk:|Image:|Image talk:|Category talk:|Wikipedia:|Wikipedia talk:|Portal talk:)/)      # Leave these alone
                {
                        notelog("Ignoring [[$title]] due to namespace\n");
                }
                elsif($title =~ /^Special:/)
                {
                        # Ignore Special: pages completely
                }
                elsif($title =~ /^(MediaWiki:|MediaWiki talk:|Template:|Help:|Help talk:)/)             # Log a warning about these, but otherwise leave them alone
                {
                        userwarnlog("*Found image [[:$image]] in [[$title]]\n");
                }
                else    # Good namespaces: article, Category:, Portal:
                {
                        push @pages, $title;
                }
        }
        return @pages;
}

# Get all pages.  Don't filter for bad namespaces.
sub GetFullPageList
{
        my $image = shift;
        my $image_text = shift;
        my @pages = ();
        # Extract the page links
        # <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
        # <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
        # </ul>
        while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
        {
                my $title;
                $title = $2;
                # Unescape any HTML entities in the title
                $title =~ s/</</g;
                $title =~ s/>/>/g;
                $title =~ s/"/"/g;
                $title =~ s/&/&/g;

                notelog("Matched article $title\n");

                push @pages, $title;
        }
        return @pages;
}

sub SaveImage
{
        my $image = shift;
        my $image_text = shift;
        my $image_path = shift;
        
        my $image_url;
        
        ($image_url) = $image_text =~ /<a href="(http:\/\/upload\.wikimedia\.org\/wikipedia\/en\/[^"]+)"/;
        if(defined($image_url))
        {
                my $filename;
                my $image_data;
                notelog("Fetching image $image_url\n");
                ($filename) = $image_url =~ /(\/[^\/]+)$/;
                $filename = $image_path . $filename;
                if(! -e $filename)
                {
                        if($test_only)
                        {
                                notelog("Would save to $filename...");
                        }
                        else
                        {
                                $image_url = Pearle::urlDecode($image_url);
                                $image_data = Pearle::getURL($image_url);
                                notelog("Saving to $filename...");
                                if(defined($filename) and $filename)
                                {
                                        open OUTFILE, ">", $filename;
                                        print OUTFILE $image_data;
                                        close OUTFILE;
                                        notelog("Image saved\n");
                                        Pearle::myLog("Image $image saved as $filename\n");
                                }
                                else
                                {
                                        notelog("Failed\n");
                                }
                        }
                }
                else
                {
                        notelog("File already exists\n");
                }
        }                       
}

sub RemoveImageFromPage
{
        my $image = shift;
        my $page = shift;
        my $image_regex = shift;
        my $removal_prefix = shift;
        my $removal_comment = shift;

        my ($text, $editTime, $startTime, $token);
        my ($match1, $match2);
        my $old_length;
        my $new_length;
        my $change_len;
        my $match_len;

        # Fetch an article page
        ($text, $editTime, $startTime, $token) = Pearle::getPage($page);
        
        if(!defined($text))
        {
                Pearle::myLog("Error: Bad edit page [[$page]]\n");
                userwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n"));
                sleep(300);
                return 0;
        }
        
        if($text =~ /^\s*$/)
        {
                # Might be protected instead of empty
                Pearle::myLog("Error: Empty page [[$page]]\n");
                userwarnlog(FixupLinks("*Error: Empty page [[$page]]\n"));
                sleep(300);
                return 0;
        }
        
        if($text =~ /^#redirect/i)
        {
                Pearle::myLog("Redirect found for page [[$page]] (image [[:$image]])\n");
                userwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n"));
                return 0;
        }

        # Remove the image
        my $regex3 = "(\\[\\[${image_regex}.*?(\\[\\[.*?\\]\\].*?|)+\\]\\][ \\t]*)";  # Regex to match images
        my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w";                                                                       # Regex to try to spot inline images
        my $regex3c = "<!--.*${regex3}.*-->";                                                                                   # Regex to spot images in comments
        my $regex3g = "(${image_regex}.*)";                                                                                           # Regex to match gallery images
        my $regex3gc = "<!--.*${regex3g}-->";                                                                                   # Regex to spot gallery images in comments
        my ($raw_image) = $image =~ /Image:(.*)/;       
        my $regex4a = "([Cc]over\\s*=\\s*)" . MakeWikiRegex($raw_image);
        my $regex4b = "(image_skyline\\s*=\\s*)" . MakeWikiRegex($raw_image);
        my $regex4i = "(image\\s*=\\s*)" . MakeWikiRegex($raw_image);                                         # Regex to match "image = " sections in infoboxes
        my $regex4p = "(picture\\s*=\\s*)" . MakeWikiRegex($raw_image);                                       # Regex to match "picture = " sections in infoboxes

        my $regex4m = "\\[\\[[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\\]\\]";       # Regex to match inline Media: links
        my $regex4g =  "(img\\s*=\\s*)" . MakeWikiRegex($raw_image);  # Regex to match "img = " sections in infoboxes
        Pearle::myLog("Regex 3: $regex3\n");
        notelog("Regex 3: $regex3\n");
        notelog("Regex 3 extended: $regex3ex\n");
        notelog("Regex 3 gallery: $regex3g\n");
        Pearle::myLog("Raw regex: $raw_image\n");
        notelog("Regex 4 Album: $regex4a\n");
        notelog("Regex 4 City: $regex4b\n");
        notelog("Regex 4 Image: $regex4i\n");
        notelog("Regex 4 Media: $regex4m\n");
        notelog("Regex 4 Picture: $regex4p\n");
        notelog("Regex 4 Img: $regex4g\n");
        
        if($text =~ /$regex3ex/)
        {
                Pearle::myLog("Possible inline image in [[$page]]\n");
                userwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n"));
                return 0;       # Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
        }
        
        if($text =~ /$regex3c/ or $text =~ /$regex3gc/)
        {
                Pearle::myLog("Image in comment in [[$page]]\n");
#               userwarnlog(FixupLinks("*Image in comment in [[$page]]\n"));
                return 0;       # Can't do gallery matching because that also matches regular images
        }
        
        $text =~ /$regex3/;
        $match_len = length($1);
        $match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;

        $new_length = length($text);
        print "Num: $match2 Len: $match_len\n";
        if($match2)
        {
                # If a whole lot of text was removed, log a warning
                if($match_len > (500 + length($image)))
                {
                        userwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n"));
                        if($match_len > (1000 + length($image)))
                        {
                                notelog("Unusually long caption found.  Exiting.\n");
                                Pearle::myLog("Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n");
                                exit;
                        }
                }
                if($match_len < (4 + length($image)))
                {
                        notelog("*Short replacement of $match_len bytes in [[$page]]\n");
                        Pearle::myLog("Short replacement of $match_len bytes (min " . (length($image) + 4) . ") in [[$page]] ($match2 matches).  Exiting.\n");
                        Pearle::myLog("Text:\n$text\n");
                        exit;
                }
                # If many matches, log a warning
                if($match2 > 2)
                {
                        Pearle::myLog("More than one match ($match2) in page [[$page]]\n");
#                       userwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n"));
                }
                if($match2 > 100)
                {
                        Pearle::myLog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
                        userwarnlog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
                        return 0;
                }
                # If there might be a reference, log a warning
#               if($text =~ /(?:see (?:image|picture|graph|diagram|right|left)|\(left\)|\(right\)|\(below\)|\(above\))/)
#               {
#                       Pearle::myLog("Possible image reference in page [[$page]]\n");
#                       userwarnlog("*Possible image reference in page [[$page]]\n");
#               }
                if($text =~ /-->\]/)
                {
                        Pearle::myLog("Possible bracket mixup in page [[$page]]\n");
                        userwarnlog(FixupLinks("*Possible bracket mixup in page [[$page]]\n"));
                }
#               if($text =~ /\[\[(?: |)<!--/)
#               {
#                       Pearle::myLog("Possible multiline image in page [[$page]]\n");
#                       userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
#               }
        }
        elsif($text =~ /<gallery/)
        {
                Pearle::myLog("*Possible image gallery in page [[$page]]\n");
                if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/)
                {
                        $match2 += 1;
                }
        }

        if($match2 > 0)
        {
                if($text =~ /\[\[(?: |)<!--/)
                {
                        Pearle::myLog("Possible multiline image in page [[$page]]\n");
                        userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
                }
        }

        # Infobox removal
        if($text =~ /{{Album[ _]infobox|{{Infobox[ _]Album/i)
        {
                if($text =~ s/$regex4a/$1/)
                {
                        Pearle::myLog("*Album infobox in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox[ _]City/i)
        {
                if($text =~ s/$regex4b/$1/)
                {
                        Pearle::myLog("*City infobox in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Taxobox/i)
        {
                if($text =~ s/$regex4i/$1/)
                {
                        Pearle::myLog("*Taxobox in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{NFL[ _]player/i)
        {
                if($text =~ s/$regex4i/$1/i)
                {
                        Pearle::myLog("*NFL Playerbox in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox[ _]President/i)
        {
                if($text =~ s/$regex4i/$1/i)
                {
                        Pearle::myLog("*Presidentbox in page [[$page]]\n");
#                       userwarnlog("*Presidentbox in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox[ _]Cricketer/i)
        {
                if($text =~ s/$regex4p/picture = cricket no pic.png/i)
                {
                        Pearle::myLog("*Cricketer in page [[$page]]\n");
#                       userwarnlog("*Cricketer in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox[ _]Celebrity/)
        {
                if($text =~ s/$regex4i/$1/i)
                {
                        Pearle::myLog("*Celebrity in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox[ _]Wrestler/)
        {
                if($text =~ s/$regex4i/$1/i)
                {
                        Pearle::myLog("*Wrestler in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox musical artist 2/)
        {
                if($text =~ s/$regex4g/$1/i)
                {
                        Pearle::myLog("*InfoMusArt2 in page [[$page]]\n");
                        $match2 += 1;
                }
        }
        if($text =~ /{{Infobox Model/)
        {
                if($text =~ s/$regex4i/$1/i)
                {
                        Pearle::myLog("*Model in page [[$page]]\n");
                        $match2 += 1;
                }
        }

        if($match2)     # No need to null-edit articles anymore
        {
                if($test_only)
                {
                        notelog("Test removal from page succeeded\n");
                }
                else
                {
                        # Submit the changes
                        Pearle::postPage($page, $editTime, $startTime, $token, $text, $removal_comment, "no");
                }
        }
        
        return ($match2)
}

# Returns 1 if the user has been notified, or a reference to the userpage edit session if they haven't
sub isNotified
{
        my $image_text = shift;
        my $uploader = shift;
        my $image_regex = shift;
        my $image_name = shift;
        my $notes_ref = shift;
        my $donts_ref = shift;

        # Check notification list
        if($notes_ref->{"$uploader,$image_name"})
        {
                notelog("Already notified for this image\n");
                return 1;
        }

        if($donts_ref->{$uploader})
        {
                notelog("On exception list\n");
                Pearle::myLog("On exception list: $uploader\n");
                return 1;
        }
        
        # Check uploader's talkpage
        my ($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$uploader");
        if($text =~ /$image_regex/)
        {
                notelog("Already notified by someone else\n");
                $donts_ref->{"$uploader,$image_name"} = 1;
                return 1;
        }
        else
        {
                print "Not already notified\n";
                return [$text, $editTime, $startTime, $token];
        }
}

sub isDated
{
        my $image_text = shift;
        if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)       # Dated template
        {
                print "Dated tag $1 $2 $3\n";
                return 1;
        }
        # as of 6 October 2006">
        elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
        {
                print "Template borked; category $1 $2 $3\n";
                return 1;
        }
        elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/)    # Generic template
        {
                print "Generic tag\n";
                return 0;
        }
        else
        {
                print "No tag match\n";
                return 0;
        }
}

# Return the tag date if there is one, the upload date if not
# Returns in (day, month, year) format
sub getDate
{
        my $image_text = shift;
        if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)
        {
                print "Template date $1-$2-$3\n";
                return ($1, $2, $3);
        }
        elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
        {
                print "Category date $1-$2-$3\n";
                return ($1, $2, $3);
        }
        elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</)
        {
                print "Upload date $1-$2-$3\n";
#               return ($1, $2, $3);
                # For now, be conservative:
                my ($year, $month, $day) = Today();
                return ($day, Month_to_Text($month), $year);
        }
        else
        {
                print "No date\n";
                return (1, "January", 2006);
        }
}

# Return a list of upload dates
sub getUploadDates
{
        my @dates;
        my $image_text = shift;
        while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g)
        {
                push @dates, [$1, $2, $3];
        }
        return @dates;
}

sub getLastEditDate
{
        my ($day, $month, $year);
        my $image = shift;
        
        my @history = Pearle::parseHistory($image);
        (undef, $day, $month, $year) = @{$history[0]};
        
        return ($day, $month, $year);
}

# Find the most recent non-vandal, non-revert uploader
sub getUploader
{
        my $image_text = shift;
        my ($uploader, $dims, $bytes, $comment);
        my @uploaders;
        my $uploader_data;
        my $i = 0;
        
        # title="User:Jamie100">Jamie100</a> (<a href="/wiki/User_talk:Jamie100" title="User talk:Jamie100">Talk</a>) . . 424x216 (25800 bytes) <span class='comment'>(Reverted to earlier revision)</span></li>
        
#       while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a>\) \. \. (\d+x\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
        while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a> \| <a href="[^"]*" title="[^"]*">contribs<\/a>\) \. \. (\d+.+?\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
        {
                ($uploader, $dims, $bytes, $comment) = ($1, $2, $3, $4);
                $bytes =~ s/,//g;                                               # Remove commas to turn into a real number
                $comment = "" if(!defined($comment)); # Reduce warnings
                push @uploaders, [$uploader, $dims, $bytes, $comment];
                notelog("Uploader found: $uploader, $dims, $bytes, $comment\n");
                $i++;
                die "Too many uploaders: $i\n" if($i > 100);
        }
        my $max = scalar(@uploaders);
        print $max, "\n";
        for($i = 0; $i < $max; $i++)
        {
                $uploader = $uploaders[$i][0];
                if($uploaders[$i][3] =~ /Reverted/)
                {
                        $dims = $uploaders[$i][1];
                        $bytes = $uploaders[$i][2];
                        notelog("Revert found: $uploader, $dims, $bytes\n");
                        $i++;
                        while(($dims ne $uploaders[$i][1] or $bytes ne $uploaders[$i][2]) and $i < $max)
                        {
                                notelog("Reversion data: $uploaders[$i][1], $uploaders[$i][2], $i\n");
                                $uploader = $uploaders[$i][0];
                                $i++;
                        }
                }
                elsif($uploaders[$i][3] =~ /optimi(z|s)|adjust|tweak|scale|crop|change|resize/i)
                {
                        notelog("Optimize found.  Skipping.\n");
                }
                else
                {
                        notelog("Uploader: $uploader ($i)\n");
                        last;
                }
        }
        $uploader = undef if($i >= $max);
        
        print "Uploader: $uploader\n";
        return $uploader;
}

# See if the specified category exists, and if not, create it
sub checkImageCategory
{
        my $cat;
        my ($text, $editTime, $startTime, $token);
        $cat = "Category:Images with unknown source as of $_[0] $_[1] $_[2]";
        
        ($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
        if($text !~ /\[\[[Cc]ategory:[Ii]mages with unknown source/)
        {
                $text .= "\n[[Category:Images with unknown source| ]]\n";
                if($test_only)
                {
                        notelog("Would create category [[:$cat]]\n");
                }
                else
                {
                        Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
                        userwarnlog("*Created category [[:$cat]]\n");
                }
        }
}


sub loadNotificationList
{
        my $file = shift;
        my %notelist;
        my $i = 0;
        notelog("File: $file\n");
        open INFILE, "<", $file;
        while(<INFILE>)
        {
                $_ =~ s/\s*#.*$//g;
                chomp;
                $notelist{$_} = 1;
                $i++;
        }
        close INFILE;
        notelog("$i notifications loaded\n");
        return %notelist;
}

sub saveNotificationList
{
        return if($test_only);
        
        my $file = shift;
        my %notelist = @_;
        my $key;
        
        open OUTFILE, ">", $file;
        foreach $key (keys(%notelist))
        {
                print OUTFILE "$key\n";
        }
        close OUTFILE;
}

1;