User:OrphanBot/orphanbot.pl

From Wikipedia, the free encyclopedia

The source code for OrphanBot's image-removal task. Requires libBot.pl and libPearle2.pl.

#!/usr/bin/perl


# OrphanBot
#
# A bot to remove images from pages in preparation for deletion

use strict;
use warnings;

use Date::Calc qw(Delta_Days Decode_Month Month_to_Text Today);
use Getopt::Long;

require "libBot.pl";

my $permit_interruptions = 0;   # When true, a pending talk-page message stops the bot
my $last_image;                 # Title of the image most recently taken from the work list
my @last_images;
my $task = "";                  # Task name from --task; the dispatch below handles "source", "copyright", "replaceable" and "special"
my %users_notified;             # Users warned during this run: false = not yet, otherwise a count that grows with each further image
my %notifications;              # "user,image" pairs, used to ensure that no user is ever notified about an image twice
my %dont_notify;                # Users who must never be notified
my %image_whitelist;            # Images that must never be removed

# Per-task parameters; they are filled in once the task has been selected
my (
        $remove_type,
        $removal_comment,
        $removal_prefix,
        $template_match,
        $uploader_warning,
        $uploader_warning_summary,
        $write_remove_log,
        $limit_by_date,
        $test_only,
);

GetOptions("task=s" => \$task);

# Build the signature markup that is appended to messages left by the bot
sub sig
{
        my $signature = " ~~~~~";
        return $signature;
}


# No-error-checking removal routine, for special requests
#
# Fetches $page, replaces every piece of markup matching the image link with
# an HTML comment (or deletes it when no prefix is given) and posts the
# result, unless the script-level $test_only flag is set.
#
# Parameters:
#   $image           - image title; used in log messages and for the
#                      minimum-length sanity check
#   $page            - title of the page to edit
#   $image_regex     - regex fragment matching the image name inside [[...]]
#   $removal_prefix  - text placed in front of the removed markup inside the
#                      HTML comment; if undef the markup is deleted outright
#   $removal_comment - edit summary handed to Pearle::postPage
#
# Returns the number of substitutions made.  Returns 0 without posting when
# the page could not be fetched, is a redirect, contains no match, or
# contains more than 100 matches.
# Exits the program when the first match is shorter than the image title
# plus four characters, since that points to a broken regex.
sub RemoveImageSpecial
{
        my ($image, $page, $image_regex, $removal_prefix, $removal_comment) = @_;

        my ($text, $editTime, $startTime, $token);
        my $match2 = 0;         # Number of substitutions performed
        my $match_len = 0;      # Length of the first piece of matched markup

        # Fetch an article page
        ($text, $editTime, $startTime, $token) = Pearle::getPage($page);

        # Without page text there is nothing to remove, and posting would
        # save an empty page
        if(!defined($text))
        {
                return 0;
        }

        if($text =~ /#redirect/i)
        {
                Pearle::myLog("Redirect found for page [[$page]] (image [[:$image]])\n");
                userwarnlog("*Redirect found for page [[$page]] (image [[:$image]])\n");
                return 0;
        }

        # Remove the image
        my $regex3 = "(\\[\\[${image_regex}.*?(\\[\\[.*?\\]\\].*?|)+\\]\\][ \\t]*)";  # Regex to match images
        Pearle::myLog("Regex 3: $regex3\n");
        notelog("Regex 3: $regex3\n");

        if($text =~ /$regex3/)
        {
                # Remember the size of the first match for the sanity check below
                $match_len = length($1);
                if(defined($removal_prefix))
                {
                        $match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;
                }
                else
                {
                        $match2 = $text =~ s/$regex3//g;
                }

                if($match2)
                {
                        # A match shorter than "[[" . $image . "]]" cannot be a complete image link
                        if($match_len < (4 + length($image)))
                        {
                                notelog("*Short replacement of $match_len bytes in [[$page]]\n");
                                Pearle::myLog("Short replacement of $match_len bytes (min " . (length($image) + 4) . ") in [[$page]] ($match2 matches).  Exiting.\n");
                                Pearle::myLog("Text:\n$text\n");
                                exit;
                        }
                        if($match2 > 100)
                        {
                                Pearle::myLog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
                                notelog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
                                # Skip this page, as the log message says, instead of ending the run
                                return 0;
                        }
                        # A comment terminator directly followed by "]" suggests the match
                        # ended inside a nested link
                        if($text =~ /-->\]/)
                        {
                                Pearle::myLog("Possible bracket mixup in page [[$page]]\n");
                                userwarnlog(FixupLinks("*Possible bracket mixup in page [[$page]]\n"));
                        }
                }
        }

        print "Num: $match2 Len: $match_len\n";

        # Nothing was removed, so there is nothing to save
        if(!$match2)
        {
                return 0;
        }

        if($test_only)
        {
                notelog("Special removal for page\n");
        }
        else
        {
                Pearle::postPage($page, $editTime, $startTime, $token, $text, $removal_comment, "no");
        }

        return ($match2)
}


# Load the persisted lists: notifications already sent, users who must not be
# notified, and images that must not be removed
%notifications   = loadNotificationList("./orphanbot.note");
%dont_notify     = loadNotificationList("./orphanbot.whitelist");
%image_whitelist = loadNotificationList("./orphanbot.imagewhitelist");

# Set up the Pearle library and the bot library with the account to use
Pearle::init(<< INSERT BOT NAME HERE >>, << INSERT BOT PASSWORD HERE >>, "./orphanbot.log","./cookies.pearle.txt");
Pearle::config(nullOK => 1);
config(username => << INSERT BOT NAME HERE >>);

# Stop right away when the login fails
exit unless(Pearle::login());

# Main run: process every image selected by the --task option.
#
# For each image the loop
#   1. skips images that are whitelisted, hosted on Commons, missing their
#      file, or lacking the template the task expects;
#   2. warns the uploader when the task defines a warning;
#   3. removes the image from the pages that use it, provided the date parsed
#      from the image page is at least four days old or the task is not
#      limited by date;
#   4. records the removals (and, for some tasks, a dated template) on the
#      image description page.
# The notification list is written back to ./orphanbot.note at the end.
my $last_run = 0;
my @stbot_images;       # Description pages without a file whose popped history entry names STBotI

#while(1)
{
        $last_image =~ s/^[Ii]mage:// if(defined($last_image)); # Remove any prepended namespacing
        my @images;
        my $image;
        my $edited = 0;
        my $images_removed = 0;
        my $nolimits = 0;
#       my ($remove_type, $removal_comment, $removal_prefix, $template_match, $uploader_warning, $uploader_warning_summary, $write_remove_log, $limit_by_date); # Params for changing tasks

        @images = ();

        userwarnlog("=== Beginning set at " . time() . " for task '$task' ===\n");

        # Select the work list and the per-task parameters
        {

                if($task eq "source")
                {
                        my $cat = "Category:All images with unknown source";
                        @images = Pearle::getCategoryImages($cat);

                        $remove_type = 'normal';
                        $removal_comment = "Removing image with no source information.  Such images that are older than seven days may be deleted at any time.";
                        $removal_prefix = "Unsourced image removed:";
                        $template_match = "Unless the copyright status is provided|Unless this information is added to this page";
                        $uploader_warning = "{{subst:User:OrphanBot/nosource|";
                        $uploader_warning_summary = "You've uploaded an unsourced image";
                        $write_remove_log = 1;
                        $limit_by_date = 1;
                        $nolimits = 0;
                        $test_only = 0;
                }
                elsif($task eq "copyright")
                {
                        my $cat = "Category:All images with unknown copyright status";
                        @images = Pearle::getCategoryImages($cat);

                        $remove_type = 'normal';
                        $removal_comment = "Removing image with no copyright information.  Such images that are older than seven days may be deleted at any time.";
                        $removal_prefix = "Image with unknown copyright status removed:";
                        $template_match = "Unless the copyright status is provided|Unless this information is added to this page|This image was uploaded under good faith using the above tag";
                        $uploader_warning = "{{subst:User:OrphanBot/nocopyright|";
                        $uploader_warning_summary = "You've uploaded an image with unknown copyright";
                        $write_remove_log = 1;
                        $limit_by_date = 1;
                        $nolimits = 0;
                        $test_only = 0;
                }
                elsif($task eq 'replaceable')
                {
                        my $cat = "Category:All replaceable fair use images";
                        @images = Pearle::getCategoryImages($cat);

                        $remove_type = 'normal';
                        $removal_comment = "Removing replaceable fair-use image.";
                        $removal_prefix = "Replaceable fair-use image removed:";
                        $template_match = "for which a free image might reasonably be found";
                        $uploader_warning = undef;
                        $uploader_warning_summary = undef;

                        $write_remove_log = 1;
                        $limit_by_date = 1;
                        $nolimits = 0;
                        $test_only = 0;
                }
                elsif($task eq 'special')
                {
                        # Special requests
                        @images = Pearle::getLogArticles("upload", 408, 10, "Johnsatchmo");
                        @images = map {$_->[0]} @images;
                        notelog("Found " . scalar(@images) . " images\n");

                        $remove_type = 'normal';        # Use the standard removal system;
                        $removal_comment = "Removing image by request; see [[User talk:Carnildo#Bot help.3F]]" ;
                        $removal_prefix = "Image with questionable copyright removed:";
                        $template_match = undef;
                        $uploader_warning = undef;
                        $uploader_warning_summary = undef;
                        $write_remove_log = 1;
                        $limit_by_date = 0;
                        $nolimits = 0;
                        $test_only = 0;
                }
                else
                {
                        notelog("Unknown task: $task\n");
                        exit;
                }
        }

        if(scalar(@images) == 0)
        {
                print "Finished with category.\n";
                Pearle::myLog("Finished with category.\n");
                exit;
        }

image:  foreach $image (@images)
        {
                my $image_url;
                my $image_regex = $image;
                my $page;
                my @pages = ();
                my $page_remove_log;
                my ($day, $month, $year);
                # Fetch an image page
                my $query = "http://en.wikipedia.org/wiki/$image";
                my $image_text = Pearle::getURL(Pearle::escapeUrl($query));
                my $full_comment = "";

                $page_remove_log = '';
                $last_image = $image;

                if($permit_interruptions and DoIHaveMessages($image_text))
                {
                        print "Talkpage message found; exiting on image $image.\n";
                        Pearle::myLog("Talkpage message found; exiting on image $image.\n");
                        # Keep the notifications sent so far, so that a later run does not repeat them
                        saveNotificationList("./orphanbot.note", %notifications);
                        exit;
                }

                if($image_whitelist{$image})
                {
                        userwarnlog("*Image $image on whitelist\n");
                        next;
                }

                # Images from Commons
                if($image_text =~ /Wikimedia Commons<\/a>. The description on its /)
                {
                        userwarnlog("*Commons image [[:$image]] found\n");
                        next;
                }

                # The odd case of an image description page without an image.
                # $template_match is undef for some tasks; an empty pattern would silently
                # reuse the last successful regex, so it is only applied when defined.
                if($image_text =~ /<p>No file by this name exists; you can <a href=/ and defined($template_match) and $image_text =~ /$template_match/)
                {
                        userwarnlog("*Image [[:$image]] does not appear to exist.\n");
                        my @historylist = Pearle::parseHistory($image);
                        my $first_entry = pop @historylist;
                        if($first_entry->[4] eq 'STBotI')
                        {
                                push @stbot_images, $image;
                        }
                        next;
                }

                # Check for image existence
                if($image_text =~ /<p>No file by this name exists; you can <a href=/)
                {
                        Pearle::myLog("Image [[:$image]] has been deleted.\n");
                        notelog("Image [[:$image]] has been deleted.\n");
                        next;
                }

                # Check for image copyright tag
                if(defined($template_match) and ($image_text !~ /$template_match/))
                {
                        userwarnlog("*Image [[:$image]] in category does not have an appropriate template\n");
                        next;
                }


                # Task-specific reasons to leave an image alone
                if($task eq 'source')
                {
                        if($image_text =~ /I, the creator of this work/)
                        {
                                next image;
                        }
                        if($image_text =~ /title="Category:[^"]*[Ll]ogos"|title="Category:[^"]*[Cc]overs"/)
                        {
                                Pearle::myLog("*Image [[:$image]] with self-sourcing template found\n");
                                next image;
                        }
                }
                if($task eq 'replaceable')
                {
                        if($image_text =~ /It is disputed whether or not this image violates/)
                        {
                                Pearle::myLog("*Disputed replaceable fair-use image [[:$image]] found\n");
                                next image;
                        }
                }

                # Turn the title into a regex; only done when the title could be parsed
                my ($raw_image) = $image =~ /Image:(.*)/;
                $raw_image = MakeWikiRegex($raw_image) if(defined($raw_image));

                # Sanity check, done before the name is used to build the link regex
                if(!defined($raw_image) or $image !~ /$raw_image/)
                {
                        my $shown_name = defined($raw_image) ? $raw_image : '';
                        Pearle::myLog("Parse error on image [[:$image]] ($shown_name)\n");
                        userwarnlog("*Parse error on image [[:$image]] ($shown_name)\n");
                        next;
                }

                # Titles without a common picture extension use the wider pattern,
                # which also accepts a leading colon and the Media: prefix
                if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i)
                {
                        $image_regex = "[ _]*(:?[Ii]mage|[Mm]edia)[ _]*:[ _]*${raw_image}[ _]*";
                }
                else
                {
                        $image_regex = "[ _]*[Ii]mage[ _]*:[ _]*${raw_image}[ _]*";
                }
                Pearle::myLog("Image regex: $image_regex\n");
                notelog("Image regex: $image_regex\n");


                ($day, $month, $year) = getDate($image_text);

                # Notify the user
                my $uploader = getUploader($image_text);
                my $is_notified = 0;
                if(defined($uploader_warning) and defined($uploader))
                {
                        $is_notified = isNotified($image_text, $uploader, $image_regex, $image, \%notifications, \%dont_notify);
                }

                if(defined($uploader_warning) and 1 != $is_notified)
                {
                        if(defined($uploader))
                        {
                                # Only the first image of each uploader triggers a message in this run
                                if(!($users_notified{$uploader}))
                                {
                                        Pearle::myLog("Warning user $uploader\n");
                                        userwarnlog("${uploader_warning}${image}}}" . sig() . "\n", $uploader, $uploader_warning_summary, $is_notified);
                                        $notifications{"$uploader,$image"} = 1;
                                        $users_notified{$uploader} = 1;
                                }
                                else
                                {
                                        Pearle::myLog("User $uploader has already been warned repeatedly\n");
                                        $users_notified{$uploader} += 1;
                                }
                        }
                        else
                        {
                                Pearle::myLog("Could not determine uploader for [[:$image]]\n");
                        }
                }

                # Validate the parsed date once; a date with a missing part counts as invalid
                my $date_ok = 0;
                if(defined($day) and defined($month) and defined($year))
                {
                        $date_ok = Date::Calc::check_date($year, Decode_Month($month), $day);
                }

                if(!$date_ok)
                {
                        Pearle::myLog("Date error for image [[:$image]]\n");
                        userwarnlog("*Date error for image [[:$image]]\n");
                }

                # Remove the image when its date is at least four days back, or always
                # when the task is not limited by date
                if(($date_ok and (Delta_Days($year, Decode_Month($month), $day, Today() ) >= 4)) or !($limit_by_date))
                {
                        # Ignore any old removal logs
                        $image_text =~ s/<ol>.*?<\/ol>//gs;
                        if($nolimits)
                        {
                                @pages = GetFullPageList($image, $image_text);
                        }
                        else
                        {
                                @pages = GetPageList($image, $image_text);
                        }
                        if(scalar(@pages) == 0)
                        {
                                notelog("Image $image may already be orphaned\n");
                                Pearle::myLog("Image $image may already be orphaned\n");
                        }

                        if(scalar(@pages) > 3)
                        {
                                my $warningtext;
                                $warningtext = "*Found image [[:$image]] on " . scalar(@pages) . " content pages\n";
                                userwarnlog($warningtext);
                        }

                        if(scalar(@pages) > 0)
                        {
                                $images_removed += 1;
                        }

                        foreach $page (@pages)
                        {
                                print "Page for removal: $page\n";
                                # Link the first "image" in the edit summary to the image page
                                my $parsed_removal_comment = $removal_comment;
                                $parsed_removal_comment =~ s/image/[[:$image|image]]/;
                                if(defined($remove_type) and $remove_type eq 'special')
                                {
                                        RemoveImageSpecial($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment);
                                        Pearle::limit();
                                }
                                else
                                {
                                        if(my $hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment))       # Don't limit if we just touched the article
                                        {
                                                $page_remove_log .= "#[[$page]]\n";
                                                notelog("Removed image: $hits hits.\n");
                                                Pearle::myLog("Removed image $image from article $page\n");
                                                Pearle::limit();
                                        }
                                }
                                $edited = 1;
                        }
                }
                else
                {
                        Pearle::myLog("Recent image: notification only\n");
                        notelog("Recent image: notification only\n");
                }

                # Update image description page
                if($write_remove_log)
                {
                        my $edited_idp = 0;
                        # Log all removals on the image description page
                        my ($text, $editTime, $startTime, $token);

                        print "Will write\n";

                        ($text, $editTime, $startTime, $token) = Pearle::getPage($image);
                        if($task eq "source")
                        {
                                if(!isDated($image_text))
                                {
                                        my ($cur_y, $cur_m, $cur_d) = Today();
                                        $cur_m = Month_to_Text($cur_m);
                                        print "Changing date\n";
                                        my $new_template = "{{no source|month=$cur_m|day=$cur_d|year=$cur_y}}";
                                        # One pattern serves both the test and the replacement, so that every
                                        # template that is detected (Di-no source included) is really replaced
                                        my $old_template = qr/\{\{(?:[Nn]o source|[Nn]sn|[Nn]osource|[Uu]nverified|Di-no source).*?\}\}/;
                                        if($text =~ s/$old_template/$new_template/)
                                        {
                                                $full_comment .= "Changing nosource template format; ";
                                        }
                                        else
                                        {
                                                userwarnlog("*Template in [[:$image]] was probably subst'd\n");
                                                Pearle::myLog("Template was probably subst'd\n");
                                                $text .= "\n\n$new_template\n";
                                                $full_comment .= "Adding nosource template.  This may add a second template: the original was probably subst'd.  ";
                                        }
                                        $edited_idp = 1;
                                }
                        }
                        elsif($task eq "copyright")
                        {
                                if(!isDated($image_text))
                                {
                                        my ($cur_y, $cur_m, $cur_d) = Today();
                                        $cur_m = Month_to_Text($cur_m);
                                        print "Changing date\n";
                                        my $new_template = "{{no license|month=$cur_m|day=$cur_d|year=$cur_y}}";
                                        # Same pattern for the test and the replacement
                                        my $old_template = qr/\{\{(?:[Nn]o licen[cs]e|[Uu]nknown|[Nn]olicen[cs]e).*?\}\}/;
                                        if($text =~ s/$old_template/$new_template/)
                                        {
                                                $full_comment .= "Changing nolicense template format; ";
                                        }
                                        else
                                        {
                                                userwarnlog("*Template in [[:$image]] was probably subst'd\n");
                                                Pearle::myLog("Template was probably subst'd\n");
                                                $text .= "\n\n$new_template\n";
                                                $full_comment .= "Adding nolicense template.  This may add a second template: the original was probably subst'd.  ";
                                        }
                                        $edited_idp = 1;
                                }
                        }

                        # Append the list of pages the image was removed from
                        if($page_remove_log ne "")
                        {
                                $text .= "\n\nRemoved from the following pages:\n";
                                $text .= FixupLinks($page_remove_log);
                                $text .= "--~~~~\n";
                                $full_comment .= "Listing pages that the image has been removed from";
                                $edited_idp = 1;
                                print "Remove log\n";
                        }
                        if($edited_idp)
                        {
                                if($test_only)
                                {
                                        notelog("Edited image description page\n");
                                }
                                else
                                {
                                        Pearle::postPage($image, $editTime, $startTime, $token, $text, $full_comment, "no");
                                }
                        }
                }

                # Pause longer after an image whose pages were processed
                if($edited)
                {
                        print "Sleeping for 10 seconds\n";
                        sleep(10);
                }
                else
                {
                        print "Sleeping for two seconds\n";
                        sleep(2);
                }
                $edited = 0;
        }

        notelog("Saving notification list\n");
        saveNotificationList("./orphanbot.note", %notifications);
        Pearle::myLog("Finished with category.\n");
        notelog("Finished with category.\n");
}