User:FairuseBot/10c-removal.pl

From Wikipedia, the free encyclopedia

#!/usr/bin/perl


# 10c-removal
#
# A bot to remove NFCC #10c-incompliant images from pages

use strict;
use warnings;

use Date::Calc;
use Data::Dumper;

use libBot;

my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark", "United States copyright law", "Wikimedia",
                    "Computer game", "Counterfeit", "Currency", "Free software", "Portable Network Graphics", "Poster", "Public domain",
                    "Screenshot", "Station identification", "United States Code", "U.S. state", "Video game", "Wikimedia Foundation",
                    "Work of the United States Government");
my $common_links = join "|", @common_links;

my $test = 0;

my $homedir = '/home/mark/Desktop/wikibots/10cbot';
my $permit_interruptions = 1;   # Allow talkpage messages to stop the bot?

Pearle::init("FairuseBot", "", "$homedir/removebot.log","$homedir/removebot-cookies.txt");
Pearle::config(nullOK => 1, printlevel => 4);
config(username => "FairuseBot");

if(!Pearle::login())
{
        exit;
}

# Check for a running copy
if(-e "$homedir/pid")
{
        # Possible other copy.  Compare PIDs
        open PIDFILE, "<", "$homedir/pid";
        my $pid = <PIDFILE>;
        close PIDFILE;

        my $psresult = `ps -p $pid`;
        if($psresult =~ /10c-removal.pl/)
        {
                botwarnlog("*Previous run is taking longer than normal\n");
                exit;
        }
}

open PIDFILE, ">", "$homedir/pid";
print PIDFILE $$;
close PIDFILE;

my $total_images = 0;
my @logs;

{
        my @images;
        my $image;
        my $images_removed = 0;
        
        @images = ();
        
        Pearle::myLog(2, "Beginning set at " . time() . "\n");

        # Get the log
        if($test)
        {
                @images = ('Image:Dummy316.png');
        }
        else
        {
                my $CURRENT_DIR;
                my @files;
                # Scan the directory for log files
                opendir($CURRENT_DIR, $homedir) or (print "Failed: $!\n" and return);
                @files = readdir $CURRENT_DIR or (print "Failed: $!\n" and return);
                closedir $CURRENT_DIR;
                @files = grep {/^partial_failures.*txt$/} @files;
                foreach my $file (@files)
                {
                        my ($year, $month, $day) = $file =~ /_(\d{4})-(\d{1,2})-(\d{1,2})/;
                        if(Date::Calc::Delta_Days( $year, $month, $day, (Date::Calc::Today(1))) > 5)
                        {
                                open INFILE, "<:utf8", "$homedir/$file";
                                my @new_images = <INFILE>;
                                close INFILE;
                                chomp @new_images;
                                push @images, @new_images;
                                
                                push @logs, "$homedir/$file";
                        }
                }
        }
                
        Pearle::myLog(3, join("\n", @images));
        Pearle::myLog(3, "\n" . scalar(@images) . " images found\n");
        
        if(scalar(@images) == 0)
        {
                Pearle::myLog(1, "*No images to remove\n");
        }

        foreach $image (@images)
        {
                my $image_url;
                my $image_regex = $image;
                my $page;
                
                my $full_comment = "";
                my $removal_prefix = "Image with inadequate rationale removed:";
                my $removal_comment = "Removing image with inadequate [[WP:NFCC|rationale]]";
                
                # Fetch image info
                Pearle::myLog(2, "Processing image $image\n");
                # Fetch the image data
                my $image_data;
                if($test)
                {
                        $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
                                                        plnamespace => [0, 2],                                                       # Links
                                                        rvprop => ['content'],                                                       # Article body
                                                        iiprop => ['user', 'comment', 'sha1'], iilimit => 500,                    # Upload history
                                                        meta => 'userinfo', uiprop => ['hasmsg'],                                         # Check for talkpage messages
                                                        list => 'imageusage', iutitle => $image, iunamespace => [0, 2], iulimit => 500);    # Image usage
                }
                else
                {
                        $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
                                                        plnamespace => [0],                                                          # Links
                                                        rvprop => ['content'],                                                       # Article body
                                                        iiprop => ['user', 'comment', 'sha1'], iilimit => 500,                    # Upload history
                                                        meta => 'userinfo', uiprop => ['hasmsg'],                                         # Check for talkpage messages
                                                        list => 'imageusage', iutitle => $image, iunamespace => [0], iulimit => 500);       # Image usage
                }
                
                if(!defined($image_data))
                {
                        Pearle::myLog(0, "Server did not return an appropriate response.  Exiting.\n");
                        last;
                }
        
                # Extract the list of pages where it's used.
                my @pages = GetPageList($image_data);
                my $num_pages = scalar(@pages);
                my @failed_pages;
                # Extract the categories
                my @categories = GetPageCategories($image_data);
                # Extract a list of pages this image links to.
                my @links = GetPageLinks($image_data);
                # Filter out common links
                @links = grep {$_ !~ /^($common_links)$/} @links;

                if($permit_interruptions and DoIHaveMessages($image_data))
                {
                        Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
                        exit;
                }
                
                # Sanity check: Does the image still exist?
                if($image_data =~ /missing=""/)
                {
                        Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n");
                        next;
                }
                # Sanity check: Is this still tagged as non-free?
                if(!grep {$_ eq 'Category:All non-free media'} @categories)
                {
                        Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n");
                        next;
                }
                # Sanity check: Is the image used?
                if(scalar(@pages) == 0)
                {
                        # Orphaned fairuse image
                        Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n");
                        # Is this image already disputed?
                        if(grep {$_ eq 'Category:All disputed non-free images'} @categories)
                        {
                                Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n");
                        }
                        else
                        {
                                if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories)
                                {
                                        my $text = "\n{{subst:orfud}}\n";
                                        wikilog($image, $text, "Non-free image is not used in any article\n");
                                }
                        }
                        next;
                }
                # Sanity check: Is the image still tagged as disputed?
                if(!grep {$_ eq 'Category:All disputed non-free images'} @categories)
                {
                        Pearle::myLog(2, "*Image [[:$image]] is not marked for deletion.\n");
                        next;
                }
                
                # Remove the NFCC-failure tag and the list of pages
                # Blindly removing the tag is safe:
                # 1) If the program fails, 10cbot will pick the image up on its next pass
                # 2) If the image is orphaned, or will be orphaned by removal (unlikely), 10cbot or another bot will pick it up
                # 3) If the image is non-compliant on all pages, 10cbot will pick it up on the next pass
                my $wikipage = Pearle::getPage($image);
                my $text = $wikipage->getEditableText();
                $text =~ s/\x03\x44i-missing article links[^\x04]*\x04//s;
                Pearle::myLog(4, "Text after processing:\n$text\n");
                $wikipage->setEditableText($text);
                Pearle::postPage($wikipage, "Removing tag", 0);
                Pearle::limit();
                
                # Build the image-matching regex
                my ($raw_image) = $image =~ /Image:(.*)/;
                $raw_image = MakeWikiRegex($raw_image);
                if($image !~ /(\.jpg|\.jpeg|\.png|\.gif|\.svg)$/i)
                {
                        $image_regex = "[ _]*(:?[Ii][Mm][Aa][Gg][Ee]|[Mm][Ee][Dd][Ii][Aa])[ _]*:[ _]*${raw_image}[ _]*";
                        Pearle::myLog(2, "*Non-image media file [[:$image]] found.\n");
                        next;                   # Non-image media are too hard to work with
                }
                else
                {
                        $image_regex = "[ _]*[Ii][Mm][Aa][Gg][Ee][ _]*:[ _]*${raw_image}[ _]*";
                }
                                
                # Sanity check
                if(!defined($raw_image) or $image !~ /$raw_image/)
                {
                        botwarnlog("*Parse error on image [[:$image]] ($raw_image)\n");
                        next;
                }
                Pearle::myLog(3, "Image regex: $image_regex\n");
                
                # Check for best-case compliance: each use has a matching direct link in the body of the text - tested
                Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n");
                Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n");
                Pearle::myLog(4, "Image links to " . join("|", @links) . "\n");
                
                foreach my $page (@links)       # Filter out pages that match a link
                {
                        @pages = grep {$_ ne $page} @pages;
                }
                Pearle::myLog(4, "Image failed best-case test for " . scalar(@pages) . " pages.\n");
                next if(scalar(@pages) == 0);
                        
                # Check for liberal compliance:
                # For each use, remove it from the list if there's a case-insensitive match in the body text - tested
                foreach my $page (@pages)
                {
                        my $page_match_regex = MakeWikiRegex($page);
                        push @failed_pages, $page unless($text =~ /$page_match_regex/i);
                }
                @pages = @failed_pages;
                @failed_pages = ();
                
                Pearle::myLog(4, "Image failed text test for " . scalar(@pages) . " pages.\n");
                next if(scalar(@pages) == 0);
                
                # Check for strict compliance:
                # For each link, chase redirects - tested
                if(scalar(@links) > 0)
                {
                        my $page_data = Pearle::APIQuery(titles => \@links, redirects => 1);
                        my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data);
                        my @redirects;
                        Pearle::myLog(4, Dumper($parsed_xml));
                        if(exists($parsed_xml->{query}->{redirects}->{r}) and defined($parsed_xml->{query}->{redirects}->{r}))
                        {
                                if(ref($parsed_xml->{query}->{redirects}->{r}) eq 'ARRAY')
                                {
                                        @redirects = @{$parsed_xml->{query}->{redirects}->{r}};
                                }
                                else
                                {
                                        @redirects = ($parsed_xml->{query}->{redirects}->{r});
                                }
                        }
                        foreach my $page (@pages)
                        {
                                my $matched = 0;
                                foreach my $redirect (@redirects)
                                {
                                        if($redirect->{to} eq $page)
                                        {
                                                # We can get there by a redirect
                                                UpdateLink($image, $redirect->{from}, $page);
                                                Pearle::limit();
                                                $matched = 1;
                                                last;
                                        }
                                }
                                if(!$matched)
                                {
                                        push @failed_pages, $page;
                                }
                        }
                        @pages = @failed_pages;
                        @failed_pages = ();
                }
                
                Pearle::myLog(4, "Image failed redirect test for " . scalar(@pages) . " pages.\n");
                next if(scalar(@pages) == 0);
        
                # Check for near-compliance:
                # For each use, if we can get to it by means of a disambiguation page, update the link - tested
                foreach my $page (@links)
                {
                        # Fetch the page text and page links
                        my $page_data = Pearle::APIQuery(titles => [$page], prop => ['links', 'revisions'], 
                                                plnamespace => [2],                                                  # Links
                                                rvprop => ['content']);                                      # Article body
                        # If the page text indicates disambig, see if any of the links is one we're looking for
                        my $page_text = GetPageText($page_data);
                        if($page_text =~ /{{disambig}}/i)
                        {
                                my @page_links = GetPageLinks($page_data);
                                foreach my $disambig_link (@page_links)
                                {
                                        if(grep {$_ eq $disambig_link} @pages)
                                        {
                                                # It's a match.  Remove it from the list
                                                @pages = grep {$_ ne $disambig_link} @pages;
                                                # Post to the page
                                                my $success = UpdateLink($image, $page, $disambig_link);
                                                if(!$success)
                                                {
                                                        botwarnlog("*Failed to update disambiguation link for [[:$image]] from [[$page]] to [[$disambig_link]]\n");
                                                }
                                                Pearle::limit();
                                        }
                                }
                        }
                }
                
                Pearle::myLog(4, "Image failed disambiguation test for " . scalar(@pages) . " pages.\n");
                next if(scalar(@pages) == 0);
                
                # Test for compliance
                # Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style.  Leave a note on the article talk page.
                if(scalar(@pages) > 0 and $num_pages > scalar(@pages))
                {
                        Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n");
                        
                        my $parsed_removal_comment = $removal_comment;
                        $parsed_removal_comment =~ s/image/[[:$image|image]]/;
                        foreach $page (@pages)
                        {
                                my $hits = 0;
                                notelog("Page for removal: $page\n");
                                if($hits = RemoveImageFromPage($image, $page, $image_regex, $removal_prefix, $parsed_removal_comment))  # Don't limit if we just touched the article
                                {
                                        Pearle::myLog(2, "Removed image $image from article $page ($hits times)\n");
                                        Pearle::limit();
                                }
                                $images_removed += $hits;
                        }
                }
                elsif(scalar(@pages) > 0)
                {
                        # Fully-non-compliant.  Should never occur, but if it does, let 10cbot pick it up on the next pass.
                        Pearle::myLog(2, "Image $image failed on all pages\n");
                }
                else
                {
                        Pearle::myLog(2, "Image $image is now fully-compliant\n");
                }
        }
        Pearle::myLog(2, "Finished with set.  Removed $images_removed images.\n");
        $total_images += $images_removed;
}

# Remove the processed logs
unlink @logs;

unlink "$homedir/pid"