User:FairuseBot/10cbot.pl

From Wikipedia, the free encyclopedia

#!/usr/bin/perl
#
# A bot to assist with NFCC #10c enforcement

use warnings;
use strict;

use Date::Calc qw(Month_to_Text Today);
use Data::Dumper;
use Array::Utils;

binmode STDOUT, ":utf8";

use libBot;

my $test = 0;
my $images_marked = 0;

my %common_pages;          # A list of pages that images have been found on, to see what's linked from templates and therefore shouldn't be included in disambig/redirect searches.

my $homedir = '';
my $permit_interruptions = 1;   # Allow talkpage messages to stop the bot?

# Common links that we don't need to chase for redirect or disambiguation testing
my @common_links = ("Copyright", "Copyright infringement", "Fair use", "Logo", "Trademark", "United States copyright law", "Wikimedia",
                    "Computer game", "Counterfeit", "Currency", "Free software", "Portable Network Graphics", "Poster", "Public domain",
                    "Screenshot", "Station identification", "United States Code", "U.S. state", "Video game", "Wikimedia Foundation",
                    "Work of the United States Government","Blu-ray Disc", "Comic book", "Comic strip", "DVD", "Free content",
                    "Videotape", "Webcomic", "Scouting", "Personality rights", "Screenshot", "2007", "2008", "Crown copyright",
                    "Scalable Vector Graphics", "BSD", "GPL", "JPEG", "Uploading and downloading", "Compression artifact",
                    "Vector graphics", "Uniform", "Magazine", "Coat of arms", "Crest (heraldry)", "Emblem", "Seal (device)", "Flag",
                    "Graphics Interchange Format");
#my $common_links = join "|", @common_links;

# A selection of common free-image categories
my @free_categories;

Pearle::init("FairuseBot", "", "$homedir/fairusebot.log","$homedir/10cbot-cookies.txt");
Pearle::config(nullOK => 1, printlevel => 4);
config(username => "FairuseBot");

my %notifications = loadNotificationList("$homedir/nfcc10c.note");
my %dont_notify = loadNotificationList("$homedir/nfcc10c.whitelist");

if(!Pearle::login())
{
        exit;
}

# Fetch a list of common free licenses
@free_categories = Pearle::getSubcategories("Category:Free images");
print "Free-license categories: ", join(",", @free_categories), "\n";

# Fetch the list of all non-free images
my @images;
my $imagenum;
if($test)
{
        @images = ('Image:Dummy315.png');
}
else
{
        open IMAGECOUNTFILE, "<", "$homedir/imagecount.txt";
        $imagenum = <IMAGECOUNTFILE>;
        chomp $imagenum;
        close IMAGECOUNTFILE;
        
        my $i = 0;
        open IMAGEFILE, "<:utf8", "$homedir/imagelist.txt";
        while($i < $imagenum)
        {
                my $dummy = <IMAGEFILE>;
                $i += 1;
        }
}

sub get_next_image
{
        if($test)
        {
                return shift @images;
        }
        else
        {
                my $image =  <IMAGEFILE>;
                chomp $image;
                return $image;
        }
}

# Process the list
IMAGE:
while(my $image = get_next_image())
{
        $imagenum += 1;
        Pearle::myLog(2, "Processing image $image ($imagenum)\n");
        # Fetch the image data
        my $image_data;
        if($test)
        {
                $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
                                                  plnamespace => [0, 2],                                                     # Links
                                                  rvprop => ['content'],                                                     # Article body
                                                  iiprop => ['user', 'comment', 'sha1'], iilimit => 500,                  # Upload history
                                                  meta => 'userinfo', uiprop => ['hasmsg'],                                       # Check for talkpage messages
                                                  list => 'imageusage', iutitle => $image, iunamespace => [0, 2], iulimit => 500);  # Image usage
        }
        else
        {
                $image_data = Pearle::APIQuery(titles => [$image], prop => ['links', 'revisions', 'imageinfo', 'categories'], 
                                                  plnamespace => [0],                                                                # Links
                                                  rvprop => ['content'],                                                     # Article body
                                                  iiprop => ['user', 'comment', 'sha1'], iilimit => 500,                  # Upload history
                                                  meta => 'userinfo', uiprop => ['hasmsg'],                                       # Check for talkpage messages
                                                  list => 'imageusage', iutitle => $image, iunamespace => [0], iulimit => 500);     # Image usage
        }
        
        if(!defined($image_data))
        {
                Pearle::myLog(1, "Server did not return an appropriate response on initial query.  Deferring image for later.\n");
                open DEFERLOG, ">>:utf8", "$homedir/deferred_images.txt";
                print DEFERLOG "$image\n";
                close DEFERLOG;
                next;
        }
        
        # Extract the list of pages where it's used.
        my @pages = GetPageList($image_data);
        my $num_pages = scalar(@pages);
        my @failed_pages;
        # Extract a list of pages this image links to.
        my @links = GetPageLinks($image_data);
        # Filter out common links
        my @new_links;
        foreach my $link (@links)
        {
                if(grep {$_ eq $link} @common_links)
                {
                        # Do nothing
                }
                else
                {
                        Pearle::myLog(4, "Found valid link $link\n");
                        push @new_links, $link;
                }
        }
        @links = @new_links;
        
        # Collect link statistics
        foreach my $page (@links)
        {
                $common_pages{$page} += 1;
                if($common_pages{$page} == 20)
                {
                        Pearle::myLog(2, "*Adding common page $page to list\n");
                        push @common_links, $page;
                }
        }
        
        # Extract the body text.
        my $text = GetPageText($image_data);
        # Extract the categories
        my @categories = GetPageCategories($image_data);
        
        # Check for interruptions
        if($permit_interruptions and DoIHaveMessages($image_data))
        {
                Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
                last;
        }
        
        # Sanity check: Does the image still exist? - tested
        if($image_data =~ /missing=""/)
        {
                Pearle::myLog(2, "*Image [[:$image]] has been deleted.\n");
                next;
        }
        # Sanity check: Is this still tagged as non-free? - tested
        if(!grep {$_ eq 'Category:All non-free media'} @categories)
        {
                Pearle::myLog(2, "*Image [[:$image]] is no longer marked as non-free.\n");
                next;
        }
        # Sanity check: Is this image already disputed?
        if((grep {$_ eq 'Category:All disputed non-free images'} @categories) or
           (grep {$_ eq 'Category:All images with no fair use rationale'} @categories))
        {
                Pearle::myLog(2, "*Image [[:$image]] is already marked for deletion.\n");
                next;
        }
        # Sanity check: Is the image used?
        if(scalar(@pages) == 0)
        {
                # Orphaned fairuse image
                Pearle::myLog(2, "*Image [[:$image]] is not used anywhere\n");
                if(!grep {$_ eq 'Category:All orphaned fairuse images'} @categories)
                {
                        my $text = "\n{{subst:orfud}}\n";
                        wikilog($image, $text, "Non-free image is not used in any article\n");
                
                        my $uploader = GetImageUploader($image_data);
                        if(defined($uploader))
                        {
                                if(!IsNotified($uploader, MakeWikiRegex($image), $image, \%notifications, \%dont_notify))
                                {
                                        $text = "\n{{subst:User:OrphanBot/orfud|$image}}\n";
                                        wikilog("User talk:$uploader", $text, "Image [[:$image]] is not used in any article");
                                }
                        }
                }
                next;
        }
        
        # Check: Is the image double-tagged as both free and non-free?
        if(Array::Utils::intersect( @categories, @free_categories))
        {
                Pearle::myLog(2, "Image [[:$image]] is double-tagged as free and non-free\n");
                botwarnlog("*Image [[:$image]] is double-tagged as free and non-free\n");
                next;
        }
        
        # Check for best-case compliance: each use has a matching direct link in the body of the text - tested
        Pearle::myLog(4, "Image is used in " . scalar(@pages) . " pages.\n");
        Pearle::myLog(4, "Image is used on " . join("|", @pages) . "\n");
        Pearle::myLog(4, "Image links to " . join("|", @links) . "\n");
        
        foreach my $page (@links)       # Filter out pages that match a link
        {
                @pages = grep {$_ ne $page} @pages;
        }
        Pearle::myLog(2, "Image failed best-case test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);
                
        # Check for liberal compliance:
        # For each use, remove it from the list if there's a case-insensitive match in the body text - tested
        foreach my $page (@pages)
        {
                my $page_match_regex = MakeWikiRegex($page);
                push @failed_pages, $page unless($text =~ /$page_match_regex/i);
        }
        @pages = @failed_pages;
        @failed_pages = ();
        
        Pearle::myLog(2, "Image failed text test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);
        
        # Chase redirects and disambig pages
        foreach my $link (@links)                                       # Foreach link
        {
                my @expanded_links = ChaseLinks($link);                 # Find out where we can get through it
                
                if(scalar(@expanded_links) == 0)
                {
                        # @expanded_links will contain at least one link.
                        # If it doesn't, an error occurred.
                        open DEFERLOG, ">>:utf8", "$homedir/deferred_images.txt";
                        print DEFERLOG "$image\n";
                        close DEFERLOG;
                        next IMAGE;
                }
                
                foreach my $page (@pages)                               # Foreach page
                {
                        if(grep {$page eq $_} @expanded_links)          # If we can get there by chasing this link
                        {
                                UpdateLink($image, $link, $page);       # Update the link to point to the page
                                Pearle::limit();
                        }
                        else
                        {
                                push @failed_pages, $page;              # Otherwise, keep looking for a way to get there
                        }
                }                                                       # Note that because we don't break out of the loop, it will take two
                @pages = @failed_pages;                                 # passes for the bot to do the right thing in the unlikely case that
                @failed_pages = ();                                     # we can get to two pages by expanding one link.
                
                last if(scalar(@pages) == 0);                           # If we've found all the pages, exit the loop.
        }
        
        Pearle::myLog(2, "Image failed link-chasing test for " . scalar(@pages) . " pages.\n");
        next if(scalar(@pages) == 0);
        
        # Test for compliance
        # Over-use (some compliant, some non-compliant): Remove from any non-compliant articles, OrphanBot-style.  Leave a note on the article talk page.
        if(scalar(@pages) > 0 and $num_pages > scalar(@pages))
        {
                Pearle::myLog(2, "Image $image failed on " . scalar(@pages) . " pages.\n");
                my ($cur_y, $cur_m, $cur_d) = Today(1);         # Today in GMT.  Generated each time because this bot does *long* runs.
                
                open IMAGEFAILLOG, ">>:utf8", "$homedir/partial_failures_${cur_y}-${cur_m}-${cur_d}.txt";
                print IMAGEFAILLOG "$image\n";
                close IMAGEFAILLOG;
                
                $cur_m = Month_to_Text($cur_m);
                my $text = "\n{{Di-missing article links|day=$cur_d|month=$cur_m|year=$cur_y|articles=\n";
                foreach my $page (@pages)
                {
                        $text .= "*$page\n";
                }
                $text .= "}}\n";
                wikilog($image, $text, "Image is not compliant with [[WP:NFCC|the non-free content rules]]");
                
                foreach my $page (@pages)
                {
                        if(!IsPageNotified("Talk:$page", MakeWikiRegex($image), $image, \%notifications, \%dont_notify))
                        {
                                NotifyPage("Talk:$page", $image);
                                $notifications{"Talk:$page,$image"} = 1;
                        }
                }
                
                $images_marked += 1;
                Pearle::limit(10);
        }
        # Non-compliance (all uses are non-compliant): Mark for deletion, notify the uploader.
        elsif(scalar(@pages) > 0 and $num_pages == scalar(@pages))
        {
                Pearle::myLog(2, "Image $image failed on all pages.\n");
                my ($cur_y, $cur_m, $cur_d) = Today(1);         # Today in GMT.  Generated each time because this bot does *long* runs.
                $cur_m = Month_to_Text($cur_m);
                $text = "\n{{Di-missing article links|day=$cur_d|month=$cur_m|year=$cur_y|articles=\n";
                foreach my $page (@pages)
                {
                        $text .= "*$page\n";
                }
                $text .= "}}\n";
                wikilog($image, $text, "Image is not compliant with [[WP:NFCC|the non-free content rules]]");
                
                my $uploader = GetImageUploader($image_data);
                if(defined($uploader))
                {
                        if(!IsNotified($uploader, MakeWikiRegex($image), $image, \%notifications, \%dont_notify))
                        {
                                NotifyUser($uploader, $image);
                                $notifications{"$uploader,$image"} = 1;
                        }
                }
                
                $images_marked += 1;
                Pearle::limit(10);
        }
        # No problems
        else
        {
                Pearle::myLog(2, "Image $image is okay.\n");
        }
        
        if($images_marked >= 1000)
        {
                Pearle::myLog(0, "Edit limit reached; exiting\n");
                last;
        }
                
        if(!$test)
        {
                open IMAGENUMFILE, ">", "$homedir/imagecount.txt";
                print IMAGENUMFILE $imagenum;
                close IMAGENUMFILE;
        
                Pearle::myLog(4, "Saving notification list\n");
                saveNotificationList("$homedir/nfcc10c.note", %notifications);
        }
}

$imagenum += 1;
if(!$test)
{
        open IMAGENUMFILE, ">", "$homedir/imagecount.txt";
        print IMAGENUMFILE $imagenum;
        close IMAGENUMFILE;

        Pearle::myLog(4, "Saving notification list\n");
        saveNotificationList("$homedir/nfcc10c.note", %notifications);
}


########## Support functions #############################################
sub NotifyUser
{
        my $uploader = shift;
        my $image = shift;
        
        my $uploader_page = "User talk:$uploader";
        my $wikipage = Pearle::getPage( $uploader_page );
        my $text = $wikipage->getEditableText();
        $text = $wikipage->unfoldComments($text);
        
        if($text =~ /^#redirect/i)
        {
                botwarnlog("*User talk page [[User talk:$uploader]] is a redirect\n");
                return;
        }
        
        my $summary = "Image [[:$image]] is not compliant with [[WP:NFCC|the non-free content rules]]";
        
        if($text =~ /<!-- Additional 10c images go here -->/)
        {
                Pearle::myLog(4, "Adding notification to list\n");
                # Add an image to the list
                $text =~ s/(<!-- Additional 10c images go here -->)/${1}\n*[[:$image]]/;
        }
        elsif($text =~ /<!-- Additional 10c list header goes here -->/)
        {
                Pearle::myLog(4, "Adding notification list\n");
                # Add an "additional images" list
                $text =~ s/<!-- Additional 10c list header goes here -->/{{subst:User:OrphanBot\/nfcc10c-more}}\n*[[:$image]]/;
        }
        else
        {
                Pearle::myLog(4, "Adding notification\n");
                $text .= "\n{{subst:User:OrphanBot/nfcc10c|$image}} --~~~~~\n";
        }
        $wikipage->setEditableText($text);
        Pearle::postPage($wikipage, $summary, 0);
}

sub NotifyPage
{
        my $uploader = shift;
        my $image = shift;
        
        my $uploader_page = "$uploader";
        my $wikipage = Pearle::getPage( $uploader_page );
        my $text = $wikipage->getEditableText();
        $text = $wikipage->unfoldComments($text);
        
        if($text =~ /^#redirect/i)
        {
                botwarnlog("*Talk page [[$uploader_page]] is a redirect\n");
                return;
        }
        
        my $summary = "Image [[:$image]] in this article is not compliant with [[WP:NFCC|the non-free content rules]]";
        
        if($text =~ /<!-- Additional 10c images go here -->/)
        {
                Pearle::myLog(4, "Adding notification to list\n");
                # Add an image to the list
                $text =~ s/(<!-- Additional 10c images go here -->)/${1}\n*[[:$image]]/;
        }
        elsif($text =~ /<!-- Additional 10c list header goes here -->/)
        {
                Pearle::myLog(4, "Adding notification list\n");
                # Add an "additional images" list
                $text =~ s/<!-- Additional 10c list header goes here -->/{{subst:User:OrphanBot\/nfcc10c-more}}\n*[[:      $image]]/;
        }
        else
        {
                Pearle::myLog(4, "Adding notification\n");
                $text .= "\n{{subst:User:OrphanBot/nfcc10c-article|$image}} --~~~~~\n";
        }
        $wikipage->setEditableText($text);
        Pearle::postPage($wikipage, $summary, 0);
}

sub ChaseLinks
{
        my $link = shift;
        my @candidates = ($link);
        my @destinations;
        
        my $iterations = 0;
        my $done = 0;
        while(!$done)
        {
                my @new_candidates = ();
                my $page_data = Pearle::APIQuery(titles => \@candidates, prop => ['links', 'templates'], 
                                                  redirects => 1,
                                                  tlnamespace => [10],                               # Templates
                                                  plnamespace => [0, 2]);                    # Links
                if(!defined($page_data))
                {
                        Pearle::myLog(1, "Server did not return an appropriate response on chase query.  Deferring image for later.\n");
                        return ();
                }
                
                my $parsed_xml = Pearle::getXMLParser()->XMLin($page_data, ForceArray => ['r', 'page', 'pl', 'tl'] );
                Pearle::myLog(4, Dumper($parsed_xml));
                
                # Push all redirect targets to @destinations, because we've already got disambig data, and we don't want to chase redirect loops.
                my $redirects = $parsed_xml->{query}->{redirects}->{r};
                push @destinations, map {$_->{to}} @{$redirects};
                
                # Push all disambig targets to @destinations and to @new_candidates, because we want to do redirect testing on them.
                my $pages = $parsed_xml->{query}->{pages}->{page};
                foreach my $page (@{$pages})
                {
                        if(grep {$_->{title} eq 'Template:Disambig'} @{$page->{templates}->{tl}})
                        {
                                # Disambiguation page.  For each target, if we don't have it already, push it to @destinations and @new_candidates.
                                foreach my $target (@{$page->{links}->{pl}})
                                {
                                        if(grep {$_ eq $target->{title}} @destinations)
                                        {
                                                # Do nothing
                                        }
                                        else
                                        {
                                                push @destinations, $target->{title};
                                                push @new_candidates, $target->{title};
                                        }
                                }
                        }
                        else
                        {
                                # Ordinary page.
                                push @destinations, $page->{title} unless(grep {$_ eq $page->{title}} @destinations);
                        }
                }
                
                if(scalar(@new_candidates) == 0)
                {
                        $done = 1;
                }
                if($iterations > 10)
                {
                        Pearle::myLog(1, "Iteration limit exceeded\n");
                        $done = 1;
                }
                
                $iterations += 1;
                @candidates = @new_candidates;
        }
        Pearle::myLog(4, join(",", @destinations) . "\n");
        return @destinations;
}