User:Polbot/source/Find altnames.pl

From Wikipedia, the free encyclopedia

use strict;
use Perlwikipedia;

# Create the wiki session and identify the bot through its user-agent string.
my $pw = Perlwikipedia->new();
#$pw->{debug} = 1;
$pw->{mech}->agent('Bot/WP/EN/Quadell/polbot');

# Log in. This script treats a return value of 0 from login() as success
# and aborts on anything else.
print "\nStarting polbot, logging in.\n";
my $login_status = $pw->login('bot name', 'bot password');
if ($login_status ne 0) {
        die "I can't log in.";
}

# Regex character class (kept as a string so it can be interpolated into
# the name patterns) for one character allowed inside a name token.
my $namechar = "[A-Za-z'._-]";

# Per-article data, keyed by article name. The parsing loop fills in the
# 'job', 'main' and 'alt' entries for each article.
my %pol_names;

# Fetch the todo page and break it into individual lines.
print "\nReading todo file\n";
my $todo_list = $pw->get_text('User:Polbot/altnames/todo');
my @lines = split(/\n/, $todo_list);

#my $wiki_text = '';

# Building blocks for the name-shape patterns below: a capitalised name
# token, and a family-name token that may also start with a lower-case
# "d" or "v" (e.g. "de_Gaulle", "van_Buren" after particle joining).
my $given  = qr/[A-Z]$namechar+/;
my $family = qr/[A-Zdv]$namechar+/;

# Add every given name to a set stored as a hash ref (key == value).
my $remember = sub {
        my ($set, @names) = @_;
        $set->{$_} = $_ for @names;
};

# Nickname notations: William S. "Bill" Fulton, William S. 'Bill' Fulton,
# William S. (Bill) Fulton.  Each captures (before, nickname, after).
my @nickname_patterns = (
        qr/^(.+) "(.+)" (.+)$/,
        qr/^(.+) '(.+)' (.+)$/,
        qr/^(.+) \((.+)\) (.+)$/,
);

foreach my $line (@lines) {
        # Only list entries of the form "* [[Article name]]*job description"
        # are processed; every other line is ignored.
        next unless $line =~ m/^\*\s*\[\[([^]]*)\]\]\*(.*)$/;
        my ($article_name, $job_description) = ($1, $2);

        print "Finding altnames for '$article_name'\n";
        #$wiki_text = wikiread($article_name, $pw);

        # Populate the hash.
        my $entry = $pol_names{$article_name} ||= {};
        $entry->{'job'} = $job_description;
        my $main_set = $entry->{'main'} ||= {};
        my $alt_set  = $entry->{'alt'}  ||= {};

        # First, put in its own name (in normalised form).
        {
                my $main_name = $article_name;

                # Format suffixes properly: "Smith Jr." -> "Smith, Jr.",
                # "Smith, III" -> "Smith III".
                $main_name =~ s/([^,]) (Jr\.|Sr\.)$/$1, $2/;
                $main_name =~ s/, (II|III|IV)$/ $1/;

                # Change "Van Eyk" into "Van_Eyk" so the surname is a single
                # token for the shape patterns below.  Applied globally so
                # that multi-particle surnames are joined completely
                # ("de la Cruz" -> "de_la_Cruz"); a single non-global pass
                # would stop after "de_la" and leave "Cruz" detached.
                $main_name =~ s/\b(Van|van|De|de|La|la|St\.) /$1_/g;

                # Change "Jones III" to "Jones_III", and also keep the form
                # without the numeral as a main name.
                if ($main_name =~ m/^(.*) (II|III|IV)$/) {
                        my $without_numeral = $1;
                        $remember->($main_set, $without_numeral);
                        $main_name =~ s/ (II|III|IV)$/_$1/;
                }

                $remember->($main_set, $main_name);
        }

        # Form altnames from these
        foreach my $main_name_i (keys %{$main_set}) {
                my $main_name = $main_name_i;
                $remember->($alt_set, $main_name);

                # e.g. [[John Smith (politician)]] -> "John Smith"
                if ($main_name =~ s/ \(.+\)$//) {
                        $remember->($alt_set, $main_name);
                }

                # Strip an embedded nickname, and also offer
                # "<nickname> <rest of name>" as an alternative.
                foreach my $nickname_re (@nickname_patterns) {
                        if ($main_name =~ s/$nickname_re/$1 $3/) {
                                $remember->($alt_set, $main_name, "$2 $3");
                        }
                }

                # Derive shorter/longer variants from the shape of the name.
                # The first matching shape wins.
                if ($main_name =~ m/^[A-Z]\.( )?[A-Z]\. $family$/) {
                        # e.g. C. S. Lewis
                        # Do nothing
                } elsif ($main_name =~ m/^($given) ([A-Z]\.)([A-Z]\.) ($family)$/) {
                        # e.g. William S.P. Fulton
                        $remember->($alt_set, "$1 $2 $3 $4", "$1 $4");
                } elsif ($main_name =~ m/^($given) ([A-Z]\.) ([A-Z]\.) ($family)$/) {
                        # e.g. William S. P. Fulton
                        $remember->($alt_set, "$1 $2$3 $4", "$1 $4");
                } elsif ($main_name =~ m/^($given) [A-Z]\. ($family)$/) {
                        # e.g. William S. Fulton
                        $remember->($alt_set, "$1 $2");
                } elsif ($main_name =~ m/^[A-Z]\. ($given) ($family)$/) {
                        # e.g. C. Michael Thompson
                        $remember->($alt_set, "$1 $2");
                } elsif ($main_name =~ m/^($given) $given ($family)$/) {
                        # e.g. William Savin Fulton
                        # (the middle-initial variant is intentionally not generated)
                        $remember->($alt_set, "$1 $2");
                } elsif ($main_name =~ m/^($given) $family $family ($family)$/) {
                        # e.g. William Savin Edward Fulton
                        $remember->($alt_set, "$1 $2");
                } elsif ($main_name =~ m/^($given) ($given) ($family), (Jr\.|Sr\.)$/) {
                        # e.g. William Savin Fulton, Jr.
                        # (the middle-initial variants are intentionally not generated)
                        $remember->($alt_set, "$1 $2 $3", "$1 $3");
                } elsif ($main_name =~ m/^($given) ([A-Z]\.) ($family), (Jr\.|Sr\.)$/) {
                        # e.g. William S. Fulton, Jr.
                        $remember->($alt_set, "$1 $2 $3", "$1 $3");
                } elsif ($main_name =~ m/^($given) ($family), (Jr\.|Sr\.)$/) {
                        # e.g. William Fulton, Jr.
                        $remember->($alt_set, "$1 $2");
                }
        }
}


print "\n\nDone reading.\n\n";

# Convert to the inprocess hash: article name => one wiki list line of the
# form "* [[Article]]|[[Alt 1]]|[[Alt 2]]*job".  Articles for which no
# alternative name differs from the article name are left out.
#
# The hash must start out empty.  Initialising it with "{}" would store a
# stringified hash reference as a key with an undefined value, which ends
# up as a stray blank line in the generated list.
my %inprocess_names;

for my $article_name (sort keys %pol_names) {
        my @altnames = grep { $_ ne $article_name }
                       sort keys %{$pol_names{$article_name}{'alt'}};
        next unless @altnames;

        my $links = join('', map { "|[[$_]]" } @altnames);
        $inprocess_names{$article_name} =
                "* [[$article_name]]$links*" . $pol_names{$article_name}{'job'};
}

# Update my lists

print "Writing in-process list\n";

# One line per article, in sorted article-name order.
my $wiki_code = join('', map { $inprocess_names{$_} . "\n" } sort keys %inprocess_names);

$pw->edit('User:Polbot/altnames/inprocess', $wiki_code, "Auto-updating based on input at todo list");

# Keep a local copy of what was written to the wiki.  Failures to write the
# copy are reported instead of being silently ignored.
my $backup_file = 'inprocess.txt';
open(my $outfile, '>', $backup_file)
        or die "Can't open $backup_file for writing: $!";
print {$outfile} "$wiki_code\n";
close($outfile)
        or die "Can't close $backup_file: $!";

print "Done!\n";

# Fetch the text of a wiki page, retrying when the read fails.
#
# Arguments: the article title, then the connection object (anything that
# provides a get_text($title) method).
# A result equal to the string "0" is treated as a failed read.  After a
# failure the read is retried up to five times, sleeping 1, 2, ... 5
# seconds before the successive retries.
# Returns the page text, or '' if the last retry still failed.
sub wikiread {
        my ($article, $connection) = @_;

        my $text = $connection->get_text($article);

        for my $attempt (1 .. 5) {
                # Stop as soon as a read has succeeded.
                return $text if $text ne "0";

                sleep $attempt;
                print "   retry. . .\n";
                $text = $connection->get_text($article);
        }

        # Out of retries: give up with an empty string if still failing.
        return $text eq "0" ? '' : $text;
}