User:Polbot/source/Polbot.pm

From Wikipedia, the free encyclopedia

package Polbot;

use strict;
use LWP::UserAgent;

# Here is an example for this sub's usage:
#
# my $url1 = 'http://bioguide.congress.gov/scripts/biodisplay.pl?index=H000671'; 
# print Polbot::bio2wiki($url1);

sub bio2wiki {
        my $url = shift;

        # Constants
        my $pronoun = 'He'; #Unfortunately, there is no way to tell if the person is male or female from the bioguide. I hate assuming male here, but what can you do?
        my $preps = 'in|near|to|at|of';
        my $months = 'January|February|March|April|May|June|July|August|September|October|November|December';
        my $states = 'Alaska|Alabama|Arkansas|Arizona|California|Colorado|Connecticut|Deleware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Montana|Missouri|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming|Ireland|France|England|Scotland|Wales|Holland|Spain|Germany';
        my $He_list = 'attended|became|commenced|completed|continued|declined|did|died|engaged|entered|established|graduated|is|journeyed|left|lived|lives|moved|owned|owns|participated|pursued|received|remained|remains|represented|represents|resigned|resumed|retired|returned|served|settled|signed|studied|successfully|taught|unsuccessfully|was|went|worked|works';
        my $Hewas_list = 'a|an|admitted|affiliated|appointed|assigned|author|discharged|editor|educated|employed|engaged|entombed|impeached|interred|interested|not|one|owner|promoted|publisher|reelected|re-elected|reinterred';
        my $Servedas_list = 'Court|Democratic|Republican|adjutant|aide|assistant|associate|businessman|businesswoman|captain|chair|chairman|clerk|collector|colonel|commissioner|defense|delegate|director|district|general|governor|inspector|judge|justice|lieutenant|magistrate|master|mayor|[mM]ember|naval|overseer|president|presidential|proprietor|prosecuting|solicitor|special|staff|vice|war';
        
        # Connect to the URL
        my $ua = new LWP::UserAgent;
        $ua->agent("Mozilla/6.0");  
        my $req = new HTTP::Request GET => $url;
        my $res = $ua->request($req);
        $res->is_success or die "Could not get content";
        
        # Get the content 
        my $content = $res->content;
        $content =~ s/^.*<P><FONT SIZE=4 COLOR=\"\#800040\">([^<]*), ?<\/FONT>([^<]*)<\/(TD|P)>.*$/$2/s;  # Just the main text (minus name)
        my $reversedname = $1;
        $content =~ s/\n//sg;  # as a single line
        
        # Parse name
        $reversedname =~ s/\s+/ /g;
        $reversedname =~ m/^([^,]*), ([^,]*)(, .*)?$/;
        my $firstname = $2;
        my $lastname = $1;
        my $suffix = $3;
        #die ">$foundname<  =>  >$foundfirstname< >$foundlastname< >$foundsuffix<\n";
        $lastname =~ s/(\w+)/\u\L$1/g;
        $reversedname = "$lastname, $firstname$suffix";
        my $fullname = "$firstname $lastname$suffix";
        
        # Do universal search & replaces
        $content =~ s/\s+/ /g; #take out dbl spaces;
        $content = unabbreviate_states($content); #expand all state names
        $content = link_cities_from_pattern($content); 
        $content = link_dates_from_pattern($content);
        $content = link_colleges_from_pattern($content);
        $content = replace_recognized_tokens($content);
        
        # split into individual lines
        my @lines = split(/; /, $content);  
        foreach my $line (@lines) { $line =~ s/^ // }  #take out leading space (if there)
        
        # Set up initial variables
        my $familyinfo = '';
        my $iswas = 'is';
        my $initial_description = '';
        my $birthdeath = 'unknown birth and death';
        my $birth = '';
        my $birthyear = '';
        my $death = '';
        my $deathyear = '';
        my $body = '';
        my %cats = (); # for categories like "Senator from Kentucky"
        
        # line 1. First off, does it start with " (son of . . .), " or something similar?
        # e.g. brother of John Fitzgerald Kennedy and Robert Francis Kennedy, grandson of John Francis Fitzgerald
        my $line = shift(@lines);
        
        if ($line =~ m/^\(([^)]*)\)/) {
                $familyinfo = $1;
                $line =~ s/^\([^)]*\), (.*)$/$1/;       
                
                $familyinfo =~ s/of ([^,]*),/of [[$1]],/g;
                $familyinfo =~ s/of ([^,]*)$/of [[$1]]/g;
                $familyinfo =~ s/([^],]) and /$1]] and [[/g;    
        }
        
        # Now, make line1 into the initial description, and pick categories.
        $initial_description = $line;
        
        while ($initial_description =~ m/(a Senator and a Representative|a Representative and a Senator) from ($states)/g) {
                #senator and rep
                $cats{"[[Category:United States Senators from $2]]"} = $2;
                $cats{"[[Category:Members of the United States House of Representatives from $2]]"} = $2;
        }
        
        while ($initial_description =~ m/Senator from ($states)/g) {
                $cats{"[[Category:United States Senators from $1]]"} = $1;
        }
        
        while ($initial_description =~ m/Representative from ($states)/g) {
                $cats{"[[Category:Members of the United States House of Representatives from $1]]"} = $1;
        }
        
        $initial_description =~ s/(Territory of )?($states)/[[$1$2]]/g;
        $initial_description =~ s/Senator/[[United States Senate|U.S. Senator]]/g;
        $initial_description =~ s/Representative/[[United States House of Representatives|U.S. Representative]]/g;
        
        # Next line: look for birth place and date.
        my $line = shift(@lines);

        if ($line =~ m/(born|Born)/) {
                if ($line =~ m/^(.*), in (\d+)$/) {
                        $birthyear = $2;
                        $birth = $2;
                        $line = $1;
                } elsif ($line =~ m/^(.*), about (\d+)$/) {
                        $birth = "ca. $2";
                        $birthyear = $2;
                        $line = $1
                } elsif ($line =~ m/^(.*?)(?:,)? (?:on )?(\[\[\w* \d+\]\], \[\[(\d+)\]\])$/) {
                        $birth = $2;
                        $birthyear = $3;
                        $line = $1;
                } elsif ($line =~ m/^(.*), birth date (unknown)/) {
                        $birth = $2;
                        $line = $1;
                } else {
                        $birth = 'unknown';
                }
                
                if ($line =~ s/^(was |probably )?born/Born/) {
                        $body .= "$line, $lastname";
                } elsif ($line eq 'birth date unknown') {
                        $body = $lastname;
                } else {
                        die "I didn't expect: $line";
                }
        } else {
                $birth = 'unknown';
                $body = prepend_line($lastname, $lastname, $line);
        }

        # Next line. . .
        my $line = shift(@lines);
        $line = prepend_line('', $lastname, $line);
        $body .= $line;
        
        # Subsequent lines. . .
        while ($line = shift(@lines)) {
                if ($line eq 'birth date unknown') {
                        $birth = 'unknown';
                        $birthyear = '';
                        next;
                } 
                
                if ($line =~ m/^[dD]eath date unknown\.? ?$/) {
                        $death = 'unknown';
                        $deathyear = '';
                        $iswas = 'was';
                        next;
                } 
                        
                $line = prepend_line($pronoun, $lastname, $line);
                
                # look for death
                if ($line =~ m/(died|death(?! of)).*(\d\d\d\d)/) {
                        $deathyear = $2;
                        $death = $deathyear;
                        $iswas = 'was';
                        
                        #TODO - change this to ignore "death of", check against http://bioguide.congress.gov/scripts/biodisplay.pl?index=A000022
                        if ($line =~ m/(died|death(?! of)).*(\[\[($months) \d+\]\], \[\[\d\d\d\d\]\])/) {
                                $death = $2;
                        }
                }
        
                $body .= $line;
        }
        
        # Finalize Initial description.
        if ($birth) {
                if ($death) {
                        $birthdeath = "$birth - $death";
                        if ($birthdeath eq 'unknown - unknown') { $birthdeath = 'birth and death dates unknown'; }
                } else {
                        if ($birth eq 'unknown') {
                                $birthdeath = 'unknown date of birth';
                        } else {
                                $birthdeath = "born $birth";
                        }
                }
        }
        
        my $boilerplate = "<!" . "-- This article was automatically created by [[User:polbot]] from $url. The prose may be stilted, and there may be grammatical and Wikification errors. Please improve in any way you see fit. --" . ">";
        $initial_description = "$boilerplate'''" . $fullname . "''' ($birthdeath) $iswas " . $initial_description;
        if ($familyinfo) {
                $initial_description .= ", " . $familyinfo;
        }
                
        # Add ending stuff
        $url =~ m/^.*=(.*)$/;
        my $ending_stuff = "==Source==\n{{CongBio|$1}}\n\n{{DEFAULTSORT:$reversedname}}\n";
        
        if ($birthyear) {
                $cats{"[[Category:$birthyear births]]"} = 'a'
                #$ending_stuff .= "[[Category:$birthyear births]]\n";
        } else {
                $cats{"[[Category:Year of birth unknown]]"} = 'a'
                #$ending_stuff .= "\n";
        }
        if ($iswas eq 'is') {
                $cats{"[[Category:Living people]]"} = 'a'
                #$ending_stuff .= "\n";
        } elsif ($death =~ m/\d\d\d\d/) {
                $cats{"[[Category:$deathyear deaths]]"} = 'a'
                #$ending_stuff .= "\n";
        } else {
                $cats{"[[Category:Year of death unknown]]"} = 'a'
                #$ending_stuff .= "\n";
        }
        $ending_stuff .= join("\n", sort keys %cats);

        
        
        # Done!
        $body = "$initial_description.\n\n$body\n$ending_stuff";
        return $body;
                
        # ===================================================================================================
        # ====================   Inner subs   ===============================================================
        # ===================================================================================================
        
        sub prepend_line
        {
                my $starter = shift;
                my $lastname = shift;
                my $line = shift;
                
                my $analyzeline = $line;
                
                # If the line starts with these, skip them.
                $analyzeline =~ s/^after the war//;
                $analyzeline =~ s/^again//;
                $analyzeline =~ s/^also//;
                $analyzeline =~ s/^originally//;
                $analyzeline =~ s/^several times//;
                $analyzeline =~ s/^soon afterward//;
                $analyzeline =~ s/^subsequently//;
                
                #Get 
                my ($initchar) = ($analyzeline =~ m/(.)/);
                my ($initword) = ($analyzeline =~ m/(\w+)/);
        
                if ($initchar eq '[') {
                        $line = "$starter was in the $line.\n";
                } elsif ($initword =~ /^(successful|lawyer|teacher)$/) {
                        $line = "$starter was a $line.\n";
                } elsif ($initword eq 'unsuccessful') {
                        $line = "$starter was an $line.\n";
                } elsif ($initword eq 'elected') {
                        $line = "\n$lastname was $line.\n";
                } elsif ($initword =~ m/^($He_list)$/) {
                        $line = "$starter $line.\n";
                } elsif ($initword =~ m/^($Hewas_list)$/) {
                        $line = "$starter was $line.\n";
                } elsif ($initword =~ m/^($Servedas_list)$/) {
                        $line = "$starter served as $line.\n";
                } elsif ($initword =~ /^(re)?interment$/) {
                        $line =~ s/^(re)?interment/$starter was $1interred/;
                        $line = "$line.\n";
                        $iswas = 'was';
                } else {
                        $line =~ s/^([a-z])/\U$1/;
                        $line = "<!" . "-- A grammar fix may be needed here. --" . ">$line.\n";
                }
                
                # clean up
                $line =~ s/(\.? \.|\. )$/./;
                return $line;
        }
        
        sub replace_recognized_tokens
        {
                my $content = shift;
                
                # links

                $content =~ s/Amherst College/[[Amherst College]]/g;
                $content =~ s/Civil War/[[American Civil War|Civil War]]/g;
                $content =~ s/Confederate Army/[[Confederate States Army]]/g;
                $content =~ s/Confederate States of America/[[Confederate States of America]]/g;
                $content =~ s/Constitution of the United States/[[United States Constitution|Constitution of the United States]]/g;
                $content =~ s/Democratic National Committee/[[Democratic National Committee]]/g;
                $content =~ s/Democratic Party/[[Democratic Party (United States)|Democratic Party]]/g;
                $content =~ s/Democratic-Republican Party/[[Democratic-Republican Party (United States)|Democratic-Republican Party]]/g;
                $content =~ s/Democratic Republican Party/[[Democratic-Republican Party (United States)|Democratic Republican Party]]/g;
                $content =~ s/Department of Defense/[[United States Department of Defense|Department of Defense]]/g;
                $content =~ s/Department of War/[[United States Department of War|Department of War]]/g;
                $content =~ s/Eton College/[[Eton College]]/g;
                $content =~ s/Federalist Party/[[Federalist Party (United States)|Federalist Party]]/g;         
                $content =~ s/Free-Soil Party/[[Free Soil Party|Free-Soil Party]]/g;
                $content =~ s/Harvard College/[[Harvard College]]/g;
                $content =~ s/justice of the peace/[[Justice of the Peace]]/g;
                $content =~ s/Opposition Party/[[Opposition Party (United States)|Opposition Party]]/g;
                $content =~ s/Republican National Committee/[[Republican National Committee]]/g;
                $content =~ s/Revolutionary War/[[American Revolutionary War|Revolutionary War]]/g;
                $content =~ s/Union Army/[[Union Army]]/g;
                $content =~ s/Union College/[[Union College]]/g;
                $content =~ s/United States Air Force/[[United States Air Force]]/g;
                $content =~ s/United States Army Medical Corps/[[Army Medical Department (United States)|United States Army Medical Corps]]/g;
                $content =~ s/United States Army Reserve/[[United States Army Reserve]]/g;
                $content =~ s/United States House of Representatives/[[United States House of Representatives]]/g;
                $content =~ s/United States Marine Corps/[[United States Marine Corps]]/g;
                $content =~ s/United States Marines/[[United States Marine Corps]]/g;
                $content =~ s/United States Navy/[[United States Navy]]/g;
                $content =~ s/United States Representative/[[United States Representative]]/g;
                $content =~ s/United States Senate/[[United States Senate]]/g;
                $content =~ s/United States Senator/[[United States Senator]]/g;
                $content =~ s/United States Supreme Court/[[Supreme Court of the United States|United States Supreme Court]]/g;
                $content =~ s/United States Treasury Department/[[United States Treasury Department]]/g;
                $content =~ s/(Vice )?President of the United States/[[$1President of the United States]]/g;
                $content =~ s/Washington, D.C./[[Washington, D.C.]]/g;
                $content =~ s/William and Mary College/[[William and Mary College]]/g;
                $content =~ s/Yale College/[[Yale College]]/g;  

                $content =~ s/Republican Party/[[Republican Party (United States)|Republican Party]]/g;
                $content =~ s/United States Army/[[United States Army]]/g;
                $content =~ s/as a Democrat/as a [[Democratic Party (United States)|Democrat]]/g;
                $content =~ s/as a Federalist/as a [[Federalist Party (United States)|Federalist]]/g;
                $content =~ s/as a Republican/as a [[Republican Party (United States)|Republican]]/g;
                $content =~ s/as a Whig/as a [[Whig Party (United States)|Whig]]/g;
                
                $content =~ s/($states) (state )?senate/[[$1 Senate]]/g;
                $content =~ s/($states) (state )?house of representatives/[[$1 House of Representatives]]/g;

                # grammar-related replacements
                $content =~ s/graduated, /graduated from /g;
                $content =~ s/lawyer, private/lawyer in private/g;
                $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g;
                $content =~ s/\(([^)]*)\;/($1, and/g;
                $content =~ s/(member|chairman|chair), /$1 of the /g;
                $content =~ s/\&\#146\;/'/g;
                $content =~ s/\&\#14[78]\;/"/g;
                
                return $content;
        }
                
        
        sub link_colleges_from_pattern
        {
                my $content = shift;
                
                # "Something University"
                $content =~ s/(([A-Z][a-z]+ (and )?)*[A-Z][a-z]+ (University|Academy))/\[\[$1\]\]/g;
                
                # "University of Something"
                $content =~ s/(University of [A-Z][a-z]+( (at )?[A-Z][a-z]+)*)/\[\[$1\]\]/g;
                
                return $content;
        }
        
        sub link_dates_from_pattern
        {
                my $content = shift;
                
                $content =~ s/($months) (\d+), *(\d\d\d\d)/[[$1 $2]], [[$3]]/g;
                
                return $content;
        }
        
        sub link_cities_from_pattern
        {
                my $content = shift;
                
                #prep City, State (or prep County, State)
                $content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/ $1 [[$2]]/g;
                
                #prep City, Something County, State
                $content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/ $1 [[$2, $5]]/g;
                
                #, City, Something County, State
                $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/, [[$1, $4]]/g;
                
                #, Something, State
                $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/, [[$1]]/g;
                
                return $content;
        }
        
        sub unabbreviate_states 
        {
                my $content = shift;
                
                $content =~ s/Ala\./Alabama/g;
                $content =~ s/Ariz\./Arizona/g;
                $content =~ s/Ark\./Arkansas/g;
                $content =~ s/Calif\./California/g;
                $content =~ s/Colo\./Colorado/g;
                $content =~ s/Conn\./Connecticut/g;
                $content =~ s/Del\./Delaware/g;
                $content =~ s/Fla\./Florida/g;
                $content =~ s/Ga\./Georgia/g;
                $content =~ s/Ill\./Illinois/g;
                $content =~ s/Ind\./Indiana/g;
                $content =~ s/Kans\./Kansas/g;
                $content =~ s/Ky\./Kentucky/g;
                $content =~ s/La\./Louisiana/g;
                $content =~ s/Md\./Maryland/g;
                $content =~ s/Mass\./Massachusetts/g;
                $content =~ s/Mich\./Michigan/g;
                $content =~ s/Minn\./Minnesota/g;
                $content =~ s/Miss\./Mississippi/g;
                $content =~ s/Mo\./Missouri/g;
                $content =~ s/Mont\./Montana/g;
                $content =~ s/Nebr\./Nebraska/g;
                $content =~ s/Nev\./Nevada/g;
                $content =~ s/N\.H\./New Hampshire/g;
                $content =~ s/N\.J\./New Jersey/g;
                $content =~ s/N\.M\./New Mexico/g;
                $content =~ s/N\.Y\./New York/g;
                $content =~ s/N\.C\./North Carolina/g;
                $content =~ s/N\.D\./North Dakota/g;
                $content =~ s/Okla\./Oklahoma/g;
                $content =~ s/Ore\./Oregon/g;
                $content =~ s/Pa\./Pennsylvania/g;
                $content =~ s/R\.I\./Rhode Island/g;
                $content =~ s/S\.C\./South Carolina/g;
                $content =~ s/S\.D\./South Dakota/g;
                $content =~ s/Tenn\./Tennessee/g;
                $content =~ s/Tex\./Texas/g;
                $content =~ s/Vt\./Vermont/g;
                $content =~ s/Va\./Virginia/g;
                $content =~ s/Wash\./Washington/g;
                $content =~ s/W\.Va\./West Virginia/g;
                $content =~ s/Wis\./Wisconsin/g;
                $content =~ s/Wyo\./Wyoming/g;
                
                return $content;
        }
}

# Here is an example for this sub's usage:
# $URL = Polbot::Get_URL_from_name("Mitch McConnell");

sub Get_URL_from_name
{
        my $article_name = shift;
        
        my @URLs = ();
        my $ErrMsg;
        my $fname;
        my $lname;

        $article_name =~ s/ \(.*\)//g;  # Take out anything parenthesized.
        
        if ($article_name =~ m/^(.*) ([^ ]*)(, Jr.|, Sr.| II| III)$/) {
                $fname = $1 . $3;
                $lname = $2;
        } elsif ($article_name =~ m/^(.*) ([^ ]*)$/) {
                $fname = $1;
                $lname = $2;
        } else {
                return "Malformed article name '$article_name'";
        }
        
        @URLs = Get_matching_URLs($fname, $lname);
        my $nummatches = scalar(@URLs);
        
        if ($nummatches eq 1) {
                return  $URLs[0];
        } elsif ($nummatches > 1) {
                return "Multiple hits for '$lname, $fname'.";
        }

        $ErrMsg = "No hits for '$lname, $fname'.";

        # Take off the suffix
        if ($fname =~ s/(, Jr\.|, Sr\.| II| III)$//) {
                @URLs = Get_matching_URLs($fname, $lname);
                my $nummatches = scalar(@URLs);
                
                if ($nummatches eq 1) {
                        return  $URLs[0];
                } elsif ($nummatches > 1) {
                        $ErrMsg .= " Multiple hits for '$lname, $fname'.";
                        return $ErrMsg;
                }
                $ErrMsg .= " No hits for '$lname, $fname'.";
        }
        
        # Try like "C. Everett Coop"
        if ($fname =~ s/^.\. //) {
                @URLs = Get_matching_URLs($fname, $lname);
                my $nummatches = scalar(@URLs);
                
                if ($nummatches eq 1) {
                        return  $URLs[0];
                } elsif ($nummatches > 1) {
                        $ErrMsg .= " Multiple hits for '$lname, $fname'.";
                        return $ErrMsg;
                }
                $ErrMsg .= " No hits for '$lname, $fname'.";
        }
        
        # Try like "John Q. Adams"
        if ($fname =~ s/\..*$//) {
                @URLs = Get_matching_URLs($fname, $lname);
                my $nummatches = scalar(@URLs);
                
                if ($nummatches eq 1) {
                        return  $URLs[0];
                } elsif ($nummatches > 1) {
                        $ErrMsg .= " Multiple hits for '$lname, $fname'.";
                        return $ErrMsg;
                }
                $ErrMsg .= " No hits for '$lname, $fname'.";
        }
        
        return $ErrMsg;
}
        
sub Get_matching_URLs
{
        my $firstname = shift;
        my $lastname = shift;
        
        my $url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp';
        
        my $ua = LWP::UserAgent->new;
        $ua->agent("Mozilla/6.0");
        my @links = ();
                
        my $res = $ua->post($url, ['lastname' => $lastname, 'firstname' => $firstname]);
        if ($res->is_success) {
                my $content = $res->content;
                @links = ($content =~ m/<td><A HREF=\"([^"]*)\">/g);
        } else {                
                print "could not connect, lastname = $lastname, firstname=$firstname"
        }

        return @links;
}

sub fix_dates {
        my $txt = shift;
        
    # century without AD,BC etc
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi;
    # century with AD,BC etc
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi;
    $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi;
    $txt =~ s/(\d(?:st|nd|rd|th))[ \-]Century/$1 century/gi;

    # piped decades and years
    $txt =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi;
    $txt =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;

    # months
    $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi;
    $txt =~ s/\[\[January\|(Jan)\]\]/$1/gi;
    $txt =~ s/\[\[February\|(Feb)\]\]/$1/gi;
    $txt =~ s/\[\[March\|(Mar)\]\]/$1/gi;
    $txt =~ s/\[\[April\|(Apr)\]\]/$1/gi;
    $txt =~ s/\[\[May\|(May)\]\]/$1/gi;
    $txt =~ s/\[\[June\|(Jun)\]\]/$1/gi;
    $txt =~ s/\[\[July\|(Jul)\]\]/$1/gi;
    $txt =~ s/\[\[August\|(Aug)\]\]/$1/gi;
    $txt =~ s/\[\[September\|(Sep)\]\]/$1/gi;
    $txt =~ s/\[\[October\|(Oct)\]\]/$1/gi;
    $txt =~ s/\[\[November\|(Nov)\]\]/$1/gi;
    $txt =~ s/\[\[December\|(Dec)\]\]/$1/gi;

    #month+year
    $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;

    #Month+day_number "March 7th" -> "March 7"
    $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi;
    $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi;
    $txt =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;

    #Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash or mdash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with ndash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;
    #same again but with slash instead of hyphen
    $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;

    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;


    # solitary day_numbers
    $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
    $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi;
    $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;

    # days of the week in full. Optional plurals
    $txt =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi;
    # days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'.
    $txt =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi;
    $txt =~ s/\[\[(Sat)\]\]/$1/g;
    $txt =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi;
    $txt =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi;
    $txt =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi;
    $txt =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;

    #4 digit years piped into 2
    $txt =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;

    #year: examine characters in link on left for date, examine characters in link on right for date
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
    #year pair: examine characters in link on left for date, examine characters in link on right for date
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;

    #year: examine characters in link on left for date, avoid links on right
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
    #year pair: examine characters in link on left for date, avoid links on right
    $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

    #year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists.
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi;
    #year pair: check for line-ends, text on left, avoid links on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;

    #year: avoid links on left, examine characters in link on right for date
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi;
    #year pair: avoid links on left, examine characters in link on right for date
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;

    #year:avoid links on left, text on right
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi;
    #year pair: avoid links on left, text on right
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

    #year:text on left, text on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi;
    #year pair: avoid links on left, text on right
    $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

    #year:avoid links on left, hyphen but no digits (to avoid ISO date) in link on right. Currently suspended because it isn't fully tested.
    #$txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[[^\d])/$1$2$3/gi;

    #year:avoid links on both sides
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi;
    #year pair: avoid links on both sides
    $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

    #'present'
    $txt =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;

    #Eliminate 'surprise links' also known as 'easter egg links'
    $txt =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;

        return $txt;
}

sub replace_unlinked_tokens
{
        my $content = shift;
        
        # links

        $content =~ s/([^[|:])Amherst College/$1\[\[Amherst College\]\]/;
        $content =~ s/([^[|:])Confederate Army/$1\[\[Confederate States Army\]\]/;
        $content =~ s/([^[|:])Constitution of the United States/$1\[\[United States Constitution|Constitution of the United States\]\]/;
        $content =~ s/([^[|:])Democratic National Committee/$1\[\[Democratic National Committee\]\]/;
        $content =~ s/([^[|:])Democratic-Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic-Republican Party\]\]/;
        $content =~ s/([^[|:])Democratic Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic Republican Party\]\]/;
        $content =~ s/Department of Defense([^]|])/\[\[United States Department of Defense|Department of Defense\]\]$1/;
        $content =~ s/Department of War([^]|])/\[\[United States Department of War|Department of War\]\]$1/;
        $content =~ s/([^[|:])Eton College/$1\[\[Eton College\]\]/;
        $content =~ s/([^[|:])Free-Soil Party/$1\[\[Free Soil Party|Free-Soil Party\]\]/;
        $content =~ s/([^[|:])Harvard College/$1\[\[Harvard College\]\]/;
        $content =~ s/([^[|:])Republican National Committee/$1\[\[Republican National Committee\]\]/;
        $content =~ s/([^[|:])Union Army/$1\[\[Union Army\]\]/;
        $content =~ s/([^[|:])Union College/$1\[\[Union College\]\]/;
        $content =~ s/([^[|:])United States Army Medical Corps/$1\[\[Army Medical Department (United States)|United States Army Medical Corps\]\]/;
        $content =~ s/([^[|:])United States Army Reserve/$1\[\[United States Army Reserve\]\]/;
        $content =~ s/([^[|:])United States Treasury Department/$1\[\[United States Treasury Department\]\]/;
        $content =~ s/([^[|:])Washington, D\.C\./$1\[\[Washington, D.C.\]\]/;
        $content =~ s/([^[|:])William and Mary College/$1\[\[William and Mary College\]\]/;
        $content =~ s/([^[|:])Yale College/$1\[\[Yale College\]\]/;     

        $content =~ s/as a Democrat/as a \[\[Democratic Party (United States)|Democrat\]\]/;
        $content =~ s/as a Federalist/as a \[\[Federalist Party (United States)|Federalist\]\]/;
        $content =~ s/as a Republican/as a \[\[Republican Party (United States)|Republican\]\]/;
        $content =~ s/as a Whig/as a \[\[Whig Party (United States)|Whig\]\]/;
        
        # grammar-related replacements
        $content =~ s/graduated, /graduated from /g;
        $content =~ s/lawyer, private/lawyer in private/g;
        $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g;
        $content =~ s/(member|chairman|chair), /$1 of the /g;
        $content =~ s/\&\#146\;/'/g;
        $content =~ s/\&\#14[78]\;/"/g;
        
        return $content;
}

1;
User:Polbot/source/Polbot.pm

From Wikipedia, the free encyclopedia

Views

Navigation

Interaction

Search