User:Pearle/pearle.pl
Please copy the original wikitext, not the viewable text, when downloading. Also be sure to remove the "pre" tags at the top and bottom and everything outside of them. Thanks.
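A quick orientation before the code: each run of the script executes one command through interpretCommand(), and the READ_COMMANDS command batches further commands from standard input. The invocations below are illustrative only (the page and category names are placeholders); the usage comments above each subroutine in the code show the full set of commands.

    perl pearle.pl ADD_CFD_TAG Category:Some_category
    perl pearle.pl MOVE_CONTENTS Category:Old_name Category:New_name
    perl pearle.pl READ_COMMANDS < list-of-commands.txt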
### IMPORTANT ###
# This code is released into the public domain.  CONTRIBUTIONS are
# welcome, but will also hereby be RELEASED TO THE PUBLIC DOMAIN.
# See the documentation distributed with this code for important
# warnings and caveats.
# Publication date: 12 Nov 2005 (UTC)

### CLONING NOTES ###
# Clone operators: You may wish to undo certain items marked "TEMPORARY".
#   -- Beland 21 Aug 2005
# Clone operators: You will need to change $historyFile at the top of
# opentaskUpdate().  You may also wish to change $target there.
#   -- Beland 10 Sep 2005

### RECENT CHANGES ###
# Fixes made before the first publication:
# - Now retains sort keys
# - Now properly retains sort keys
# - Support for (hopefully all) non-ASCII characters in titles
# - Category moves are now done in one edit, not two
# - Slow down if Wikipedia is slow
# - Automatically retry if HTTP 500 or 503 (but wait 1, 10, or 60
#   minutes first)
# - Follow popular conventions for category/interwiki block style
# - Automatically TRANSFER_TEXT_ACTUALLY before doing a category move
#   and flag for manual review if needed.
# - Don't wholesale delete interwiki links
# - Don't add the category manually after doing a null edit
# 30 Apr 2005: Publish initial code
# 15 May 2005: Add HTTP error 502 handling.
# 22 May 2005: Add {{msg:foo}} -> {{foo}} conversion.
# 10 Aug 2005: Fix bug surrounding category moves that require null edits
# 18 Aug 2005: Add CLEANUP_DATE capabilities
# *** 21 Aug 2005: Publish update ***
# 22 Aug 2005: Canonicalize dk to da
# 22 Aug 2005: Carry "wpStarttime", which prevents problems when
#   editing undeleted articles.
# 22 Aug 2005: Null-edit fallback for CLEANUP_DATE
# 23 Aug 2005: CLEANUP_DATE enhancements for weird cases
# 25 Aug 2005: Fix some regexps with \Q and \E
# 04 Sep 2005: Add logic to handle {{cfm}}
# 04 Sep 2005: Mark changeCategory() edits as minor, by request
# 04 Sep 2005: Fix editing bug in transferText()
# 10 Sep 2005: Add OPENTASK_UPDATE functionality
# *** 10 Sep 2005: Publish update ***
# 12 Sep 2005: add getCategoryImages() and add it to depopulateCat()
# 14 Sep 2005: urlEncode() improvements
# 17 Sep 2005: Add "cleanup" to OPENTASK_UPDATE
# 18 Sep 2005: Add "authority" feature to DEPOPULATE_CAT
# 18 Sep 2005: Prevent infinite loop in interpretCommands()
# 19 Sep 2005: moveCategoryContents() always retains sortkeys; remove
#   extraneous arguments.
# 19 Sep 2005: Preserve whitespace in sortkeys.
# 12 Oct 2005: Print a helpful report from OPENTASK_UPDATE
# 17 Oct 2005: Fix history-losing bug for OPENTASK_UPDATE
# 22 Oct 2005: Increase OPENTASK_UPDATE character limit to 130
# 27 Oct 2005: Update editbox scrape regexp
# 27 Oct 2005: Add ability to get more than 200 articles from a category
# 27 Oct 2005: Add 3 more major categories to OPENTASK_UPDATE
# 28 Oct 2005: Allow 3 cleanup month categories to be featured at once
#   in OPENTASK_UPDATE
# 29 Oct 2005: Add "Category:Wikipedia articles needing priority
#   cleanup" to CLEANUP in OPENTASK_UPDATE
# 04 Nov 2005: Add "rough mode" to make batching of null edits less
#   painful
# *** 12 Nov 2005: Publish update ***
#################

use strict;
use Time::HiRes;

# The following may be helpful in debugging character encoding
# problems.
use utf8;
#use encoding 'utf8';

# Initialization

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;

print "\n";

# LWP::UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.
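# Clone operators: several values in this script are specific to this
# particular installation and will need to be adjusted for a clone --
# for example, the user-agent string and cookie-file path just below,
# the log path in myLog(), the langlist path in fixCategoryInterwiki(),
# and the "User_talk:Pearle" login check used by getPage() and the
# category-scraping routines.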
$::ua = LWP::UserAgent->new(timeout => 300); $::ua->agent("Pearle Wisebot/0.1"); $::ua->cookie_jar(HTTP::Cookies->new(file => "/home/beland/wikipedia/pearle-wisebot/cookies.pearle.txt", autosave => 1)); $::ua->cookie_jar->load(); # Hot pipes $| = 1; # --- # test(); #sub test #{ # my ($target, $text, $editTime, $startTime, $token); # # $target = "Wikipedia:Sandbox"; # ($text, $editTime, $startTime, $token) = getPage($target); # print $text; # $text .= "\Eat my electrons! -- Pearle\n"; # print "---\n"; # postPage ($target, $editTime, $startTime, $token, $text, "Test 008"); # die ("Test complete."); #} # --- interpretCommand(@ARGV); sub interpretCommand { my ($command, @arguments, $i, $line, $argument, @newArguments, $from, $to, $page, $pageCopy); ($command, @arguments) = @_; $command =~ s/\*\s*//; myLog(`date`); myLog ($command.": ".join(" ", @arguments)."\n"); print `date`; print $command.": ".join(" ", @arguments)."\n"; if ($command eq "POST_STDIN") { if ($arguments[2] ne "") { myLog ("Too many arguments to POST_STDIN.\n"); die ("Too many arguments to POST_STDIN.\n"); } postSTDIN($arguments[0],$arguments[1]); } elsif ($command eq "POST_STDIN_NULLOK") { if ($arguments[2] ne "") { myLog ("Too many arguments to POST_STDIN.\n"); die ("Too many arguments to POST_STDIN.\n"); } $::nullOK = "yes"; postSTDIN($arguments[0],$arguments[1]); $::nullOK = "no"; } elsif ($command eq "MOVE_CONTENTS") { if ($arguments[2] ne "") { if (($arguments[3] eq "") and ($arguments[1] eq "->")) { moveCategoryContents($arguments[0],$arguments[2]); return(); } else { myLog ("Too many arguments to MOVE_CONTENTS.\n"); die ("Too many arguments to MOVE_CONTENTS.\n"); } } moveCategoryContents($arguments[0],$arguments[1],"no"); } elsif ($command eq "MOVE_CONTENTS_INCL_CATS") { if ($arguments[2] ne "") { if (($arguments[3] eq "") and ($arguments[1] eq "->")) { moveCategoryContents($arguments[0],$arguments[2],"yes"); return(); } else { myLog ("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n"); die ("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n"); } } moveCategoryContents($arguments[0],$arguments[1],"yes"); } elsif ($command eq "REMOVE_X_FROM_CAT") { if ($arguments[2] ne "") { myLog ("Too many arguments to REMOVE_X_FROM_CAT.\n"); die ("Too many arguments to REMOVE_X_FROM_CAT.\n"); } removeXFromCat($arguments[0],$arguments[1],""); } elsif ($command eq "DEPOPULATE_CAT") { if ($arguments[1] eq "per") { if ($arguments[3] ne "") { myLog ("Too many arguments to DEPOPULATE_CAT.\n"); die ("Too many arguments to DEPOPULATE_CAT.\n"); } depopulateCat($arguments[0], $arguments[2]); } elsif ($arguments[1] ne "") { myLog ("Too many arguments to DEPOPULATE_CAT.\n"); die ("Too many arguments to DEPOPULATE_CAT.\n"); } depopulateCat($arguments[0]); } elsif ($command eq "PRINT_WIKITEXT") { if ($arguments[1] ne "") { myLog ("Too many arguments to PRINT_WIKITEXT.\n"); die ("Too many arguments to PRINT_WIKITEXT.\n"); } printWikitext($arguments[0]); } elsif ($command eq "ADD_CFD_TAG") { if ($arguments[1] ne "") { myLog ("Too many arguments to ADD_CFD_TAG.\n"); die ("Too many arguments to ADD_CFD_TAG.\n"); } addCFDTag($arguments[0]); } elsif ($command eq "REMOVE_CFD_TAG") { if ($arguments[1] ne "") { myLog ("Too many arguments to REMOVE_CFD_TAG.\n"); die ("Too many arguments to REMOVE_CFD_TAG.\n"); } removeCFDTag($arguments[0]); } elsif ($command eq "ADD_TO_CAT") { if ($arguments[2] ne "") { myLog ("Too many arguments to ADD_TO_CAT.\n"); die ("Too many arguments to ADD_TO_CAT.\n"); } addToCat($arguments[0],$arguments[1],""); } elsif ($command eq 
"ADD_TO_CAT_NULL_OK") { if ($arguments[2] ne "") { myLog ("Too many arguments to ADD_TO_CAT_NULL_OK.\n"); die ("Too many arguments to ADD_TO_CAT_NULL_OK.\n"); } $::nullOK = "yes"; addToCat($arguments[0],$arguments[1],""); $::nullOK = "no"; } elsif ($command eq "TRANSFER_TEXT") { if ($arguments[2] ne "") { myLog ("Too many arguments to TRANSFER_TEXT.\n"); die ("Too many arguments to TRANSFER_TEXT.\n"); } transferText($arguments[0], $arguments[1]); } # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET. # elsif ($command eq "LIST_TO_CAT_CHECK") # { # if ($arguments[2] ne "") # { # myLog ("Too many arguments to LIST_TO_CAT_CHECK.\n"); # die ("Too many arguments to LIST_TO_CAT_CHECK.\n"); # } # listToCat($arguments[0], $arguments[1], "no"); # } elsif ($command eq "CHANGE_CATEGORY") { if ($arguments[3] ne "") { myLog ("Too many arguments to CHANGE_CATEGORY.\n"); die ("Too many arguments to CHANGE_CATEGORY.\n"); } changeCategory($arguments[0], $arguments[1], $arguments[2]); } elsif ($command eq "CLEANUP_DATE") { if ($arguments[0] ne "") { myLog ("Too many arguments to CLEANUP_DATE.\n"); die ("Too many arguments to CLEANUP_DATE.\n"); } cleanupDate(); } elsif ($command eq "OPENTASK_UPDATE") { if ($arguments[0] ne "") { myLog ("Too many arguments to OPENTASK_UPDATE.\n"); die ("Too many arguments to OPENTASK_UPDATE.\n"); } opentaskUpdate(); } elsif ($command eq "NULL_EDIT") { if ($arguments[1] ne "") { myLog ("Too many arguments to NULL_EDIT.\n"); die ("Too many arguments to NULL_EDIT.\n"); } nullEdit($arguments[0]); } # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET. #elsif ($command eq "ENFORCE_CATEGORY_REDIRECTS_CHECK") #{ # enforceCategoryRedirects("no"); #} # This command is for remedial cleanup only. #elsif ($command eq "INTERWIKI_LOOP") #{ # interwikiLoop(); #} elsif ($command eq "ENFORCE_CATEGORY_INTERWIKI") { if ($arguments[1] ne "") { myLog ("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n"); die ("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n"); } enforceCategoryInterwiki($arguments[0]); } ## Broken due to recent changes on WP:CFD # elsif ($command eq "ENFORCE_CFD") # { # enforceCFD(); # } elsif ($command eq "STOP") { myLog ("Stopped."); die ("Stopped."); } elsif (($command eq "READ_COMMANDS") or ($command eq "")) { while (<STDIN>) { $line = $_; if ($line =~ m/READ_COMMANDS/) { myLog ("interpretCommands(): Infinite loop!"); die ("interpretCommands(): Infinite loop!"); } if ($line =~ m/^\s*$/) { next; } $line =~ s/\s+$//s; $line =~ s/\*\s*//; if ($line =~ m/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]/) { $line =~ s/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]//; $from = $1; $to = $2; $line =~ s/\s*$//; $from =~ s/ /_/g; $to =~ s/ /_/g; interpretCommand($line, $from, $to); } else { while ($line =~ m/\[\[:?(.*?)\]\]/) { $line =~ m/\[\[:?(.*?)\]\]/; $page = $1; $pageCopy = $page; $page =~ s/ /_/g; $line =~ s/\[\[:?\Q$pageCopy\E\]\]/$page/; if ($i++ > 100) { die ("Possible infinite loop in interpretCommands() #2"); } } interpretCommand(split (" ", $line)); } # unless (($line =~ m/TRANSFER_TEXT_CHECK/) or # ($line =~ m/ENFORCE_CATEGORY_INTERWIKI/)) unless ($line =~ m/TRANSFER_TEXT_CHECK/) { limit(); } } myLog ("Execution complete.\n"); print ("Execution complete.\n"); } else { myLog ("Unrecognized command '".$command."': ".join(" ", @arguments)."\n"); die ("Unrecognized command '".$command."': ".join(" ", @arguments)); } } sub limit { my ($i); # Rate-limiting to avoid hosing the wiki server # Min 30 sec unmarked # Min 10 sec marked # May be raised by retry() if load is heavy 
### ATTENTION ### # Increasing the speed of the bot to faster than 1 edit every 10 # seconds violates English Wikipedia rules as of April, 2005, and # will cause your bot to be banned. So don't change $normalDelay # unless you know what you are doing. Other sites may have # similar policies, and you are advised to check before using your # bot at the default speed. ################# if ($::speedLimit < 10) { $::speedLimit = 10; } $i = $::speedLimit; while ($i >= 0) { sleep (1); print STDERR "Sleeping $i seconds...\r"; $i--; } print STDERR " \r"; } # perl pearle.pl POST_STDIN User:Pearle/categories-alpha "Update from 13 Oct 2004 database dump" sub postSTDIN { my ($text, $articleName, $comment, $editTime, $startTime, $junk, $token); $articleName = $_[0]; $comment = $_[1]; while (<STDIN>) { $text .= $_; } if ($text =~ m/^\s*$/) { myLog ("postSTDIN(): Null input.\n"); die ("postSTDIN(): Null input.\n"); } $::nullOK = "yes"; ($junk, $editTime, $startTime, $token) = getPage($articleName); $::nullOK = "no"; if ($comment eq "") { $comment = "Automated post"; } postPage ($articleName, $editTime, $startTime, $token, $text, $comment); } # perl pearle.pl ADD_TO_CAT Page_name Category:Category_name sortkey sub addToCat { my ($text, $articleName, $category, $editTime, $startTime, $comment, $status, @junk, $sortkey, $token); $articleName = $_[0]; $category = $_[1]; $sortkey = $_[2]; ($text, $editTime, $startTime, $token) = getPage($articleName); $comment = "Added to ${category}"; ($status, $text, @junk) = addCatToText($category, $text, $sortkey, $articleName); if ($status ne "success") { return(); } postPage ($articleName, $editTime, $startTime, $token, $text, $comment); } sub myLog { open (LOG, ">>/home/beland/wikipedia/pearle-wisebot/log.txt") || die "Could not append to log!"; print LOG $_[0]; close (LOG); } sub getPage { my ($target, $request, $response, $reply, $text, $text2, $editTime, $startTime, $attemptStartTime, $attemptFinishTime, $token, $targetSafe); $target = $_[0]; if ($target =~ m/^\s*$/) { myLog("getPage: Null target."); die("getPage: Null target."); } $targetSafe = $target; $targetSafe =~ s/\&/%26/g; # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); # Create a request-object print "GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"; myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit"); $response = $::ua->request($request); if ($response->is_success) { $reply = $response->content; # Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); # This detects whether or not we're logged in. unless ($reply =~ m%<a href="/wiki/User_talk:Pearle">My talk</a>%) { # We've lost our identity. myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n"); die ("Wiki server is not recognizing me (1).\n"); } $reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s; $text = $1; $reply =~ m/value="(\d+)" name="wpEdittime"/; $editTime = $1; # Added 22 Aug 2005 to correctly handle articles that have # been undeleted $reply =~ m/value="(\d+)" name="wpStarttime"/; $startTime = $1; # Added 9 Mar 2005 after recent software change. 
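# The regexps just above and just below scrape MediaWiki's hidden
# edit-form fields out of the page.  The markup being matched looks
# roughly like this (field values here are made up):
#     ... value="20051112040000" name="wpEdittime" ...
#     ... value="20051112040005" name="wpStarttime" ...
#     ... value="0123456789abcdef0123456789abcdef" name="wpEditToken" ...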
$reply =~ m/value="(\w+)" name="wpEditToken"/; $token = $1; ### if (($text =~ m/^\s*$/) and ($::nullOK ne "yes")) { myLog ("getPage($target): Null text!\n"); myLog "\n---\n$reply\n---\n"; if ($::roughMode eq "yes") { return; } else { die ("getPage($target): Null text!\n"); } } if (($editTime =~ m/^\s*$/) and ($::nullOK ne "yes")) { myLog ("getPage($target): Null time!\n"); myLog "\n---\n$reply\n---\n"; die ("getPage($target): Null time!\n"); } if (($text =~ m/>/) or ($text =~ m/</)) { print $text; myLog "\n---\n$text\n---\n"; myLog ("getPage($target): Bad text suck!\n"); die ("getPage($target): Bad text suck!\n"); } # Change ( " -> " ) etc # This function is from HTML::Entities. decode_entities($text); # This may or may not actually work $::ua->cookie_jar->save(); return ($text, $editTime, $startTime, $token); } else { myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n".$response->content."\n"); print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { return(retry("getPage", @_)); } else { # Unhandled HTTP response die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); } } } sub postPage { my ($request, $response, $pageName, $textToPost, $summaryEntry, $editTime, $startTime, $actual, $expected, $attemptStartTime, $attemptFinishTime, $date, $editToken, $minor, $pageNameSafe); $pageName = $_[0]; $editTime = $_[1]; $startTime = $_[2]; $editToken = $_[3]; $textToPost = $_[4]; $summaryEntry = $_[5]; # Max 200 chars! $minor = $_[6]; $summaryEntry = substr($summaryEntry, 0, 200); if ($pageName eq "") { myLog ("postPage(): Empty pageName.\n"); die ("postPage(): Empty pageName.\n"); } if ($summaryEntry eq "") { $summaryEntry = "Automated editing."; } # Monitor server responsiveness $attemptStartTime = Time::HiRes::time(); $pageNameSafe = $pageName; $pageNameSafe =~ s/\&/%26/g; if ($minor eq "yes") { $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpMinoredit => "on", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis } else { $request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis, wpMinoredit } # --- ## If posts are failing, you can uncomment the below to see what ## HTTP request is being made. # myLog($request->as_string()); # print $request->as_string(); $::speedLimit = 60 * 10; # print $::ua->request($request)->as_string; # --- myLog("POSTing..."); print "POSTing..."; # Pass request to the user agent and get a response back $response = $::ua->request($request); myLog("POSTed.\n"); print "POSTed.\n"; if ($response->content =~ m/Please confirm that really want to recreate this article./) { myLog ($response->content."\n"); die ("Deleted article conflict! 
See log!"); } # Check the outcome of the response if (($response->is_success) or ($response->is_redirect)) { # Monitor server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); $expected = "302 Moved Temporarily"; $actual = $response->status_line; if (($expected ne $actual) and ($actual ne "200 OK")) { myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n"); myLog ($request->as_string()); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n"); if ($::roughMode eq "yes") { return(); } else { die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n"); } } $expected = "http://en.wikipedia.org/wiki/${pageName}"; #$expected =~ s/\'/%27/g; #$expected =~ s/\(/%28/g; #$expected =~ s/\)/%29/g; #$expected =~ s/,/%2C/g; $expected = urlEncode($expected); $actual = $response->headers->header("Location"); if (($expected ne $actual) and ($::roughMode ne "yes") and !(($actual eq "") and ($response->status_line eq "200 OK"))) { myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n"); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n"); die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n"); } if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/) { myLog ("Edit conflict on '$pageName' at '$editTime'!\n"); die ("Edit conflict on '$pageName' at '$editTime'!\n"); } $::ua->cookie_jar->save(); return ($response->content); } else { $date = `date`; $date =~ s/\n//g; myLog ("Bad response to POST to $pageNameSafe at $date.\n".$response->status_line."\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { print "Bad response to POST to $pageNameSafe at $date.\n".$response->status_line."\n".$response->content."\n"; return(retry("postPage", @_)); } else { # Unhandled HTTP response die ("Bad response to POST to $pageNameSafe at $date.\n".$response->status_line."\n"); } } } sub urlSafe { # This function is no longer called because the LWP::UserAgent and # HTTP::Request libraries handle character escaping. } # perl pearle.pl MOVE_CONTENTS_INCL_CATS Category:From_here Category:To_here sub moveCategoryContents { my (@articles, $categoryFrom, $categoryTo, $article, $status, @subcats, $includeCategories, $subcat, @junk); # -- INITIALIZATION -- $categoryFrom = $_[0]; $categoryTo = $_[1]; $includeCategories = $_[2]; if ($categoryFrom =~ m/^\[\[:(Category:.*?)\]\]/) { $categoryFrom =~ s/^\[\[:(Category:.*?)\]\]/$1/; $categoryFrom =~ s/\s+/_/g; } if ($categoryTo =~ m/^\[\[:(Category:.*?)\]\]/) { $categoryTo =~ s/^\[\[:(Category:.*?)\]\]/$1/; $categoryTo =~ s/\s+/_/g; } $categoryFrom =~ s/^\[\[://; $categoryTo =~ s/^\[\[://; $categoryFrom =~ s/\]\]$//; $categoryTo =~ s/\]\]$//; unless (($categoryFrom =~ m/^Category:/) and ($categoryTo =~ m/^Category:/)) { myLog ("moveCategoryContents(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n"); die ("moveCategoryContents(): Are you sure these are categories? 
".$categoryFrom."/".$categoryTo."\n"); } transferText ($categoryFrom, $categoryTo); # Subcategory transfer if ($includeCategories eq "yes") { @subcats = getSubcategories($categoryFrom); foreach $subcat (@subcats) { if ($subcat =~ m/^\s*$/) { next; } $subcat = urlDecode($subcat); print "changeCategory($subcat, $categoryFrom, $categoryTo) c\n"; myLog "changeCategory($subcat, $categoryFrom, $categoryTo) c\n"; changeCategory($subcat, $categoryFrom, $categoryTo); limit(); } } # Article transfer @articles = getCategoryArticles($categoryFrom); # foreach $article (reverse(@articles)) foreach $article (@articles) { if ($article =~ m/^\s*$/) { next; } $article = urlDecode($article); print "changeCategory($article, $categoryFrom, $categoryTo) a\n"; myLog "changeCategory($article, $categoryFrom, $categoryTo) a\n"; changeCategory($article, $categoryFrom, $categoryTo); limit(); } } # perl pearle.pl DEPOPULATE_CAT Category:To_be_depopulated sub depopulateCat #($category); { my (@articles, $category, $article, $status, @subcats, $subcat, @junk, $authority); $category = $_[0]; $authority = $_[1]; if ($category =~ m/^\[\[:(Category:.*?)\]\]/) { $category =~ s/^\[\[:(Category:.*?)\]\]/$1/; $category =~ s/\s+/_/g; } unless ($category =~ m/^Category:/) { myLog ("depopulateCat(): Are you sure '$category' is a category?\n"); die ("depopulateCat(): Are you sure '$category' is a category?\n"); } # Remove all subcategories @subcats = getSubcategories($category); foreach $subcat (@subcats) { $subcat = urlDecode($subcat); print "removeXFromCat($subcat, $category) c\n"; myLog "removeXFromCat($subcat, $category) c\n"; ($status, @junk) = removeXFromCat($subcat, $category, $authority); unless ($status == 0) { myLog ("Status: $status\n"); print "Status: $status\n"; } limit(); } # Remove all articles @articles = getCategoryArticles($category); #foreach $article (reverse(@articles)) foreach $article (@articles) { $article = urlDecode($article); print "removeXFromCat($article, $category) a\n"; myLog "removeXFromCat($article, $category) a\n"; ($status, @junk) = removeXFromCat($article, $category, $authority); unless ($status == 0) { myLog ("Status: $status\n"); print "Status: $status\n"; } limit(); } # Remove all images @articles = getCategoryImages($category); #@articles = reverse(getCategoryImages($category)); foreach $article (@articles) { $article = urlDecode($article); print "removeXFromCat($article, $category) i\n"; myLog "removeXFromCat($article, $category) i\n"; ($status, @junk) = removeXFromCat($article, $category, $authority); unless ($status == 0) { myLog ("Status: $status\n"); print "Status: $status\n"; } limit(); } } # perl pearle.pl REMOVE_X_FROM_CAT Article_name Category:Where_the_article_is sub removeXFromCat { my ($text, $articleName, $category, $editTime, $startTime, $comment, $catTmp, $sortkey, @junk, $token, $categoryUnd, $categoryHuman, $authority); $articleName = $_[0]; $category = $_[1]; $authority = $_[2]; #urlSafe($articleName); #urlSafe($category); unless ($category =~ m/^Category:\w+/) { myLog ("addToCat(): Bad format on category.\n"); die ("addToCat(): Bad format on category.\n"); } ($text, $editTime, $startTime, $token) = getPage($articleName); $comment = "Removed from ${category}"; if ($authority ne "") { $authority =~ s/_/ /g; $comment = "Removed from ${category} (per [[${authority}]])"; } # Convert underscore to spaces; this is human-readable. 
$category =~ s/_/ /g; $categoryHuman = $category; # Insert possible whitespace $category =~ s/^Category://; # $category = "Category:\\s*\\Q".$category."\\E"; # THIS DOES NOT WORK $category = "Category:\\s*".$category; $category =~ s%\(%\\(%g; $category =~ s%\)%\\)%g; $category =~ s%\'%\\\'%g; $categoryUnd = $category; $categoryUnd =~ s/ /_/g; unless (($text =~ m/\[\[\s*${category}\s*\]\]/is) or ($text =~ m/\[\[\s*${category}\s*\|.*?\]\]/is) or ($text =~ m/\[\[\s*${categoryUnd}\s*\]\]/is) or ($text =~ m/\[\[\s*${categoryUnd}\s*\|.*?\]\]/is)) { print "removeXFromCat(): $articleName is not in '$category'.\n"; myLog ("removeXFromCat(): $articleName is not in '$category'.\n"); ### TEMPORARY ### ### Uncomment these lines if you want category remove attempts ### to trigger null edits. This is useful if you have have ### changed the category on a template, but due to a bug this ### does not actually move member articles until they are ### edited. ($text, @junk) = fixCategoryInterwiki($text); postPage ($articleName, $editTime, $startTime, $token, $text, "Mostly null edit to actually remove from ${categoryHuman}", "yes"); limit(); ### TEMPORARY ### return(1); } if ($text =~ m/^\s*\#REDIRECT/is) { print "addToCat(): $articleName is a redirect!\n"; myLog ("addToCat(): $articleName is a redirect!\n"); return(2); } # Remember to PRESERVE WHITESPACE for sortkeys! $text =~ m/\[\[\s*${category}\s*\|(.*?)\]\]/is; $sortkey = $1; if ($sortkey eq "") { $text =~ m/\[\[\s*${categoryUnd}\s*\|(.*?)\]\]/is; } # Remove the page from the category and any trailing newline. $text =~ s/\[\[\s*${category}\s*\|?(.*?)\]\]\n?//isg; $text =~ s/\[\[\s*${categoryUnd}\s*\|?(.*?)\]\]\n?//isg; ($text, @junk) = fixCategoryInterwiki($text); postPage ($articleName, $editTime, $startTime, $token, $text, $comment); return(0, $sortkey); } # perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get ## Warning: Saves to a file in the current directory with the same name ## as the article, plus another file with the .html extention. sub printWikitext { my ($editTime, $startTime, $text, $target, $token); $target = $_[0]; $target =~ s/^\[\[://; $target =~ s/\]\]$//; ($text, $editTime, $startTime, $token) = getPage($target); # Save the wikicode version to a file. open (WIKITEXT, ">./${target}"); print WIKITEXT $text; close (WIKITEXT); # Save the HTML version to a file. print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`; } # Get a list of the names of articles in a given category. 
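# Example call (illustrative; the category name is a placeholder):
#     my @articles = getCategoryArticles("Category:1970s_films");
# Returns URL-decoded article names, and follows the "next 200" links
# so that categories larger than 200 members are fetched completely.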
sub getCategoryArticles { my ($target, $request, $response, $reply, $articles, $article, @articles, $attemptStartTime, $attemptFinishTime, $targetSpace, $offset, $numberOfArticles, $url, @moreArticles); $target = $_[0]; $offset = $_[1]; # Need both _ and spaces for precise matching later $target =~ s/ /_/g; $targetSpace = $target; $targetSpace =~ s/_/ /g; #urlSafe ($target); unless ($target =~ m/^Category:/) { myLog ("getCategoryArticles(): Are you sure '$target' is a category?\n"); die ("getCategoryArticles(): Are you sure '$target' is a category?\n"); } # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); if ($offset eq "") { $url = "http://en.wikipedia.org/wiki/${target}"; } else { $url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}"; } # Create a request-object if ($offset eq "") { print "GET ${url}\n"; } myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = $::ua->request($request); if ($response->is_success) { # Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "getCategoryArticles", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); $reply = $response->content; # This detects whether or not we're logged in. unless ($reply =~ m%<a href="/wiki/User_talk:Pearle">My talk</a>%) { # We've lost our identity. myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n"); die ("Wiki server is not recognizing me (2).\n"); } $articles = $reply; $articles =~ s%^.*?<h2>Articles in category.*?</h2>%%s; $articles =~ s%<div class="printfooter">.*?$%%s; @articles = $articles =~ m%<li><a href="/wiki/(.*?)" title=%sg; if ($reply =~ m%<a\s+href=\"/w/index.php\?title=${target}\&from=(.*?)\"\s+title=\"${targetSpace}\">next 200</a>%s) { sleep (1); # Throttle GETs @moreArticles = getCategoryArticles($target, $1); @articles = (@articles, @moreArticles); } $::ua->cookie_jar->save(); $numberOfArticles = @articles; if ($offset eq "") { print "Got $numberOfArticles articles.\n"; myLog ("Got $numberOfArticles articles.\n"); } return decodeArray(@articles); } else { myLog ("getCategoryArticles($target): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { print "getCategoryArticles($target): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n"; return(retry("getCategoryArticles", @_)); } else { # Unhandled HTTP response die ("getCategoryArticles($target): HTTP ERR (".$response->status_line.") ${url}\n"); } } } sub decodeArray { my($title, @newTitles); foreach $title (@_) { $title = urlDecode ($title); @newTitles = (@newTitles, $title); } return @newTitles; } # Get a list of the names of subcategories of a given category. 
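# Example call (illustrative):
#     my @subcats = getSubcategories("Category:Music");
# Unlike getCategoryArticles(), this does not follow "next 200" links,
# so only the first page of subcategories is returned.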
sub getSubcategories { my ($target, $request, $response, $reply, $subcats, $subcat, @subcats, $attemptStartTime, $attemptFinishTime); $target = $_[0]; #urlSafe ($target); unless ($target =~ m/^Category:/) { myLog ("getSubcategories(): Are you sure '$target' is a category?\n"); die ("getSubcategories(): Are you sure '$target' is a category?\n"); } # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); # Create a request-object print "GET http://en.wikipedia.org/wiki/${target}\n"; myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = $::ua->request($request); if ($response->is_success) { # Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "getSubcategories", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); $reply = $response->content; # This detects whether or not we're logged in. unless ($reply =~ m%<a href="/wiki/User_talk:Pearle">My talk</a>%) { # We've lost our identity. myLog ("Wikipedia is not recognizing me (3).\n---\n${reply}\n---\n"); die ("Wikipedia is not recognizing me (3).\n"); } $subcats = $reply; if ($subcats =~ m%^.*?<h2>Subcategories</h2>(.*?)<h2>Articles in category.*?</h2>.*?$%s) { $subcats =~ s%^.*?<h2>Subcategories</h2>(.*?)<h2>Articles in category.*?</h2>.*?$%$1%s; } else { return (); } @subcats = $subcats =~ m%<li><a href="/wiki/(.*?)" title=%sg; $::ua->cookie_jar->save(); return decodeArray(@subcats); } else { myLog ("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { print "getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"; return(decodeArray(retry("getCategoryArticles", @_))); } else { # Unhandled HTTP response die ("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n"); } } } # perl pearle.pl ADD_CFD_TAG Category:Category_name sub addCFDTag { my ($text, $category, $editTime, $startTime, $comment, $catTmp, @junk, $token); $category = $_[0]; #urlSafe($category); unless ($category =~ m/^Category:\w+/) { myLog ("addCFDTag(): Bad format on category.\n"); die ("addCFDTag(): Bad format on category.\n"); } $::nullOK = "yes"; ($text, $editTime, $startTime, $token) = getPage($category); $::nullOK = "no"; $comment = "Nominated for deletion or renaming"; if (($text =~ m/\{\{cfd\}\}/is) or ($text =~ m/\{\{cfm/is) or ($text =~ m/\{\{cfr/is)) { print "addCFDTag(): $category is already tagged.\n"; myLog ("addCFDTag(): $category is already tagged.\n"); return(); } if ($text =~ m/^\s*\#REDIRECT/is) { print "addCFDTag(): $category is a redirect!\n"; myLog ("addCFDTag(): $category is a redirect!\n"); return(); } # Add the CFD tag to the beginning of the page. 
$text = "{{cfd}}\n".$text; ($text, @junk) = fixCategoryInterwiki($text); postPage ($category, $editTime, $startTime, $token, $text, $comment); } # perl pearle.pl REMOVE_CFD_TAG Category:Category_name sub removeCFDTag { my ($text, $category, $editTime, $startTime, $comment, $catTmp, @junk, $token); $category = $_[0]; #urlSafe($category); unless ($category =~ m/^Category:\w+/) { myLog ("removeCFDTag(): Bad format on category.\n"); die ("removeCFDTag(): Bad format on category.\n"); } $::nullOK = "yes"; ($text, $editTime, $startTime, $token) = getPage($category); $::nullOK = "no"; $comment = "De-listed from [[Wikipedia:Categories for deletion]]"; unless (($text =~ m/\{\{cfd\}\}/is) or ($text =~ m/\{\{cfm/is) or ($text =~ m/\{\{cfr/is)) { print "removeCFDTag(): $category is not tagged.\n"; myLog ("removeCFDTag(): $category is not tagged.\n"); return(); } if ($text =~ m/^\s*\#REDIRECT/is) { print "removeCFDTag(): $category is a redirect!\n"; myLog ("removeCFDTag(): $category is a redirect!\n"); return(); } # Remove the CFD tag. $text =~ s/{{cfd}}\s*//gis; $text =~ s/\{\{cfr.*?\}\}\s*//is; $text =~ s/\{\{cfm.*?\}\}\s*//is; $text =~ s/\{\{cfru.*?\}\}\s*//is; ($text, @junk) = fixCategoryInterwiki($text); postPage ($category, $editTime, $startTime, $token, $text, $comment); } # perl pearle.pl TRANSFER_TEXT Category:From_here Category:To_there ## Note that this code is called automatically whenever moving a ## category, so you probably don't need to call it yourself from the ## command line. sub transferText { my ($source, $destination, $sourceText, $destinationText, $sourceTime, $destinationTime, @sourceCategories, @destinationCategories, $category, $lastCategory, $sourceTextOrig, $destinationTextOrig, $comment, $sourceHuman, $destinationHuman, $noMergeFlag, $sourceToken, $destinationToken, $junk, $sourceStartTime, $destinationStartTime); $source = $_[0]; $destination = $_[1]; $comment = "Cleanup per [[WP:CFD]] (moving $source to $destination)"; # Make human-readable versions of these variables for use in edit summaries $sourceHuman = $source; $sourceHuman =~ s/_/ /g; $destinationHuman = $destination; $destinationHuman =~ s/_/ /g; unless (($source =~ m/^Category:/) and ($destination =~ m/^Category:/)) { myLog ("transferText(): Are you sure these are categories? ".$source."/".$destination."\n"); die ("transferText(): Are you sure these are categories? ".$source."/".$destination."\n"); } ($sourceText, $sourceTime, $sourceStartTime, $sourceToken) = getPage($source); # Avoid double runs! # This text must be the same as that which is implanted below, and # it should be an HTML comment, so that it's invisible. if ($sourceText =~ m/<\!--PEARLE-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->/) { return; } $sourceTextOrig = $sourceText; $sourceText =~ s/{{cfd}}//is; $sourceText =~ s/\{\{cfr.*?\}\}\s*//is; $sourceText =~ s/\{\{cfm.*?\}\}\s*//is; $sourceText =~ s/\{\{cfru.*?\}\}\s*//is; $sourceText =~ s/^\s+//s; $sourceText =~ s/\s+$//s; $::nullOK = "yes"; ($destinationText, $destinationTime, $destinationStartTime, $destinationToken) = getPage($destination); $::nullOK = "no"; $destinationTextOrig = $destinationText; $destinationText =~ s/{{cfd}}//is; $destinationText =~ s/\{\{cfm.*?\}\}\s*//is; $destinationText =~ s/\{\{cfr.*?\}\}\s*//is; $destinationText =~ s/\{\{cfru.*?\}\}\s*//is; $destinationText =~ s/^\s+//s; $destinationText =~ s/\s+$//s; # To help keep things straight when we're in a loop. 
print STDOUT "\n----\n"; if (($sourceText eq "") and ($destinationText ne "")) { # The HTML comment must be the same as that above. $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on [[WP:CFD]].\n<!--PEARLE-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n"; } elsif (($sourceText ne "") and ($destinationText eq "")) { $destinationText = $sourceText; # The HTML comment must be the same as that above. $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on [[WP:CFD]].\n<!--PEARLE-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n"; } elsif (($sourceText ne "") and ($destinationText ne "")) { @sourceCategories = $sourceText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs; @destinationCategories = $destinationText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs; $sourceText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs; $sourceText =~ s/^\s+//s; $sourceText =~ s/\s+$//s; $destinationText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs; $destinationText =~ s/^\s+//s; $destinationText =~ s/\s+$//s; $destinationText = $sourceText."\n".$destinationText; $destinationText =~ s/^\s+//s; $destinationText =~ s/\s+$//s; foreach $category (sort (@sourceCategories, @destinationCategories)) { if ($category eq $lastCategory) { next; } $destinationText .= "\n[[${category}]]"; $lastCategory = $category; } # The HTML comment must be the same as that above. $sourceText = "{{cfd}}\nThis category has been moved to [[:$destinationHuman]]. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on [[WP:CFD]].\n<!--PEARLE-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->\n"; } $sourceText =~ s/\n\s+\n/\n\n/sg; $destinationText =~ s/\n\s+\n/\n\n/sg; # You may need to futz with this, depending on the templates # currently in use. unless (($sourceTextOrig =~ m/\{\{cfd\}\}/i) or ($sourceTextOrig =~ m/\{\{cfr/i) or ($sourceTextOrig =~ m/\{\{cfm/i)) { print STDOUT "FATAL ERROR: $source was not tagged {{cfd}}, {{cfm}}, {{cfr}}, or {{cfru}}!\n"; myLog("FATAL ERROR: $source was not tagged {{cfd}}, {{cfr}}, {{cfm}}, or {{cfru}}!\n"); die("FATAL ERROR: $source was not tagged {{cfd}}, {{cfr}}, {{cfm}}, or {{cfru}}!\n"); } if (($sourceText eq $sourceTextOrig) and ($destinationText eq $destinationTextOrig)) { print STDOUT "No changes for $source and $destination.\n"; return(); } if ($destinationTextOrig =~ m/^\s*$/) { print "No merging was required from $source into $destination.\n"; $noMergeFlag = "yes"; } unless ($noMergeFlag eq "yes") { $destinationText .= "{{pearle-manual-cleanup}}\n"; } # Make sure category and interwiki links conform to style # guidelines. ($destinationText, $junk) = fixCategoryInterwiki($destinationText); # If we did have to change things around, print the changes and post them to the wiki. if ($sourceText ne $sourceTextOrig) { unless ($noMergeFlag eq "yes") { print STDOUT "SOURCE FROM:\n%%%${sourceTextOrig}%%%\nSOURCE TO:\n%%%${sourceText}%%%\n"; } postPage ($source, $sourceTime, $sourceStartTime, $sourceToken, $sourceText, $comment); } if ($destinationText ne $destinationTextOrig) { unless ($noMergeFlag eq "yes") { print STDOUT "DESTINATION FROM:\n%%%${destinationTextOrig}%%%\nDESTINATION TO:\n%%%${destinationText}%%%\n"; } postPage ($destination, $destinationTime, $destinationStartTime, $destinationToken, $destinationText, $comment); } } # Translate from HTTP URL encoding to the native character set. 
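# Example (illustrative):
#     urlDecode("Caf%E9_%28disambiguation%29") returns "Café_(disambiguation)"
# Only %XX escapes are translated; underscores and '+' are left as-is.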
sub urlDecode { my ($input); $input = $_[0]; $input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg; return ($input); } # Translate from the native character set to HTTP URL encoding. sub urlEncode { my ($char, $input, $output); $input = $_[0]; foreach $char (split("",$input)) { # if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/) # The below exclusions should conform to Wikipedia practice # (possibly non-standard) if ($char =~ m/[a-z|A-Z|0-9|\-_\.\/:]/) { $output .= $char; } elsif ($char eq " ") { $output .= "+"; } else { $output .= uc(sprintf("%%%x", ord($char))); # %HH where HH is the (Unicode?) hex code of $char } } return ($output); } # perl pearle.pl CHANGE_CATEGORY Article_name Category:From Category:To sub changeCategory { my ($articleName, $categoryFrom, $categoryTo, $editTime, $startTime, $text, $comment, $catTmp, $sortkey, $token, $junk, $categoryFromUnd); $articleName = $_[0]; $categoryFrom = $_[1]; $categoryTo = $_[2]; #urlSafe($articleName); #urlSafe($categoryFrom); #urlSafe($categoryTo); unless (($categoryFrom =~ m/^Category:/) and ($categoryTo =~ m/^Category:/)) { myLog ("moveCategoryContents(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n"); die ("moveCategoryContents(): Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n"); } if ($articleName =~ m/^\s*$/) { myLog("changeCategory(): Null target."); die("changeCategory(): Null target."); } ($text, $editTime, $startTime, $token) = getPage($articleName); $comment = "Moving from ${categoryFrom} to ${categoryTo}"; # --- Start the removing part --- # Convert underscore to spaces; this is human-readable. $categoryFrom =~ s/_/ /g; # Insert possible whitespace $categoryFrom =~ s/^Category://; $categoryFrom = "Category:\\s*".$categoryFrom; # Escape special characters $categoryFrom =~ s%\(%\\(%g; $categoryFrom =~ s%\)%\\)%g; $categoryFrom =~ s%\'%\\\'%g; $categoryFromUnd = $categoryFrom; $categoryFromUnd =~ s/ /_/g; unless (($text =~ m/\[\[\s*${categoryFrom}\s*\]\]/is) or ($text =~ m/\[\[\s*${categoryFrom}\s*\|.*?\]\]/is) or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\]\]/is) or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\|.*?\]\]/is)) { myLog ("changeCategory.r(): $articleName is not in '$categoryFrom'.\n"); my ($nullEditFlag); # Set this to "yes" if you want mass category change attempts # to trigger null edits automatically. You should check the # category later to see if everything worked or not, to see if # any templates should be changed. The below will add a small # amount of unnecessary server load to try the null edits if # template changes haven't already been made. $nullEditFlag = "yes"; if ($nullEditFlag eq "yes") { myLog ("changeCategory(): Attempting null edit on $articleName.\n"); print "changeCategory(): Attempting null edit on $articleName.\n"; nullEdit($articleName); return(); } else { print "###${text}###\n"; die ("changeCategory.r(): $articleName is not in '$categoryFrom'.\n"); } } if ($text =~ m/^\s*\#REDIRECT/is) { myLog ("changeCategory.r(): $articleName is a redirect!\n"); die ("changeCategory.r(): $articleName is a redirect!\n"); } # We're lazy and don't fully parse the document to properly check # for escaped category tags, so there may be some unnecssary # aborts from the following, but they are rare and easily # overridden by manually editing the page in question. 
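# For example (illustrative), an article whose wikitext contains an
# escaped tag such as
#     <nowiki>[[Category:Some category]]</nowiki>
# will trigger the abort below rather than risk mangling the escaped
# markup.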
    if ($text =~ m/.*?category.*?<\/nowiki>/is)
    {
        myLog ("changeCategory.r(): $articleName has a dangerous nowiki tag!\n");
        die ("changeCategory.r(): $articleName has a dangerous nowiki tag!\n");
    }

    $text =~ m/\[\[\s*${categoryFrom}\s*\|(.*?)\]\]/is;
    $sortkey = $1;
    if ($sortkey eq "")
    {
        $text =~ m/\[\[\s*${categoryFromUnd}\s*\|(.*?)\]\]/is;
    }

    # Remove the page from the category and any trailing newline.
    $text =~ s/\[\[\s*${categoryFrom}\s*\|?(.*?)\]\]\n?//isg;
    $text =~ s/\[\[\s*${categoryFromUnd}\s*\|?(.*?)\]\]\n?//isg;

    # --- Start the adding part ---

    # Remove any newlines at the end of the document.
    $text =~ s/\n*$//s;

    $catTmp = $categoryTo;

    # _ and spaces are equivalent and may be intermingled in wikicode.
    $catTmp =~ s/Category:\s*/Category:\\s*/g;
    $catTmp =~ s/_/[_ ]/g;
    $catTmp =~ s%\(%\\\(%g;
    $catTmp =~ s%\)%\\\)%g;
    $catTmp =~ s%\.%\\\.%g;

    if (($text =~ m/(\[\[\s*${catTmp}\s*\|.*?\]\])/is)
        or ($text =~ m/(\[\[\s*${catTmp}\s*\]\])/is))
    {
        myLog ("changeCategory.a(): $articleName is already in '$categoryTo'.\n");
        print "\n1: '${1}'\n";
        print "\ncattmp: '${catTmp}'\n";
        print "changeCategory.a(): $articleName is already in '$categoryTo'.\n";
        ## It's generally OK to merge it in, so don't do this:
        # die "changeCategory.a(): $articleName is already in '$categoryTo'.\n";
        # return();
    }
    elsif ($text =~ m/^\s*\#REDIRECT/is)
    {
        print "changeCategory.a(): $articleName is a redirect!\n";
        myLog ("changeCategory.a(): $articleName is a redirect!\n");
        return();
    }
    else
    {
        # Convert underscore to spaces; this is human-readable.
        $categoryTo =~ s/_/ /g;

        # Add the category on a new line.
        if ($sortkey eq "")
        {
            $text .= "\n[[${categoryTo}]]";
        }
        else
        {
            $text .= "\n[[${categoryTo}|${sortkey}]]";
        }
    }

    # --- Post-processing ---
    ($text, $junk) = fixCategoryInterwiki($text);
    postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes");
}

# This function is not yet finished.  Right now it simply compares the
# membership of a given list and a given category.  Eventually, it is
# intended to be used to convert lists into categories.  This is not
# yet authorized behavior.
sub listToCat
{
    my ($lists, $cats, $list, $cat, $listText, @junk, @articlesInList,
        @articlesInCat, %articlesInCat, $article, $implement);
    $lists = $_[0];
    $cats = $_[1];
    $implement = $_[2];

    if ($implement ne "yes")
    {
        print "Diffing membership of '$lists' and '$cats'\n";
    }

    foreach $list (split(";", $lists))
    {
        $list =~ s/^\[\[:?//;
        $list =~ s/\]\]$//;
        ($listText, @junk) = getPage($list);
        $listText =~ s%<nowiki>.*?</nowiki>%%gis;
        $listText =~ s%<pre>.*?</pre>%%gis;
#
@articlesInList = (@articlesInList, $listText =~ m%\[\[(.*?)\]\]%sg); sleep 1; } foreach $cat (split(";", $cats)) { $cat =~ s/^\[\[:?//; $cat =~ s/\]\]$//; $cat =~ s/^:Category/Category/; @articlesInCat = (@articlesInCat, getCategoryArticles($cat)); sleep 1; } foreach $article (@articlesInCat) { $article = urlDecode ($article); $articlesInCat{$article} = 1; # print "In cat: $article\n"; } foreach $article (@articlesInList) { $article =~ s/\s+/_/gs; $article =~ s/\|.*$//; if (exists $articlesInCat{$article}) { # print "OK: $article\n"; delete $articlesInCat{$article}; } else { print "Only in list(s): $article\n"; } } foreach $article (sort(keys(%articlesInCat))) { print "Only in cat(s): $article\n"; } } # A little paranoia never hurt anyone. sub shellfix { my ($string, $stringTmp); $string = $_[0]; $string =~ s/([\*\?\!\(\)\&\>\<])\"\'/\\$1/g; $stringTmp = $string; $stringTmp =~ s/[Å\p{IsWord}[:alpha:][:digit:]\*,:_.\'\"\)\(\?\-\/\&\>\<\!]//g; if ($stringTmp ne "") { die ("\nUnsafe character(s) in '${string}': '$stringTmp'\n"); } return $string; } # You will not be able to use this function; it requires a dataset # processed by scripts which have not been included. (It's not # finished, anyway.) sub enforceCategoryRedirects { my ($implementActually, $line, $lineTmp, $articlesToMove, $article, $flatResults, $entry, $contents, $catTo, $lineTmp2); $implementActually = $_[0]; $flatResults = `cat data/reverse-category-links-sorted.txt | grep ^Category:Wikipedia_category_redirects`; foreach $line (split("\n", $flatResults)) { $line =~ s/^Category:Wikipedia_category_redirects <\- //; $lineTmp = shellfix($line); $lineTmp2 = $lineTmp; $lineTmp2 =~ s/^Category://; if ($line =~ m/^Category/) { $articlesToMove = `cat data/reverse-category-links-sorted.txt | grep ^${lineTmp}`; if ($articlesToMove eq "") { next; } print "ATM: $articlesToMove\n"; $entry = `egrep \"^\\([0-9]+,14,'$lineTmp2'\" data/entries-categoryredirect.txt `; $entry =~ m/^\([0-9]+,14,'$lineTmp2','(.*?)',/; $contents = $1; $contents =~ m/\{\{categoryredirect\|(.*?)\}\}/; $catTo = $1; $catTo = ":Category:".$catTo; $catTo =~ s/_/ /g; $lineTmp = $line; $lineTmp =~ s/^Category/:Category/i; $lineTmp =~ s/_/ /g; foreach $article (split("\n", $articlesToMove)) { print "ARTICLE: $article\n"; print "LINE: $line\n"; $article =~ s/^$line <\- //; print "* Move [[$article]] from [[$lineTmp]] to [[$catTo]]\n"; } } } } # A call to this recursive function handles any retries necessary to # wait out network or server problems. It's a bit of a hack. sub retry { my ($callType, @args, $i, $normalDelay, $firstRetry, $secondRetry, $thirdRetry); ($callType, @args) = @_; ### ATTENTION ### # Increasing the speed of the bot to faster than 1 edit every 10 # seconds violates English Wikipedia rules as of April, 2005, and # will cause your bot to be banned. So don't change $normalDelay # unless you know what you are doing. Other sites may have # similar policies, and you are advised to check before using your # bot at the default speed. ################# # HTTP failures are usually an indication of high server load. # The retry settings here are designed to give human editors # priority use of the server, by allowing it ample recovering time # when load is high. # Time to wait before retry on failure, in seconds $normalDelay = 10; # Normal interval between edits is 10 seconds $firstRetry = 60; # First delay on fail is 1 minute $secondRetry = 60 * 10; # Second delay on fail is 10 minutes $thirdRetry = 60 * 60; # Third delay on fail is 1 hour # SUCCESS CASE # e.g. 
retry ("success", "getPage", "0.23"); if ($callType eq "success") { myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n"); $::retryDelay = $normalDelay; if ($args[0] eq "postPage") { # If the response time is greater than 20 seconds... if ($args[1] > 20) { print "Wikipedia is very slow. Increasing minimum wait to 10 min...\n"; myLog("Wikipedia is very slow. Increasing minimum wait to 10 min...\n"); $::speedLimit = 60 * 10; } # If the response time is between 10 and 20 seconds... elsif ($args[1] > 10) { print "Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n"; myLog("Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n"); $::speedLimit = 60; } # If the response time is less than 10 seconds... else { if ($::speedLimit > 10) { print "Returning to normal minimum wait time.\n"; myLog("Returning to normal minimum wait time.\n"); $::speedLimit = 10; } } } return(); } # e.g. retry ("getPage", "George_Washington") # FAILURE CASES elsif (($::retryDelay == $normalDelay) or ($::retryDelay == 0)) { print "First retry for ".$args[0]."\n"; myLog("First retry for ".$args[0]."\n"); $::retryDelay = $firstRetry; $::speedLimit = 60 * 10; } elsif ($::retryDelay == $firstRetry) { print "Second retry for ".$args[0]."\n"; myLog("Second retry for ".$args[0]."\n"); $::retryDelay = $secondRetry; $::speedLimit = 60 * 10; } elsif ($::retryDelay == $secondRetry) { print "Third retry for ".$args[0]."\n"; myLog("Third retry for ".$args[0]."\n"); $::retryDelay = $thirdRetry; $::speedLimit = 60 * 10; } elsif ($::retryDelay == $thirdRetry) { print "Nth retry for ".$args[0]."\n"; myLog("Nth retry for ".$args[0]."\n"); $::retryDelay = $thirdRetry; $::speedLimit = 60 * 10; } else { die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n"); } # DEFAULT TO FAILURE CASE HANDLING $i = $::retryDelay; while ($i >= 0) { sleep (1); print STDERR "Waiting $i seconds for retry...\r"; $i--; } print " \r"; # DO THE ACTUAL RETRY if ($callType eq "getPage") { return(getPage(@args)); } elsif ($callType eq "postPage") { return(postPage(@args)); } elsif ($callType eq "getCategoryArticles") { return(getCategoryArticles(@args)); } elsif ($callType eq "getSubcategories") { return(getSubcategories(@args)); } elsif ($callType eq "getURL") { return(getURL(@args)); } else { myLog ("retry(): Unknown callType: $callType\n"); die ("retry(): Unknown callType: $callType\n"); } } # perl pearle ENFORCE_CFD ## This just compares the contents of Category:Categories_for_deletion ## with WP:CFD and /resolved and /unresolved. It is broken now due to ## recent changes which list all nominations on subpages. It also ## does not check above the first 200 members of the category, due to ## recent changes which paginates in 200-page blocks. 
sub enforceCFD { my (@subcats, $subcat, $cfd, $editTime, $startTime, $token, $cfdU, $cfdR); @subcats = getSubcategories("Category:Categories_for_deletion"); ($cfd, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion"); ($cfdU, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/unresolved"); ($cfdR, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/resolved"); $cfd =~ s/[\r\n_]/ /g; $cfd =~ s/\s+/ /g; $cfdU =~ s/[\r\n_]/ /g; $cfdU =~ s/\s+/ /g; $cfdR =~ s/[\r\n_]/ /g; $cfdR =~ s/\s+/ /g; foreach $subcat (@subcats) { $subcat =~ s/[\r\n_]/ /g; $subcat =~ s/\s+/ /g; $subcat = urlDecode ($subcat); unless ($cfd =~ m/$subcat/) { print "$subcat is not in WP:CFD"; if ($cfdR =~ m/$subcat/) { print " (listed on /resolved)"; } if ($cfdU =~ m/$subcat/) { print " (listed on /unresolved)"; } print "\n"; } } } # An internal function that handles the complexity of adding a # category tag to the wikicode of a page. sub addCatToText { my ($category, $text, $catTmp, $sortkey, $articleName, $junk); $category = $_[0]; $text = $_[1]; $sortkey = $_[2]; $articleName = $_[3]; unless ($category =~ m/^Category:\w+/) { myLog ("addCatToText(): Bad format on category.\n"); die ("addCatToText(): Bad format on category.\n"); } $catTmp = $category; # _ and spaces are equivalent and may be intermingled. $catTmp =~ s/Category:\s*/Category:\\s*/g; $catTmp =~ s/_/[_ ]/g; $catTmp =~ s%\(%\\\(%g; $catTmp =~ s%\)%\\\)%g; $catTmp =~ s%\.%\\\.%g; if (($text =~ m/(\[\[\s*${catTmp}\s*\|.*?\]\])/is) or ($text =~ m/(\[\[\s*${catTmp}\s*\]\])/is)) { print "addCatToText(): $articleName is already in '$category'.\n"; myLog ("addCatToText(): $articleName is already in '$category'.\n"); print "\n1: '${1}'\n"; print "\ncattmp: '${catTmp}'\n"; return("fail", $text); } if ($text =~ m/^\s*\#REDIRECT/is) { print "addCatToText(): $articleName is a redirect!\n"; myLog ("addCatToText(): $articleName is a redirect!\n"); return("fail", $text); } # Convert underscore to spaces; this is human-readable. $category =~ s/_/ /g; # Add the category $text .= "\n[[$category]]"; # Move the category to the right place ($text, $junk) = fixCategoryInterwiki($text); return ("success", $text); } ### THIS ROUTINE IS CURRENTLY UNUSED ### # It will probably not be useful to you, anyway, since it requires # pre-processed database dumps which are not included in Pearle. sub getPageOffline { my ($target, $result, $targetTmp); $target = $_[0]; # Must run the following before using this function, from 200YMMDD/data: # cat entries.txt | perl ../../scripts/rewrite-entries.pl > entries-simple.txt # Even after this pre-processing, this routine is incredibly slow. # Set up and use MySQL instead if you care about speed. $target =~ s/\s/_/g; # Double escape the tab, once for Perl, once for the shell # -P means "treat as Perl regexp" (yay!) # $result = `grep -P '^${target}\\t' /home/beland/wikipedia/20050107/data/entries-simple.txt`; $targetTmp = shellfix($target); $result = `grep -P '^${targetTmp}\\t' /home/beland/wikipedia/20050107/data/matches2.txt`; $result =~ s/^${target}\t//; $result =~ s/\\n/\n/g; return ($result, "junk"); } # --- CATEGORY AND INTERWIKI STYLE CLEANUP ROUTINES --- # perl pearle.pl INTERWIKI_LOOP # ## This command is for remedial cleanup only, and so is probably not ## useful anymore. This loop takes input of the form: ## "ArticleName\tBodyText\n{repeat...}" on STDIN. 
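## Example input line (illustrative; <TAB> marks a literal tab, and
## "\n" is the two-character escape used for newlines in the body):
##   Some_article<TAB>Body text line one\nBody text line two\n[[Category:Foo]]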
# sub interwikiLoop { my ($article, $text, @junk, $enforceCategoryInterwikiCalls); while (<STDIN>) { if ($_ =~ m/^\s*$/) { next; } ($article, $text, @junk) = split ("\t", $_); $text =~ s/\\n/\n/g; enforceCategoryInterwiki($article, $text); $enforceCategoryInterwikiCalls++; print STDOUT "\r interwikiLoop iteration ".$enforceCategoryInterwikiCalls; } } # perl pearle.pl ENFORCE_CATEGORY_INTERWIKI Article_name # ## This function is for both external use. From the command line, use ## it to tidy up a live page's category and interwiki tags, specifying ## only the name of the page. It can also be used by interwikiLoop(), ## which supplies the full text on its own. It will post any changes ## to the live wiki that involve anything more than whitespace ## changes. ## ## This function also does {{msg:foo}} -> {{foo}} conversion, so that ## the article parsing algorithm can be recycled. # sub enforceCategoryInterwiki { my ($articleName, $text, $editTime, $startTime, $textOrig, @newLines, $line, $textCopy, $textOrigCopy, $message, @junk, $diff, $token, $online); $articleName = $_[0]; myLog("enforceCategoryInterwiki($articleName)\n"); $text = $_[1]; $online = 0; if ($text eq "") { $online = 1; ($text, $editTime, $startTime, $token) = getPage($articleName); } $textOrig = $text; ($text, $message) = fixCategoryInterwiki($text); if (substantiallyDifferent($text, $textOrig)) { @newLines = split ("\n", $text); $textCopy = $text; $textOrigCopy = $textOrig; open (ONE, ">/tmp/article1.$$"); print ONE $textOrig; close (ONE); open (TWO, ">/tmp/article2.$$"); print TWO $text; close (TWO); $diff = `diff /tmp/article1.$$ /tmp/article2.$$`; unlink("/tmp/article1.$$"); unlink("/tmp/article2.$$"); myLog("*** $articleName - $message\n"); myLog("*** DIFF FOR $articleName\n"); myLog($diff); if ($online == 0) { # Isolate changed files for later runs open (FIXME, ">>./fixme.interwiki.txt.$$"); $text =~ s/\t/\\t/g; $text =~ s/\n/\\n/g; print FIXME $articleName."\t".$text."\n"; close (FIXME); } myLog($articleName." changed by fixCategoryInterwiki(): $message\n"); print STDOUT $articleName." changed by fixCategoryInterwiki(): $message\n"; if ($online == 1) { postPage ($articleName, $editTime, $startTime, $token, $text, $message, "yes"); } } else { print STDOUT "--- No change for ${articleName}.\n"; myLog ("--- No change for ${articleName}.\n"); ### TEMPORARY ### ### Uncomment this line if you want category changes to ### trigger null edits. This is useful if you have have ### changed the category on a template, but due to a bug this ### does not actually move member articles until they are ### edited. postPage ($articleName, $editTime, $startTime, $token, $textOrig, "null edit", "yes"); ### TEMPORARY ### } } sub substantiallyDifferent { my($a, $b); $a = $_[0]; $b = $_[1]; $a =~ s/\s//g; $b =~ s/\s//g; return ($a ne $b); } # Given some wikicode as input, this function will tidy up the # category and interwiki links and return the result and a comment # suitable for edit summaries. sub fixCategoryInterwiki { my ($input, @segmentNames, @segmentContents, $langlist, $i, $message, $output, $flagForReview, $interwikiBlock, $categoryBlock, $flagError, $bodyBlock, $contents, $name, @interwikiNames, @interwikiContents, @categoryNames, @categoryContents, @bodyNames, @bodyContents, $bodyFlag, @bottomNames, @bottomContents, @segmentNamesNew, @segmentContentsNew, $lastContents, @stubContents, @stubNames, $stubBlock, $msgFlag); $input = $_[0]; # The algorithm here is complex. 
    # The general idea is to split the
    # page into segments, each of which is assigned a type, and then
    # to rearrange, consolidate, and frob the segments as needed.

    # Start with one segment that includes the whole page.
    @::segmentNames = ("bodyText");
    @::segmentContents = ($input);

    # Recognize and tag certain types of segments.  The order of
    # processing is very important.
    # (Note: the literal nowiki open/close tags inside the next pattern
    # are stripped from this page's rendered view; they are restored here.)
    metaTagInterwiki("nowiki", "^(.*?)(<nowiki>\s*.*?\s*</nowiki>)");
    metaTagInterwiki("comment", "^(.*?)(<!.*?>\\n?)");
    metaTagInterwiki("html", "^(.*?)(<.*?>\\n?)");
    metaTagInterwiki("category", "^(.*?)(\\[\\[\\s*Category\\s*:\\s*.*?\\]\\]\\n?)");
    $langlist = `cat /home/beland/wikipedia/pearle-wisebot/langlist`;
    $langlist =~ s/^\s*//s;
    $langlist =~ s/\s*$//s;
    $langlist =~ s/\n/\|/gs;
    $langlist .= "|minnan|zh\-cn|zh\-tw|nb";
    metaTagInterwiki("interwiki", "^(.*?)(\\[\\[\\s*($langlist)\\s*:\\s*.*?\\]\\]\\n?)");
    metaTagInterwiki("tag", "^(.*?)(\{\{.*?\}\})");

    # Allow category and interwiki segments to be followed by HTML
    # comments only (plus any intervening whitespace).
    $i = 0;
    while ($i < @::segmentNames)
    {
        $name = $::segmentNames[$i];
        $contents = $::segmentContents[$i];

        # {{msg:foo}} -> {{foo}} conversion
        if (($name eq "tag") and ($contents =~ m/^{{msg:(.*?)}}/))
        {
            $msgFlag = 1;
            $contents =~ s/^{{msg:(.*?)}}/{{$1}}/;
        }

        if (($name eq "category") or ($name eq "interwiki"))
        {
            if (!($contents =~ m/\n/)
                and ($::segmentNames[$i+1] eq "comment"))
            {
                push (@segmentNamesNew, $name);
                push (@segmentContentsNew, $contents.$::segmentContents[$i+1]);
                $i += 2;
                # DEBUG print "AAA - ".$contents.$::segmentContents[$i+1]);
                next;
            }
            if (!($contents =~ m/\n/)
                and ($::segmentNames[$i+1] eq "bodyText")
                and ($::segmentContents[$i+1] =~ m/^\s*$/)
                and !($::segmentContents[$i+1] =~ m/^\n$/)
                and ($::segmentNames[$i+2] eq "comment") )
            {
                push (@segmentNamesNew, $name);
                push (@segmentContentsNew, $contents.$::segmentContents[$i+1].$::segmentContents[$i+2]);
                $i += 3;
                # DEBUG print "BBB".$contents.$::segmentContents[$i+1].$::segmentContents[$i+2]);
                next;
            }
            # Consolidate with any following whitespace
            if (($::segmentNames[$i+1] eq "bodyText")
                and ($::segmentContents[$i+1] =~ m/^\s*$/) )
            {
                push (@segmentNamesNew, $name);
                push (@segmentContentsNew, $contents.$::segmentContents[$i+1]);
                $i += 2;
                next;
            }
        }
        push (@segmentNamesNew, $name);
        push (@segmentContentsNew, $contents);
        $i++;
    }

    # Clean up results
    @::segmentNames = @segmentNamesNew;
    @::segmentContents = @segmentContentsNew;
    @segmentContentsNew = ();
    @segmentNamesNew = ();

    # Move category and interwiki tags that precede the body text (at
    # the top of the page) to the bottom of the page.
    $bodyFlag = 0;
    foreach $i (0 ... @::segmentNames-1)
    {
        $name = $::segmentNames[$i];
        $contents = $::segmentContents[$i];
        if ($bodyFlag == 1)
        {
            push (@segmentNamesNew, $name);
            push (@segmentContentsNew, $contents);
        }
        elsif (($name eq "category") or ($name eq "interwiki"))
        {
            push (@bottomNames, $name);
            push (@bottomContents, $contents);
        }
        else
        {
            push (@segmentNamesNew, $name);
            push (@segmentContentsNew, $contents);
            $bodyFlag = 1;
        }
    }

    # Clean up results
    @::segmentNames = (@segmentNamesNew, @bottomNames);
    @::segmentContents = (@segmentContentsNew, @bottomContents);
    @segmentContentsNew = ();
    @segmentNamesNew = ();
    @bottomNames = ();
    @bottomContents = ();

    # Starting at the bottom of the page, isolate category, interwiki,
    # and body text.  If categories or interwiki links are mixed with
    # body text, flag for human review.
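    # EXAMPLE (editor's illustration, not part of the original code):
    # at this point a page such as
    #     "Intro text.\n[[Category:Foo|Bar]]\n[[de:Beispiel]]\n{{geo-stub}}\n"
    # has been tokenized into segments roughly like
    #     bodyText  => "Intro text.\n"
    #     category  => "[[Category:Foo|Bar]]\n"
    #     interwiki => "[[de:Beispiel]]\n"
    #     tag       => "{{geo-stub}}"
    # and the loop below walks them from the bottom up, peeling trailing
    # category, interwiki, and stub segments off into their own blocks.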
    ### DEBUG ###
    # foreach $i (0 ... @::segmentNames-1)
    # {
    #     print "---$i ".$::segmentNames[$i]."---\n";
    #     print "%%%".$::segmentContents[$i]."%%%\n";
    # }
    ### DEBUG ###

    ### DEBUG ###
    #my ($first);
    #$first = 1;
    ### DEBUG ###

    $bodyFlag = 0;
    $flagForReview = 0;
    foreach $i (reverse(0 ... @::segmentNames-1))
    {
        $name = $::segmentNames[$i];
        $contents = $::segmentContents[$i];
        if (($name eq "category") and ($bodyFlag == 0))
        {
            # Push in reverse
            @categoryNames = ($name, @categoryNames);
            @categoryContents = ($contents, @categoryContents);
            next;
        }
        elsif (($name eq "interwiki") and ($bodyFlag == 0))
        {
            # Push in reverse
            @interwikiNames = ($name, @interwikiNames);
            @interwikiContents = ($contents, @interwikiContents);
            next;
        }
        elsif (($bodyFlag == 0)
               and ($name eq "tag")
               and (($contents =~ m/\{\{[ \w\-]+[\- ]?stub\}\}/)
                    or ($contents =~ m/\{\{[ \w\-]+[\- ]?stub\|.*?\}\}/)))
        {
            ### IF THIS IS A STUB TAG AND WE ARE STILL $bodyFlag == 0,
            ### THEN ADD THIS TO $stubBlock!

            # Canonicalize by making {{msg:Foo}} into {{Foo}}
            $contents =~ s/^\{\{\s*msg:(.*?)\}\}/\{\{$1\}\}/i;

            # Push in reverse
            @stubNames = ($name, @stubNames);
            @stubContents = ($contents, @stubContents);
            next;
        }
        elsif (($name eq "category") or ($name eq "interwiki")) # bodyFlag implicitly == 1
        {
            if ($flagForReview == 0)
            {
                $flagForReview = 1;
                $lastContents =~ s/^\s*//s;
                $lastContents =~ s/\s*$//s;
                $flagError = substr ($lastContents, 0, 30);
            }
            # Drop down to push onto main body stack.
        }

        # Handle this below instead.
        ## Skip whitespace
        #if (($contents =~ m/^\s*$/s) and ($bodyFlag == 0))
        #{
        #    next;
        #}

        # Delete these comments
        if (($bodyFlag == 0) and ($name eq "comment"))
        {
            if ( ($contents =~ m/<!--\s*interwiki links\s*-->/i)
                 or ($contents =~ m/<!--\s*interwiki\s*-->/i)
                 or ($contents =~ m/<!--\s*interlanguage links\s*-->/i)
                 or ($contents =~ m/<!--\s*categories\s*-->/i)
                 or ($contents =~ m/<!--\s*other languages\s*-->/i)
                 or ($contents =~ m/<!--\s*The below are interlanguage links.\s*-->/i) )
            {
                ### DEBUG ###
                #print STDOUT ("YELP!\n");
                #
                #foreach $i (0 ... @bodyNames-1)
                #{
                #    print "---$i ".$bodyNames[$i]."---\n";
                #    print "%%%".$bodyContents[$i]."%%%\n";
                #}
                #
                #print STDOUT ("END-YELP!");
                ### DEBUG ###
                next;
            }
        }

        # Push onto main body stack (in reverse).
        @bodyNames = ($name, @bodyNames);
        @bodyContents = ($contents, @bodyContents);

        ### DEBUG ###
        #if (($flagForReview == 1) and ($first == 1))
        #{
        #    $first = 0;
        #    print "\@\@\@${lastContents}\#\#\#\n";
        #}
        ### DEBUG ###

        # This should let tags mixed in with the category and
        # interwiki links (not commingled with body text) bubble up.
        unless (($contents =~ m/^\s*$/s) or ($name eq "tag"))
        {
            $bodyFlag = 1;
        }
        $lastContents = $contents;
    }

    ### DEBUG ###
    # foreach $i (0 ... @bodyNames-1)
    # {
    #     print "---$i ".$bodyNames[$i]."---\n";
    #     print "%%%".$bodyContents[$i]."%%%\n";
    # }
    # foreach $i (0 ... @categoryNames-1)
    # {
    #     print "---$i ".$categoryNames[$i]."---\n";
    #     print "^^^".$categoryContents[$i]."^^^\n";
    # }
    # foreach $i (0 ... @interwikiNames-1)
    # {
    #     print "---$i ".$interwikiNames[$i]."---\n";
    #     print "&&&".$interwikiContents[$i]."&&&\n";
    # }
    ### DEBUG ###

    # Assemble body text, category, interwiki, and stub arrays into strings
    foreach $i (0 ... @bodyNames-1)
    {
        $name = $bodyNames[$i];
        $contents = $bodyContents[$i];
        $bodyBlock .= $contents;
    }
    foreach $i (0 ... @categoryNames-1)
    {
        $name = $categoryNames[$i];
        $contents = $categoryContents[$i];

        # Enforce style conventions
        $contents =~ s/\[\[category\s*:\s*/\[\[Category:/i;

        # Enforce a single newline at the end of each category line.
        $contents =~ s/\s*$//;
        $categoryBlock .= $contents."\n";
    }
    foreach $i (0 ...
@interwikiNames-1) { $name = $interwikiNames[$i]; $contents = $interwikiContents[$i]; # Canonicalize minnan to zh-min-nan, since that's what's in # the officially distributed langlist. $contents =~ s/^\[\[minnan:/\[\[zh-min-nan:/; # Canonicalize zh-ch, Chinese (simplified) and zn-tw, Chinese # (traditional) to "zh"; the distinction is being managed # implicitly by software now, not explicitly in wikicode. $contents =~ s/^\[\[zh-cn:/\[\[zh:/g; $contents =~ s/^\[\[zh-tw:/\[\[zh:/g; # Canonicalize nb to no $contents =~ s/^\[\[nb:/\[\[no:/g; # Canonicalize dk to da $contents =~ s/^\[\[dk:/\[\[da:/g; # Enforce a single newline at the end of each interwiki line. $contents =~ s/\s*$//; $interwikiBlock .= $contents."\n"; } foreach $i (0 ... @stubNames-1) { $name = $stubNames[$i]; $contents = $stubContents[$i]; # Enforce a single newline at the end of each stub line. $contents =~ s/\s*$//; $contents =~ s/^\s*//; $stubBlock .= $contents."\n"; } # Minimize interblock whitespace $bodyBlock =~ s/^\s*//s; $bodyBlock =~ s/\s*$//s; $categoryBlock =~ s/^\s*//s; $categoryBlock =~ s/\s*$//s; $interwikiBlock =~ s/^\s*//s; $interwikiBlock =~ s/\s*$//s; $stubBlock =~ s/^\s*//s; $stubBlock =~ s/\s*$//s; # Assemble the three blocks into a single string, flagging for # human review if necessary. $output = ""; if ($bodyBlock ne "") { $output .= $bodyBlock."\n\n"; } if (($flagForReview == 1) and !($input =~ m/\{\{interwiki-category-check/) and !($input =~ m/\{\{split/) and !($input =~ m/\[\[Category:Pages for deletion\]\]/)) { $output .= "{{interwiki-category-check|${flagError}}}\n\n"; } if ($categoryBlock ne "") { $output .= $categoryBlock."\n"; } if ($interwikiBlock ne "") { # $output .= "<!-- The below are interlanguage links. -->\n".$interwikiBlock."\n"; $output .= $interwikiBlock."\n"; } if ($stubBlock ne "") { $output .= $stubBlock."\n"; } if ($input ne $output) { $message = "Minor category, interwiki, or template style cleanup"; if ($flagForReview == 1) { $message = "Flagged for manual review of category/interwiki style"; } if ($msgFlag == 1) { $message .= "; {{msg:foo}} -> {{foo}} conversion for MediaWiki 1.5+ compatibility"; } } else { $message = "No change"; } return($output, $message); } #sub displayInterwiki #{ # my ($i); # ## THIS FUNCTION CANNOT BE CALLED DUE TO SCOPING; YOU MUST MANUALLY # ## COPY THIS TEXT INTO fixCategoryInterwiki(). IT IS ONLY USEFUL # ## FOR DIAGNOSTIC PURPOSES. # # foreach $i (0 ... @::segmentNames-1) # { # print "---$i ".$::segmentNames[$i]."---\n"; # print "%%%".$::segmentContents[$i]."%%%\n"; # } #} # A subroutine of fixCategoryInterwiki(), this function isolates # certain parts of existing segments based on a regular expression # pattern, and tags them with the supplied name (which indicates their # type). Sorry for the global variables. sub metaTagInterwiki { my ($tag, $pattern, $i, $meta, $body, @segmentNamesNew, @segmentContentsNew, $name, $contents, $bodyText, ); $tag = $_[0]; $pattern = $_[1]; foreach $i (0 ... 
@::segmentNames-1) { $name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; unless ($name eq "bodyText") { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); next; } while (1) { if ($contents =~ m%$pattern%is) { $bodyText = $1; $meta = $2; if ($bodyText ne "") { push (@segmentNamesNew, "bodyText"); push (@segmentContentsNew, $bodyText); } push (@segmentNamesNew, $tag); push (@segmentContentsNew, $meta); $contents =~ s/\Q${bodyText}${meta}\E//s; } else { if ($contents ne "") { push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); } last; } } } @::segmentNames = @segmentNamesNew; @::segmentContents = @segmentContentsNew; @segmentContentsNew = (); @segmentNamesNew = (); } sub nullEdit { my ($text, $articleName, $comment, $editTime, $startTime, $token); $articleName = $_[0]; # Only set this to "yes" if you are doing a bunch of null edits # and don't care about failures. $::roughMode = "no"; print "nullEdit($articleName)\n"; myLog ("nullEdit($articleName)\n"); ($text, $editTime, $startTime, $token) = getPage($articleName); unless ($text eq "") { postPage ($articleName, $editTime, $startTime, $token, $text, "null edit"); } } sub cleanupDate { my ($article, @articles); # Get all articles from Category:Wikipedia cleanup @articles = getCategoryArticles ("Category:Wikipedia cleanup"); # @articles = reverse (sort(@articles)); @articles = (sort(@articles)); foreach $article (@articles) { if (($article =~ m/^Wikipedia:/) or ($article =~ m/^Template:/) or ($article =~ m/^User:/) or ($article =~ m/talk:/i) ) { next; } cleanupDateArticle($article); limit(); } } sub cleanupDateArticle #($target) { my (@result, $link, $currentMonth, $currentYear, $junk, $line, $month, $year, $found, $lineCounter, $target); $target = $_[0]; print "cleanupDateArticle($target)\n"; @result = parseHistory($target); ($currentMonth, $currentYear, $junk) = split(" ", $result[0]); $found = ""; foreach $line (@result) { $lineCounter++; ($month, $year, $link) = split(" ", $line); if (($month eq $currentMonth) and ($year eq $currentYear)) { # print "$month $year - SKIP\n"; next; } # Skip this, because it produces false positives on articles that were # protected at the end of last month, but no longer are. The correct # thing to do is to check if an article is CURRENTLY protected by # fetching the current version, but this seems like a waste of network # resources. 
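    # --- EXAMPLE (editor's sketch, not part of the original bot) ---
    # If the extra network traffic were acceptable, the "correct thing"
    # described above could look roughly like the following (hypothetical
    # code; nothing in pearle.pl runs it): fetch the article's current
    # wikitext with getPage() and skip it only if it is protected now.
    #
    #   my ($curText) = getPage($target);
    #   if ($curText =~ m/\{\{\s*[Pp]rotected\s*\}\}/)
    #   {
    #       print "$target is currently {{protected}}; skipping\n";
    #       myLog("$target is currently {{protected}}; skipping\n");
    #       return();
    #   }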
# if (checkForTag("protected", $link) eq "yes") # { # print "$target is {{protected}}; skipping\n"; # myLog("$target is {{protected}}; skipping\n"); # return(); # } if (checkForTag("sectionclean", $link) eq "yes") { print "$target has {{sectionclean}}\n"; myLog("$target has {{sectionclean}}\n"); nullEdit($target); return(); } if (checkForTag("Sect-Cleanup", $link) eq "yes") { print "$target has {{Sect-Cleanup}}\n"; myLog("$target has {{Sect-Cleanup}}\n"); nullEdit($target); return(); } if (checkForTag("section cleanup", $link) eq "yes") { print "$target has {{section cleanup}}\n"; myLog("$target has {{section cleanup}}\n"); nullEdit($target); return(); } if (checkForTag("sectcleanup", $link) eq "yes") { print "$target has {{sectcleanup}}\n"; myLog("$target has {{sectcleanup}}\n"); nullEdit($target); return(); } if (checkForTag("cleanup-section", $link) eq "yes") { print "$target has {{cleanup-section}}\n"; myLog("$target has {{cleanup-section}}\n"); nullEdit($target); return(); } if (checkForTag("cleanup-list", $link) eq "yes") { print "$target has {{cleanup-list}}\n"; myLog("$target has {{cleanup-list}}\n"); nullEdit($target); return(); } if (checkForTag("cleanup-nonsense", $link) eq "yes") { print "$target has {{cleanup-nonsense}}\n"; myLog("$target has {{cleanup-nonsense}}\n"); nullEdit($target); return(); } if ((checkForTag("cleanup", $link) eq "yes") or (checkForTag("clean", $link) eq "yes") or (checkForTag("CU", $link) eq "yes") or (checkForTag("cu", $link) eq "yes") or (checkForTag("cleanup-quality", $link) eq "yes") or (checkForTag("tidy", $link) eq "yes")) { $currentMonth = $month; $currentYear = $year; # print "$month $year - YES\n"; next; } else { # print "$month $year - NO\n"; # print "Tag added $currentMonth $currentYear\n"; $found = "Tag added $currentMonth $currentYear\n"; last; } } if ($found eq "") { # print "HISTORY EXHAUSTED\n"; if ($lineCounter < 498) { $found = "Tag added $currentMonth $currentYear\n"; } else { # print "Unable to determine when tag was added to $target.\n"; myLog("Unable to determine when tag was added to $target.\n"); die("Unable to determine when tag was added to $target.\n"); } } if ($found ne "") { changeTag("cleanup", "cleanup-date\|${currentMonth} ${currentYear}", $target) || changeTag("tidy", "cleanup-date\|${currentMonth} ${currentYear}", $target) || changeTag("CU", "cleanup-date\|${currentMonth} ${currentYear}", $target) || changeTag("cu", "cleanup-date\|${currentMonth} ${currentYear}", $target) || changeTag("cleanup-quality", "cleanup-date\|${currentMonth} ${currentYear}", $target) || changeTag("clean", "cleanup-date\|${currentMonth} ${currentYear}", $target) || nullEdit($target); } } sub changeTag { my ($tagFrom, $tagFromUpper, $tagTo, $tagToUpper, $articleName, $editTime, $startTime, $text, $token, $comment, $junk); $tagFrom = $_[0]; # "cleanup" $tagTo = $_[1]; # "cleanup-date|August 2005" $articleName = $_[2]; # Article name print "changeTag (${tagFrom}, ${tagTo}, ${articleName})\n"; myLog("changeTag (${tagFrom}, ${tagTo}, ${articleName})\n"); $tagFromUpper = ucfirst($tagFrom); $tagToUpper = ucfirst($tagTo); if ($articleName =~ m/^\s*$/) { myLog("changeTag(): Null target."); die("changeTag(): Null target."); } ($text, $editTime, $startTime, $token) = getPage($articleName); unless (($text =~ m/\{\{\s*\Q$tagFrom\E\s*\}\}/) or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\s*\}\}/) or ($text =~ m/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}/) or ($text =~ m/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}/) or ($text =~ m/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}/) or ($text =~ 
m/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}/) or ($text =~ m/\{\{\s*\Q$tagFrom\E\|.*?\s*\}\}/) or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\|.*?\s*\}\}/) ) { myLog("changeTag(): {{$tagFrom}} is not in $articleName.\n"); print "changeTag(): {{$tagFrom}} is not in $articleName.\n"; # die("changeTag(): {{$tagFrom}} is not in $articleName.\n"); ### TEMPORARY ### # <nowiki> Just skip articles with {{tidy}}, {{clean}} {{sectionclean}}, {{advert}}, etc. sleep(1); # READ THROTTLE! return(0); } if (($text =~ m/\{\{\s*\Q$tagTo\E\s*\}\}/) or ($text =~ m/\{\{\s*\Q$tagToUpper\E\s*\}\}/)) { myLog("changeTag(): $articleName already contains {{$tagTo}}."); die("changeTag(): $articleName already contains {{$tagTo}}."); } if ($text =~ m/^\s*\#REDIRECT/is) { myLog ("changeTag.a(): $articleName is a redirect!\n"); die ("changeTag.a(): $articleName is a redirect!\n"); sleep(1); # READ THROTTLE! return(0); } # Escape special characters $tagFrom =~ s%\(%\\(%g; $tagFrom =~ s%\)%\\)%g; $tagFrom =~ s%\'%\\\'%g; # We're lazy and don't fully parse the document to properly check # for escaped tags, so there may be some unnecssary aborts from # the following, but they are rare and easily overridden by # manually editing the page in question. if (($text =~ m/<nowiki>.*?\Q$tagFrom\E.*?<\/nowiki>/is) or ($text =~ m/<pre>.*?\Q$tagFrom\E.*?<\/pre>/is)) # <pre> { myLog ("changeTag.r(): $articleName has a dangerous nowiki or pre tag!\n"); die ("changeTag.r(): $articleName has a dangerous nowiki or pre tag!\n"); } # Make the swap! $text =~ s/\{\{\s*\Q$tagFrom\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\Q$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\Q$tagFrom\E\|(.*?)\s*\}\}/{{${tagTo}}}/g; $text =~ s/\{\{\s*\Q$tagFromUpper\E\|(.*?)\s*\}\}/{{${tagTo}}}/g; # Tidy up the article in general ($text, $junk) = fixCategoryInterwiki($text); # Post the changes $comment = "Changing \{\{${tagFrom}\}\} to \{\{${tagTo}\}\}"; postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes"); return (1); } sub parseHistory { my ($pageName, $html, @lines, $line, $date, $month, $year, $htmlCopy, $link, @result, $pageNameSafe); $pageName = $_[0]; $pageNameSafe = $pageName; $pageNameSafe =~ s/&/%26/g; $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageNameSafe}&action=history&limit=500"); $htmlCopy = $html; $html =~ s%^.*?<ul id="pagehistory">%%s; $html =~ s%(.*?)</ul>.*$%$1%s; $html =~ s%</li>\s*%%s; @lines = split ("</li>", $html); foreach $line (@lines) { $line =~ s/\n/ /g; if ($line =~ m/^\s*$/) { next; } $line =~ s%<span class='user'>.*?$%%; $line =~ s%^.*?Select a newer version for comparison%%; $line =~ s%^.*?Select a older version for comparison%%; $line =~ s%^.*?name="diff" />%%; # print "LINE: ".$line."\n"; $line =~ m%<a href="(.*?)" title="(.*?)">(.*?)</a>%; $link = $1; $date = $3; # print $link." 
/ $date\n"; if ($date =~ m/Jan/) { $month = "January"; } elsif ($date =~ m/Feb/) { $month = "February"; } elsif ($date =~ m/Mar/) { $month = "March"; } elsif ($date =~ m/Apr/) { $month = "April"; } elsif ($date =~ m/May/) { $month = "May"; } elsif ($date =~ m/Jun/) { $month = "June"; } elsif ($date =~ m/Jul/) { $month = "July"; } elsif ($date =~ m/Aug/) { $month = "August"; } elsif ($date =~ m/Sep/) { $month = "September"; } elsif ($date =~ m/Oct/) { $month = "October"; } elsif ($date =~ m/Nov/) { $month = "November"; } elsif ($date =~ m/Dec/) { $month = "December"; } else { $month = "Unknown month"; myLog ("Unknown month - parse failure! $line\nHTML:\n$html\n"); die ("Unknown month - parse failure! (see log) LINE: $line\n"); } $date =~ m/(\d\d\d\d)/; $year = $1; @result = (@result, "$month $year $link"); } return (@result); } sub checkForTag #($targetURLWithOldIDAttached) { my ($tag, $target, $text); $tag = $_[0]; $target = $_[1]; # Must be absolute; assuming English Wikipedia here. $target =~ s%^/w/index.php%http://en.wikipedia.org/w/index.php%; # Decode HTML entities in links $target =~ s/\&/\&/g; if ($target eq $::cachedTarget) { $text = $::cachedText; } else { $text = getURL ($target."&action=edit"); $::cachedTarget = $target; $::cachedText = $text; } if ($text =~ m/\{\{\s*\Q$tag\E\s*\}\}/) { # print $text; die "Cough!"; return "yes"; } $tag = ucfirst($tag); if ($text =~ m/\{\{\s*\Q$tag\E\s*\}\}/) { # print "\n\nSneeze!\n\n"; print $text."\n\n"; return "yes"; } return "no"; } sub getURL #($target) { # Read throttle! sleep (1); my ($attemptStartTime, $attemptFinishTime, $request, $response, $reply, $url); $url = $_[0]; # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); # Create a request-object print "GET ${url}\n"; myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = $::ua->request($request); if ($response->is_success) { $reply = $response->content; # Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "getURL", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); # This may or may not actually work $::ua->cookie_jar->save(); return ($reply); } else { myLog ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n"); print ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { return(retry("getURL", @_)); } else { # Unhandled HTTP response die ("getURL(): HTTP ERR (".$response->status_line.") ${url}\n"); } } } sub opentaskUpdate { my ($target, $historyFile, $opentaskText, $editTime, $startTime, $token, $key, $historyDump); $target = "Template:Opentask"; $historyFile = "/home/beland/wikipedia/pearle-wisebot/opentask-history.pl"; ($opentaskText, $editTime, $startTime, $token) = getPage($target); eval(`cat $historyFile`); $opentaskText = doOpentaskUpdate("CLEANUP", "CLEANUP", $opentaskText); $opentaskText = doOpentaskUpdate("STYLE", "Category:Wikipedia articles needing style editing", $opentaskText); $opentaskText = doOpentaskUpdate("UPDATE", "Category:Wikipedia articles in need of updating", $opentaskText); $opentaskText = doOpentaskUpdate("VERIFY", "Category:Wikipedia articles needing factual verification", $opentaskText); $opentaskText = doOpentaskUpdate("COPYEDIT", "Category:Wikipedia articles needing copy edit", $opentaskText); 
$opentaskText = doOpentaskUpdate("WIKIFY", "Category:Articles that need to be wikified", $opentaskText); $opentaskText = doOpentaskUpdate("MERGE", "Category:Articles to be merged", $opentaskText); $opentaskText = doOpentaskUpdate("NPOV", "Category:NPOV disputes", $opentaskText); # Dump history $historyDump = "\%::history = (\n"; foreach $key (sort(keys(%::history))) { $key =~ s/\"/\\\"/g; # Escape! $historyDump .= "\"${key}\" => \"".$::history{$key}."\",\n"; } $historyDump =~ s/,\n$//s; $historyDump .= "\n)\n"; open (HISTORY, ">".$historyFile); print HISTORY $historyDump; close (HISTORY); postPage ($target, $editTime, $startTime, $token, $opentaskText, "Automatic rotation of NPOV, copyedit, wikify, merge, update, style, and verify", "yes"); } sub doOpentaskUpdate { my ($categoryID, $sourceCategory, $opentaskText, @articles, $article, %rank, $featuredString, $characterLimit, $featuredStringTmp, $key, $printedFlag, $tmpKey, $l, $nl, %l, %nl, $total, $articleUnderscore, $neverListed, @articlesTmp); $categoryID = $_[0]; $sourceCategory = $_[1]; $opentaskText = $_[2]; $characterLimit = 130; if ($sourceCategory eq "CLEANUP") { @articlesTmp = (getCategoryArticles ("Category:Wikipedia articles needing priority cleanup"), getCategoryArticles ("Category:Cleanup from October 2004"), getCategoryArticles ("Category:Cleanup from November 2004"), getCategoryArticles ("Category:Cleanup from December 2004")); } else { @articlesTmp = getCategoryArticles ($sourceCategory); } # Shuffle and clean up article names; and exclude unwanted entries foreach $article (@articlesTmp) { if (($article =~ m/^Wikipedia:/) or ($article =~ m/^Template:/) or ($article =~ m/^User:/) or ($article =~ m/talk:/i) ) { next; } @articles = (@articles, $article); } foreach $article (@articles) { $article = urlDecode($article); $article =~ s/_/ /g; $articleUnderscore = $article; $articleUnderscore =~ s/ /_/g; $rank{$article} = rand() + ($::history{"${articleUnderscore}-${categoryID}"} * .5); # print " $article: ".$rank{$article}." / ".$::history{"${articleUnderscore}-${categoryID}"}."\n"; } # Pick as many articles as will fit in the space allowed foreach $article (sort {$rank{$a} <=> $rank {$b}} (keys(%rank))) { if (length($article)+1 < $characterLimit - length($featuredString)) { $featuredString .= "[[${article}]],\n"; $article =~ s/ /_/g; # Record how many times each article is featured. $::history{"${article}-${categoryID}"}++; } } $featuredStringTmp = $featuredString; $featuredStringTmp =~ s/\n/ /g; print "Featuring: $featuredStringTmp\n"; myLog("Featuring: $featuredStringTmp\n"); foreach $key (sort {$::history{$a} <=> $::history{$b}} (sort(keys (%::history)))) { if ($key =~ m/${categoryID}$/) { if ($::history{$key} > 7) { print $::history{$key}." "; } $printedFlag = 0; $tmpKey = $key; $tmpKey =~ s/\-$categoryID$//; # print " '$tmpKey' "; foreach $article (keys(%rank)) { $article =~ s/ /_/g; if ($article eq $tmpKey) { if ($::history{$key} > 7) { print "L ${key}\n"; # Still listed. } $printedFlag = 1; $l++; $l{$::history{$key}}++; } } if ($printedFlag == 0) { # if ($::history{$key} > 7) # { # print "NL ${key}\n"; # Not listed anymore; must be fixed! 
# } $nl++; $nl{$::history{$key}}++; } } } $total = $l + $nl; print "Effectiveness ratio for ${categoryID}: $l L, $nl NL ("; print sprintf("%.2f", $nl/$total)*100; print "%)\n"; foreach $article (@articles) { $articleUnderscore = $article; $articleUnderscore =~ s/ /_/g; if ($::history{"${articleUnderscore}-${categoryID}"} < 1) { $neverListed++; } } print "0 L: $neverListed\n"; foreach $key (sort(keys(%l))) { print $key." L: ".$l{$key}."\n"; } foreach $key (sort(keys(%nl))) { print $key." NL: ".$nl{$key}."\n"; } # Insert into actual page text and finish $opentaskText =~ s/(<!--START-PEARLE-INSERT-$categoryID-->).*?(<!--END-PEARLE-INSERT-$categoryID-->)/${1}\n$featuredString${2}/gs; return ($opentaskText); } # Get a list of the names of articles in a given category. sub getCategoryImages { my ($target, $request, $response, $reply, $images, @images, $attemptStartTime, $attemptFinishTime, $image, %imagesHash); $target = $_[0]; #urlSafe ($target); unless ($target =~ m/^Category:/) { myLog ("getCategoryImages(): Are you sure '$target' is a category?\n"); die ("getCategoryImages(): Are you sure '$target' is a category?\n"); } # Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time(); # Create a request-object print "GET http://en.wikipedia.org/wiki/${target}\n"; myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = $::ua->request($request); if ($response->is_success) { # Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time(); retry ("success", "getCategoryImages", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); $reply = $response->content; # This detects whether or not we're logged in. unless ($reply =~ m%<a href="/wiki/User_talk:Pearle">My talk</a>%) { # We've lost our identity. myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n"); die ("Wiki server is not recognizing me (2).\n"); } $images = $reply; $images =~ s%^.*?<table class="gallery"%%s; $images =~ s%<div class="printfooter">.*?$%%s; @images = $images =~ m%<a\s+href="/wiki/(.*?)"\s+title=\"Image:%sg; # Uniqify to prevent duplicates foreach $image (@images) { $imagesHash{$image} = 1; } @images = (); foreach $image (sort(keys(%imagesHash))) { @images = (@images, $image); } $::ua->cookie_jar->save(); return decodeArray(@images); } else { myLog ("getCategoryImages($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/) or ($response->status_line =~ m/^502/) or ($response->status_line =~ m/^503/)) { print "getCategoryImages($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"; return(decodeArray(retry("getCategoryImages", @_))); } else { # Unhandled HTTP response die ("getCategoryImages($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n"); } } }
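# --- EXAMPLE (editor's sketch, not part of the original bot) ---
# A minimal usage sketch for getCategoryImages(), assuming the bot has
# already been initialized and logged in ($::ua, cookies, myLog).  The
# sub name and the category in the commented invocation are hypothetical.
sub listCategoryImagesExample
{
    my ($category) = @_;
    my @images = getCategoryImages($category);
    foreach my $image (@images)
    {
        print "Image in ${category}: ${image}\n";
    }
    return scalar(@images);
}
# Hypothetical invocation:
#   listCategoryImagesExample("Category:Images for cleanup");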
</nowiki>