User:Cedars/gaauto.pl
From Wikipedia, the free encyclopedia
The following Perl script is a hack that automatically creates a categorized list of good articles in the same format as the good articles page. The script understands quoted, italicized and disguised article links. It uses the existing list as a basis for the new list. It removes old good articles from the revised list and offers the user the opportunity to categorize new good articles. It sorts and counts every article and can automatically adapt to use new headings and subheadings. It allows dual listings and major headings. The script uses cURL to download existing content and the Roman 1.1 Perl module to sort Final Fantasy titles. The script is designed to assist the human editing of Wikipedia articles, not replace it. It is best that users still add and remove articles from the list as they would without the script - this is because they are likely to categorize the items better than the script user. Please feel free to make changes to this page if you feel they would improve the script. If you have comments on the script please feel free to post them on the talk page.
A brief note on output
The script downloads several files to the working directory and outputs two files. The first file, "output_headings.txt", is a file listing the levels and sublevels available for categorization. This file is output before any requests for categorization are made. The second file, "output.txt", is the formatted wiki-syntax for the list. It may appear corrupt if not opened using UTF-8. The script also outputs a timestamp list of when the most recently added articles were added, "stamp.time", and a backup of the previous version of that list, "stamp.bac". If the timestamp list and backup are dramatically different from each other the script will refuse to run. This is to prevent the timestamp list from becoming distorted and thus damaging the recently added list.
To view the script properly, use edit mode.
#! /usr/bin/perl
#
# gaauto.pl - rebuilds the categorized "Good articles" list.
#
# Flow of the script:
#   1. Back up the timestamp file (stamp.time -> stamp.bac).
#   2. Read the edit page of Wikipedia:Good_articles and collect the
#      preamble, headings, subheadings, interlanguage links and articles.
#   3. Walk the pages of Category:Wikipedia_good_articles and collect the
#      article names found there.
#   4. Mark listed articles that are in the category as verified; ask the
#      user where to file category articles that are not listed yet.
#   5. Merge the stored timestamps, pick the most recently added articles
#      and rewrite stamp.time.
#   6. Write the regenerated wiki text to output.txt.

use strict;
use warnings;
use File::Copy qw(copy);
use Roman;
use open ':utf8';

# Article names are printed to the terminal and may contain non-ASCII text.
binmode(STDOUT, ':utf8');

# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------

# Download a fresh copy of files
my $DOWNLOAD = 1;
# Warn of removed articles
my $REMOVED = 1;
# Number of new articles to remember
my $NEWARTICLES = 14;
# Should open web browser or text editor
my $ADVANCED = 0;
# Adds section comments (improves editing)
my $SECTIONCOMMENTS = 0;
# Web browser and text editor commands
my $WEBBROWSER = "open";
my $TEXTEDITOR = "open";

# Host every downloaded or opened page lives on.
my $WIKI_HOST = "http://en.wikipedia.org";

# ----------------------------------------------------------------------
# Comparators
# ----------------------------------------------------------------------

# Removes the list markup ("~~..." suffixes and '' italics) from a link.
# An undefined name is treated as the empty string.
sub _strip_markup {
    my ($name) = @_;
    $name //= '';
    $name =~ s/~~.*//g;
    $name =~ s/''//g;
    return $name;
}

# Sorts article titles (sort-block wrapper around titlecmp).
sub titlesort {
    return titlecmp($a, $b);
}

# Compares two article records by the title a reader sees.
# Takes two hash references with a "name" key; returns -1, 0 or 1.
sub titlecmp {
    my ($left, $right) = @_;
    my $x = ($left  // {})->{"name"} // '';
    my $y = ($right // {})->{"name"} // '';

    # Handle Final Fantasy titles: order them by their Roman numeral.
    # The class is [XVI], not [X|V|I]: a "|" inside a character class is a
    # literal pipe and would drag the pipe of a piped link into the numeral.
    my ($x_numeral) = $x =~ /Final Fantasy ([XVI]+)/;
    my ($y_numeral) = $y =~ /Final Fantasy ([XVI]+)/;
    if (defined $x_numeral && defined $y_numeral) {
        # arabic() returns undef for an invalid numeral; count that as 0.
        return (arabic($x_numeral) // 0) <=> (arabic($y_numeral) // 0);
    }

    # Handle other titles: compare the text shown after the pipe, if any.
    $x = _strip_markup($x);
    $y = _strip_markup($y);
    if ($x =~ /.*\|(.*)/) { $x = $1; }
    if ($y =~ /.*\|(.*)/) { $y = $1; }
    return uc($x) cmp uc($y);
}

# Sorts article names (sort-block wrapper around basiccmp).
sub basicsort {
    return basiccmp($a, $b);
}

# Compares two article records by link target (the text before the pipe).
# Takes two hash references with a "name" key; returns -1, 0 or 1.
sub basiccmp {
    my ($left, $right) = @_;
    my $x = _strip_markup(($left  // {})->{"name"});
    my $y = _strip_markup(($right // {})->{"name"});
    if ($x =~ /(.*)\|.*/) { $x = $1; }
    if ($y =~ /(.*)\|.*/) { $y = $1; }
    return uc($x) cmp uc($y);
}

# Sorts article time stamps (sort-block wrapper around timecmp).
sub timesort {
    return timecmp($a, $b);
}

# Compares two article records by their "time" key, newest first.
# Negative times (articles no longer counted as new) sort after all
# non-negative ones and compare equal to each other.
sub timecmp {
    my ($left, $right) = @_;
    my $x = ($left  // {})->{"time"} // 0;
    my $y = ($right // {})->{"time"} // 0;
    return 0  if $x < 0 && $y < 0;
    return 1  if $x < 0;
    return -1 if $y < 0;
    return $y <=> $x;
}

# ----------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------

# Decodes the four HTML entities used to escape wiki text inside the edit
# page. "&amp;" is decoded last so that an entity written in the wiki text
# itself (which arrives as "&amp;lt;" and so on) is decoded exactly once.
sub unescape_html {
    my ($text) = @_;
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/"/g;
    $text =~ s/&amp;/&/g;
    return $text;
}

# Returns the lines of a file, or an empty list when it cannot be opened.
# Callers treat "no lines" as "nothing downloaded" or "no stamps yet".
sub read_lines {
    my ($path) = @_;
    open(my $fh, '<', $path) or return;
    my @lines = <$fh>;
    close($fh);
    return @lines;
}

# Downloads $url into $path with curl. The list form of system() is used so
# that the URL (which may come from a downloaded page) never reaches a shell.
# Returns true on success.
sub fetch_url {
    my ($url, $path) = @_;
    # Never leave a stale copy behind that could be mistaken for a download.
    unlink $path;
    my $status = system('curl', '-o', $path, $url);
    warn "curl failed for $url\n" if $status != 0;
    return $status == 0;
}

# Runs a configured command (which may carry its own arguments) on a target
# without involving a shell.
sub open_with {
    my ($command, $target) = @_;
    return system(split(' ', $command), $target);
}

# Prints the list of available levels and sublevels to the terminal.
sub print_headings_list {
    print for read_lines("output_headings.txt");
    return;
}

# Circular search of @$list for a record equal (by basiccmp) to $target,
# beginning at index $start. Returns the index found, or -1.
sub find_article {
    my ($list, $start, $target) = @_;
    my $count = scalar(@{$list});
    return -1 if $count == 0;
    for my $step (0 .. $count - 1) {
        my $index = ($start + $step) % $count;
        return $index if basiccmp($list->[$index], $target) == 0;
    }
    return -1;
}

# Returns the wiki markup for one article link, honouring the quoted and
# italic flags recorded when the list was parsed.
sub article_markup {
    my ($article) = @_;
    my $link = "[[" . $article->{"name"} . "]]";
    return "&quot;" . $link . "&quot;" if $article->{"quote"};
    return "''" . $link . "''"         if $article->{"italic"};
    return $link;
}

# ----------------------------------------------------------------------
# Keep backup of timestamp file
# ----------------------------------------------------------------------

if (-f "stamp.bac") {
    my $stamp_size     = (-s "stamp.time") // 0;
    my $stamp_bac_size = (-s "stamp.bac")  // 0;
    if (abs($stamp_size - $stamp_bac_size) > 1024) {
        print "Large change in timestamp file. This script will now quit to prevent data loss.\n";
        print "Please delete the \"stamp.bac\" file to continue.\n";
        exit(1);
    }
}
if (-f "stamp.time") {
    copy("stamp.time", "stamp.bac")
        or warn "Could not back up stamp.time: $!\n";
}

# ----------------------------------------------------------------------
# Read the current good articles file
# ----------------------------------------------------------------------

if ($DOWNLOAD) {
    fetch_url("$WIKI_HOST/w/index.php?title=Wikipedia:Good_articles&action=edit",
              "input_ga.html");
}

my @preamble;          # wiki text before the list, copied through verbatim
my @lang;              # interlanguage link lines
my @major_text;        # text of each major heading
my @major_icon;        # image markup of each major heading
my @headings;          # $headings[level][0] heading, [level][n] subheading
my @headings_icon;     # image markup of each heading
my @headings_major;    # index of the major heading each heading belongs to
my @subheadings_len;   # number of subheadings under each heading
my @articles;          # one hash reference per listed article
my $new_articles_image;

my $major      = -1;
my $level      = -1;
my $sublevel   = 0;
my $collecting = 0;        # true between a subheading and the next "/div"
my $state      = 'seek';   # seek -> preamble -> main

for my $raw (read_lines("input_ga.html")) {

    # Nothing is interesting until the edit box starts.
    if ($state eq 'seek') {
        $state = 'preamble' if $raw =~ /textarea/;
        next;
    }

    # Handle preamble
    if ($state eq 'preamble') {
        my $text = $raw;
        # The first preamble line still carries the end of the textarea tag.
        # Strip it before decoding so a decoded ">" in the content survives.
        $text =~ s/.*>// if !@preamble;
        $state = 'main' if $raw =~ /Gapages/;
        push @preamble, unescape_html($text);
        next;
    }

    my $line = unescape_html($raw);

    # If it is a language remember it
    if ($line =~ /\[\[[^W][^P]\:[^\]]*\]\]/) {
        push @lang, $line;
    }

    # If it is a recently added article image remember it
    if ($line =~ /colspan=2.*\[\[Image:(.*)\]\]/) {
        $new_articles_image = $1;
    }

    # If it is a major heading add it to the major headings
    if ($line =~ /<div style="padding:[^>]*>([^<]*)<\/div>/) {
        my $content = $1;
        (my $text = $content) =~ s/\[\[.*\]\]//;
        $text =~ s/'''//g;
        (my $icon = $content) =~ s/[^\]]*$//;
        $major++;
        $major_text[$major] = $text;
        $major_icon[$major] = $icon;
    }

    # If it is a heading add it to the headings
    if ($line =~ /<div class="NavHead"[^>]*>([^<]*)<\/div>/) {
        my $content = $1;
        (my $text = $content) =~ s/\[\[.*\]\]//;
        (my $icon = $content) =~ s/[^\]]*$//;
        $level++;
        $sublevel                = 0;
        $subheadings_len[$level] = 0;
        $headings[$level][0]     = $text;
        $headings_icon[$level]   = $icon;
        $headings_major[$level]  = $major;
    }

    # If it is a subheading add it to the headings and start counting
    # articles. A subheading before any heading has nowhere to go.
    if ($level >= 0 && $line =~ /=====(.*)=====$/) {
        my $title = $1;
        $sublevel++;
        $subheadings_len[$level]++;
        $headings[$level][$sublevel] = $title;
        $collecting = 1;
    }

    # If it is a div stop counting articles
    if ($line =~ /\/div/) {
        $collecting = 0;
    }

    # If it is an article add it to the articles list. The greedy ".*"
    # selects the last link on the line.
    if ($collecting && $line =~ /.*\[\[([^\]]*)\]\]/) {
        my $name    = $1;
        my %article = (
            "name"     => $name,
            "level"    => $level,
            "sublevel" => $sublevel,
            "verified" => 0,
            "multi"    => 0,
            "time"     => time(),
        );
        if ($line =~ /.*\[\[[^\]]*\]\].*<!--\ *(.*)\ *-->/) {
            (my $comment = $1) =~ s/\ +$//;
            $article{"comment"} = $comment;
        }
        $article{"italic"} = ($line =~ /^\ *''/) ? 1 : 0;
        # A quoted entry starts with a quote character, written either
        # literally or as the "&quot;" entity in the wiki text.
        $article{"quote"} = ($line =~ /^\ *(?:&quot;|")/) ? 1 : 0;
        push @articles, \%article;
    }
}

my $headings_len = $level + 1;

# Check download worked
if (!@articles) {
    print "Download of good article list failed.\n";
    exit(1);
}

# Sort the articles list
@articles = sort basicsort @articles;

# Check for multiple entries: keep the first of each run of equal names and
# remember the section of the duplicate as its secondary listing.
my @unique = ($articles[0]);
for my $i (1 .. $#articles) {
    if (basiccmp($articles[$i], $articles[$i - 1]) == 0) {
        $unique[-1]{"multi"}        = 1;
        $unique[-1]{"sec_level"}    = $articles[$i]{"level"};
        $unique[-1]{"sec_sublevel"} = $articles[$i]{"sublevel"};
    }
    else {
        push @unique, $articles[$i];
    }
}
@articles = @unique;

# ----------------------------------------------------------------------
# Go through each of the category files
# ----------------------------------------------------------------------

my @cat_articles;
my $next = "$WIKI_HOST/wiki/Category:Wikipedia_good_articles";
for (my $page = 1; defined $next; $page++) {
    my $cat_file = "input_cat$page.html";

    # Download the category file
    fetch_url($next, $cat_file) if $DOWNLOAD;
    undef $next;

    # Go through each line of the category file
    for my $raw (read_lines($cat_file)) {
        (my $line = $raw) =~ s/&amp;/&/g;

        # Every ">Talk:NAME<" on the line is an article in the category
        while ($line =~ s/>Talk:([^<]*)<//) {
            my $name = $1;
            push @cat_articles, { "name" => $name };
        }

        # Find the next category file
        if ($line =~ /<a.*href=\"([^\"]*)\"[^>]*>next 200/) {
            $next = $WIKI_HOST . $1;
        }
    }
}

# Check download worked
if (!@cat_articles) {
    print "Download of good article category failed.\n";
    exit(1);
}

# Print the headings to file
open(my $headings_fh, '>', "output_headings.txt")
    or die "Cannot write output_headings.txt: $!\n";
for my $i (0 .. $headings_len - 1) {
    for my $j (0 .. $subheadings_len[$i]) {
        if ($j == 0) {
            print {$headings_fh} $i . ".0 " . $headings[$i][$j] . "\n";
        }
        else {
            print {$headings_fh} " " . $i . "." . $j . " " . $headings[$i][$j] . "\n";
        }
    }
}
close($headings_fh) or die "Cannot write output_headings.txt: $!\n";

# ----------------------------------------------------------------------
# Match category articles against the list
# ----------------------------------------------------------------------

@cat_articles = sort basicsort @cat_articles;
my $orig = 0;    # where the next circular search starts

CATEGORY_ARTICLE:
for my $cat (@cat_articles) {
    my $cat_name = $cat->{"name"};
    my $found    = find_article(\@articles, $orig, $cat);

    # If an article is found mark it verified and adopt the category's
    # spelling of the link target, keeping a lower-case first letter.
    if ($found != -1) {
        # Always wrap the restart index; an index equal to the list length
        # can never be reached again by the circular search.
        $orig = ($found + 1) % scalar(@articles);
        my $article = $articles[$found];
        $article->{"verified"} = 1;
        my $name_lower =
            substr($article->{"name"}, 0, 1) ne substr($cat_name, 0, 1);
        $article->{"name"} =~ s/[^|]*/$cat_name/;
        if ($name_lower) {
            $article->{"name"} = lcfirst($article->{"name"});
        }
        next CATEGORY_ARTICLE;
    }

    # Otherwise ask the user where the new article belongs.
    print "Article not found: " . $cat_name . "\n";
    my $level_choice;
    LEVEL:
    while (1) {
        print "Which level do you what to assign it to? (t for list, n for ignore)\n";
        my $in = <STDIN>;
        # End of input: stop rather than file every remaining article
        # under a made-up level.
        exit(1) if !defined $in;
        $in =~ s/^\s+|\s+$//g;
        $in = lc($in);
        if ($in eq "w") {
            if ($ADVANCED) {
                print_headings_list();
                (my $artname = $cat_name) =~ s/ /_/g;
                open_with($WEBBROWSER,
                          "$WIKI_HOST/w/index.php?title=$artname");
            }
        }
        elsif ($in eq "t") {
            print_headings_list();
        }
        elsif ($in eq "exit" || $in eq "q") {
            exit(1);
        }
        elsif ($in eq "n") {
            last LEVEL;
        }
        elsif ($in =~ /^\d+$/ && $in < $headings_len) {
            $level_choice = $in + 0;
            last LEVEL;
        }
        else {
            print "Invalid level: $in\n";
        }
    }
    next CATEGORY_ARTICLE if !defined $level_choice;

    my $sublevel_choice;
    while (!defined $sublevel_choice) {
        print "Which sublevel do you what to assign it to?\n";
        my $in = <STDIN>;
        exit(1) if !defined $in;
        $in =~ s/^\s+|\s+$//g;
        if ($in =~ /^\d+$/ && $in <= $subheadings_len[$level_choice]) {
            $sublevel_choice = $in + 0;
        }
        else {
            print "Invalid sublevel: $in\n";
        }
    }

    # The record is only created once the user has placed the article, so
    # an ignored article never leaves a half-filled entry in the list.
    push @articles, {
        "name"     => $cat_name,
        "level"    => $level_choice,
        "sublevel" => $sublevel_choice,
        "verified" => 1,
        "multi"    => 0,
        "time"     => time(),
    };
}

# ----------------------------------------------------------------------
# Merge the stored time stamps
# ----------------------------------------------------------------------

# Each line of stamp.time is "[[name]] time" exactly as written below, so
# the names are compared as stored. A missing file just means no stamps.
$orig = 0;
for my $line (read_lines("stamp.time")) {
    next if $line !~ /\[\[(.*)\]\]\s*(-?\d+)?/;
    my ($name, $time) = ($1, $2);
    my %stamp = ("name" => $name, "time" => $time // 0);

    # Search the articles list for a match
    my $found = find_article(\@articles, $orig, \%stamp);
    next if $found == -1;
    $orig = ($found + 1) % scalar(@articles);

    # Assign the time stamp
    $articles[$found]{"time"} = $stamp{"time"};
}

# Find the new articles: the newest $NEWARTICLES verified articles keep
# their time stamp, every other verified article is stamped -1.
open(my $stamp_fh, '>', "stamp.time") or die "Cannot write stamp.time: $!\n";
my @new_articles;
@articles = sort timesort @articles;
for my $article (@articles) {
    next if !$article->{"verified"};
    if (@new_articles < $NEWARTICLES && $article->{"time"} != -1) {
        push @new_articles, $article;
    }
    else {
        $article->{"time"} = -1;
    }
    print {$stamp_fh} "[[" . $article->{"name"} . "]] " . $article->{"time"} . "\n";
}
close($stamp_fh) or die "Cannot write stamp.time: $!\n";
@new_articles = sort titlesort @new_articles;

# Sort the articles again
@articles = sort basicsort @articles;

# ----------------------------------------------------------------------
# Build the output
# ----------------------------------------------------------------------

# The page is assembled in memory because the article total, which has to
# be patched into the preamble, is only known once every section is built.
my $separator = " \x{2014}\n";    # em dash between consecutive articles

# Preamble
my $out = join('', @preamble);

# The recently added articles
$out .= "|-\n| colspan=2 width=\"100%\" style=\"padding:1em 1em 1em 1em; border:1px solid #dfdfdf; background-color:#E0EDFA\" valign=\"top\" align=\"center\"|";
if ($new_articles_image) {
    $out .= "[[Image:" . $new_articles_image . "]]";
}
$out .= "\n'''Recently listed good articles'''\n\n";
$out .= join($separator, map { article_markup($_) } @new_articles);
$out .= "\n|}\n\n__NOTOC__\n";
$out .= "<div style=\"clear:both;\">\n";
$out .= "<!-- DO NOT REMOVE THIS DIV, USED TO FORCE IE TO DISPLAY BACKGROUND FOR ARTS DIV -->\n";
$out .= "</div>\n";

# Go through each heading and subheading
my $total_count   = 0;
my $current_major = -1;
for my $i (0 .. $headings_len - 1) {

    # Print out major heading
    if ($headings_major[$i] > $current_major) {
        $current_major = $headings_major[$i];
        my $text = $major_text[$current_major];
        my $icon = $major_icon[$current_major];
        if ($current_major > 0) {
            $out .= "</div>\n</div>\n";
        }
        $out .= "<div style=\"clear:both;\">\n";
        $out .= "<span id=\"${text}\" />\n";
        $out .= "<div style=\"padding:5px 5px 8px 5px; background-color:#CCCCFF; text-align:left; font-size:larger;\">${icon}'''${text}'''</div>\n";
        $out .= "<div style=\"text-align:left;\">\n";
    }

    for my $j (0 .. $subheadings_len[$i]) {
        my $title = $headings[$i][$j];

        # Write the heading or subheading
        if ($j == 0) {
            my $icon = $headings_icon[$i];
            if ($i != 0) {
                $out .= "</div>\n";
                $out .= "</div>\n";
                $out .= "\n";
            }
            $out .= "<div style=\"clear:both;\" class=\"NavFrame\">\n";
            $out .= "<div class=\"NavHead\" style=\"padding:2px 2px 2px 30px; background-color:#FFFAF0; text-align:left; font-size:larger;\">${icon}${title}</div>\n";
            $out .= "<div class=\"NavContent\" style=\"text-align:left;\">\n";
            $out .= $SECTIONCOMMENTS ? "==<!--${title}--> ==\n" : "== ==\n";
        }
        else {
            $out .= "\n=====" . $title . "=====\n";
        }

        # Run through the articles adding them if they belong to the
        # current level. A secondary (dual) listing is printed but not
        # counted towards the total.
        my @section;
        for my $article (@articles) {
            if ($article->{"level"} == $i && $article->{"sublevel"} == $j) {
                if ($article->{"verified"}) {
                    push @section, $article;
                    $total_count++;
                }
                elsif ($REMOVED) {
                    print "REMOVED ARTICLE: " . $article->{"name"} . "\n";
                }
            }
            elsif ($article->{"multi"} == 1
                && $article->{"sec_level"} == $i
                && $article->{"sec_sublevel"} == $j)
            {
                push @section, $article if $article->{"verified"};
            }
        }
        next if !@section;

        # Then sort and print the articles
        @section = sort titlesort @section;
        $out .= join(
            $separator,
            map {
                my $entry = article_markup($_);
                $entry .= " <!-- " . $_->{"comment"} . " -->" if $_->{"comment"};
                $entry;
            } @section
        );
        my $section_count = scalar(@section);
        if ($section_count == 1) {
            $out .= "\n<small>\x{2014} (1 article)</small>\n";
        }
        else {
            $out .= "\n<small>\x{2014} (" . $section_count . " articles)</small>\n";
        }
    }
}

# Close the last section and append the interlanguage links
$out .= "</div>\n";
$out .= "</div>\n\n";
$out .= join('', @lang);
$out .= "\n";
$out .= "[[Category:Wikipedia good articles| ]]\n";

# Write the output file, patching in the correct number of articles
# (first occurrence on each line).
my $stat_link  = "[[Wikipedia:Good articles/Statistics|" . $total_count . "]]";
my $ratio_expr = "expr: {{NUMBEROFARTICLES:R}} / " . $total_count;
my @out_lines  = split(/^/, $out);
open(my $out_fh, '>', "output.txt") or die "Cannot write output.txt: $!\n";
for my $line (@out_lines) {
    $line =~ s/\[\[Wikipedia\:Good articles\/Statistics\|[0-9]*\]\]/$stat_link/;
    $line =~ s/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ [0-9]*/$ratio_expr/;
    print {$out_fh} $line;
}
close($out_fh) or die "Cannot write output.txt: $!\n";

# Print out total number of articles
print "Number of articles: " . $total_count . "\n";

# Open for editing
if ($ADVANCED) {
    print "Do you want me to open your browser for editing? (y/n)\n";
    my $in = <STDIN> // '';
    chomp($in);
    $in = lc($in);
    if ($in eq "y") {
        open_with($WEBBROWSER,
                  "$WIKI_HOST/w/index.php?title=Wikipedia:Good_articles&action=edit");
        open_with($TEXTEDITOR, "output.txt");
    }
}