User:Lar/ClassificationTableGen/Backlev
From Wikipedia, the free encyclopedia
Perl code: This code generated User:Lar/Sandbox2 (version 6).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).
Updated as of ++Lar: t/c 05:06, 27 March 2006 (UTC)
#!/usr/bin/perl -w #---------------------------------------------------------------------------# # process files and generate a category table # Author: Larry Pieniazek (IBM/Ascential Software) as hobby project # Adapted from stuff I cribbed from all over. # (c)Larry Pieniazek 2006. This library is free software; you can redistribute it # and/or modify it under the same terms as Perl itself. # additionally, can be redistributed and modified under GFDL or CC-SA as you choose # # Abstract: # This perlscript is designed to parse category SQL dumps from wikipedia # which are found here: http://download.wikimedia.org/enwiki/ # For example the 23 March dump is called # http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz # # The parsing is to generate article classification tables such as those found at # http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification # # In addition to the dump (currently must have been converted to linefeed delimited tuples) # the other input is a list of categories of interest, one per line. # #---------------------------------------------------------------------------# use strict; use Data::Dumper; use Getopt::Std; # things we may want to use at some point # use File::Spec::Functions; #---------------------------------------------------------------------------# # Subroutine prototypes: # #---------------------------------------------------------------------------# # setup sub Usage; # print info message about how to use this sub ProcessOptions; # Process Command Line Options. # utility sub ScoreToBlank; # underscores to blanks sub BlankToScore; # blanks to underscores sub FlipComma; # reverse a reversed comma string. "Lennon, John" -> "John Lennon" sub UnEscape; # remove escapes with a clever rexp # general sub ReadCatFile; # read the category file into the catArray and is_catHash sub ParseSQL; # parse the big SQL file and build the article data (hashref $collect) sub WriteTable; # create the output sub WriteTableHeader; # used by above, create header sub WriteTableSecBreak; # used by above, create a section break (when the leading char changes) # ------ option switches and related --------- my(%options); # hash of switches, values # ----- logging NOT IMPLEMENTED YET (ever?) -- my($logging); # Flag to denote we are writing to log. my($log_dir); # Log Directory. my($lfh); # Log File Handle. my($LOG_FILE_NAME); # Name of Log File to be written -l value or default # my($verbose); # -v Flag to denote verbose messaging. my($debug); # -d Flag to denote REALLY verbose messaging. my($sqlFileName); # -q <file name of SQL file to parse> (or 'enwiki-20060303-categorylinks.sql') my($catFileName); # -c <file name of categories> (or 'categoryList.txt') my($tableFileName); # -o <table file to create> (or 'tables.txt') # ------ Data structures --------------------- my $inCats=(); # what cats is the article in? my $nameVersions=(); # what are the versions of the name (lex orders) my $rec={}; # ref to one article's record my $collect={}; # ref to all the articles keyed on the $artKey var # what the data will look like # my $rec={ # key => "178234", # numeric key from first tuple value (article key, believed unique) # artLink => "link text" # text to use for link not same as sort # sortKey => "sort text" # sort text (what order should article come out) # inCats => [@inCats], # array of categories the article is in # nameVersions => [@nameVersions] # array of version of the name of the article # # this one may not be used for anything # }; # one article's record # # my %collect={ # key => $rec # }; # all the articles keyed on the $artKey var # ------ work vars ---------------------------- my @catList; my @catArray; my %is_catHash; # file handles my $sqlH; my $tableH; my $cfH; #---------------------------------------------------------------------------# # Usage - Print Usage Information and exit. #---------------------------------------------------------------------------# sub Usage { print <<END_USAGE; Usage: $0 [-h] [-v] [-d] [-q <sqlFile>] [-c <catFile>] [-o <tableFile>] Switch meanings: -h --help print this help message. -v --version print version message. -d <0|1|2|3> debug: 0: quiet 1: Verbose Mode 2: REALLY verbose mode 3: Every frigging detail. File switches: -q <file name of SQL file to parse> (or 'enwiki-20060303-categorylinks_sample.sql' by default) -c <file name of categories> (or 'categoryList.txt' by default) -o <table file to create> (or 'tables.txt' by default) END_USAGE print "Status: 99\n"; exit(99); } # End of Usage. sub Version { print "\nfilterCategories version 0.04 - 26 March 2006, Larry Pieniazek." ." \n -- released under GFDL and CC-SA -- \n\n"; # really should print something else } # this stuff isn't quite right at the moment # required for getopts to support --help and --version sub HELP_MESSAGE{ &Usage(); } # required for getopts to support --help and --version sub VERSION_MESSAGE{ &Version(); } #---------------------------------------------------------------------------# # ProcessOptions - Process Command Line Options. #---------------------------------------------------------------------------# sub ProcessOptions { &Version if ($options{'v'}); &Usage if ($options{'h'}); my %debugHash = ( '0'=>"silent" , '1'=>"normal trace", '2'=>"very chatty", '3'=>"insanely chatty" ); if (defined $options{'d'}) { $debug=$options{'d'}; if ($debugHash{$debug}) { print"...debug switch was ".$options{'d'}." giving setting: ".$debugHash{$debug}."\n" unless 0 == $options{'d'} ; # if 0, then REALLY quiet } else { $debug=1; print"...debug switch was ".$options{'d'}." defaulting debug to 1 - normal trace\n"; } # recognised option } else { # default, no switch $debug=1; print"...debug switch not found, defaulting debug to 1 - normal trace\n"; } if (defined $options{'q'}) { $sqlFileName=$options{'q'}; } else { $sqlFileName="enwiki-20060303-categorylinks_sample.sql"; } if (defined $options{'c'}) { $catFileName=$options{'c'}; } else { $catFileName="categoryList.txt"; } if (defined $options{'o'}) { $tableFileName=$options{'o'}; } else { $tableFileName="tables.txt"; } } # End of ProcessOptions. #---------------------------------------------------------------------------# # ReadCatFile - read in categories to build article tracking tables for #---------------------------------------------------------------------------# sub ReadCatFile { my $rc=0; # $catFileName = $_[0]; # now set processOptions() if ($debug>2) { stat($catFileName); print "Exists\n" if -e _; print "Readable\n" if -r _; print "Writable\n" if -w _; print "Executable\n" if -x _; print "Setuid\n" if -u _; print "Setgid\n" if -g _; print "Sticky\n" if -k _; print "Text\n" if -T _; print "Binary\n" if -B _; } if (( -e $catFileName ) && ( -r $catFileName )) { if (!open $cfH, "<", $catFileName){ warn "can't open ".$catFileName."\n"; $rc=99; return $rc; } } else { print "error with ".$catFileName." ... does not exist or not readable \n"; $rc= 99; return $rc; } %is_catHash = (); if ($debug>0) {print "reading ".$catFileName."\n";} # @catList=<$cfH>; my $catListItem; for (;;) { undef $!; unless (defined( $catListItem = <$cfH> )) { die $! if $!; last; # reached EOF } chomp $catListItem; $catListItem=ScoreToBlank($catListItem); push @catList, $catListItem; # set up searchable hash... $is_catHash{$catListItem} = 1; } if ($debug>0) { print "\nCategories to process: \n"; for my $fe(@catList) {print( $fe."\n");}; print "\n"; } if ($debug>1) { print "\n\n... corresponding hash values: \n"; while (my ($key, $value) = each %is_catHash) { print "$key = $value\n"; } print "\n"; } # end chatty trace $rc=0; return $rc; } #---------------------------------------------------------------------------# # ScoreToBlank - convert underscores to blanks #---------------------------------------------------------------------------# sub ScoreToBlank { my $str=$_[0]; if ($debug>3) {print "ScoreToBlank \$str IN: $str\n";} $str=~ s/_/ /g; if ($debug>3) {print "ScoreToBlank \$str OUT: $str\n";} return $str; } # there #---------------------------------------------------------------------------# # BlankToScore - convert blanks to underscores #---------------------------------------------------------------------------# sub BlankToScore { my $str=$_[0]; $str=~ s/ /_/g; return $str; } # and back again #---------------------------------------------------------------------------# # FlipComma - take a phrase with comma (and 1 blank) and flip it, # "Lennon, John" -> "John Lennon" #---------------------------------------------------------------------------# sub FlipComma { my $str=$_[0]; my ($first,$second)= split(/, /,$str,2); if (length($second)>0) { # there is something there to flip $str=$second." ".$first; } return $str; } # round and round we go #---------------------------------------------------------------------------# # StripLeadTrail - strip leading s/^\s+// and trailing s/\s+$// blanks #---------------------------------------------------------------------------# sub StripLeadTrail { my $str=$_[0]; $str=~ s/^\s+//; $str=~ s/\s+$//; return $str; } # and back again #---------------------------------------------------------------------------# # UnEscape - remove escape chars unless they're escaped # this code lifted from John Alden's Escape Delimiters # http://search.cpan.org/src/JOHNA/Text-EscapeDelimiters-1.004/lib/Text/EscapeDelimiters.pm # Text::EscapeDelimiters v1.004 # (c) John Alden 2005. This library is free software; you can redistribute it # and/or modify it under the same terms as Perl itself. #---------------------------------------------------------------------------# sub UnEscape { my($string) = $_[0]; my $eseq = "\\"; return $string unless($eseq); #no-op #Remove escape characters apart from double-escapes $string =~ s/\Q$eseq\E(?!\Q$eseq\E)//gs; #Fold double-escapes down to single escapes $string =~ s/\Q$eseq$eseq\E/$eseq/gs; return $string; } #---------------------------------------------------------------------------# # ParseSQL - read through the SQL file and build the data structures # - read in one tuple at a time (currently one line but change to # buffered read later) # - for each tuple parse out the pieces we need # - add or update record in $collect hash, recording category and lexical key # (if we find a comma reversed version of the article name, it's probably # a better lexical key than we have so take it.) # - update lexical key, article name, category seen # - possibly strip blanks, change _ to blanks, remove \ escapes, # and reverse comma fields. (future: use list of articles with commas # in their names as refinement) #---------------------------------------------------------------------------# sub ParseSQL { my $rc=0; if (( -e $sqlFileName ) && ( -r $sqlFileName )) { open ($sqlH, "<", $sqlFileName) or die "can't open ".$sqlFileName." for reading \n"; } else { print "error with ".$sqlFileName." ... does not exist or not readable \n"; $rc=99; return $rc; } if ($debug>0) {print "reading ".$sqlFileName."\n"; } my $sqlLine; my $sqlLC=0; $Data::Dumper::Indent = 2; # pretty print (3 is with array indices $Data::Dumper::Useqq = 1; # print strings in double quotes $Data::Dumper::Pair = " : "; # specify hash key/value separator $Data::Dumper::Purity = 1; # fill in the holes for eval $Data::Dumper::Maxdepth = 3; # no deeper than 3 refs down $Data::Dumper::Deepcopy = 1; # deep copy for (;;) { undef $!; unless (defined( $sqlLine = <$sqlH> )) { die $! if $!; last; # reached EOF } # we have to process lines that look like any of these # (12731,'Catholics_not_in_communion_with_Rome','George Harrison',20060228150212), # ordinary # (12731,'Deaths_by_lung_cancer','Harrison, George',20050904074730), # sort order is different (the article name is probably the first 12731 that doesn't # have a comma in the article name # (12731,'George_Harrison','',20060303000936), # self ref... the category contains an article named the same thing # (2246703,'The_Beatles_songs','Don\'t Pass Me By',20050719071328), # embedded escaped ' will screw up parse if not careful. # safe to process line as we got a line $sqlLC++; if ($debug>2) { print "line ".$sqlLC." was ".$sqlLine."\n"; } chomp $sqlLine; my($firstP, $secondP) = split(/',/, $sqlLine,2); if ($debug>2) { print "firstP: >".$firstP."< secondP: >".$secondP."< \n";} my($artKey, $catName) = split(/,'/,$firstP,2); $artKey=substr($artKey,1); $catName=ScoreToBlank($catName); if ($debug>2) { print "artKey: >".$artKey."< catName: >".$catName."< \n"; } my($artName, $timeStamp)=split(/',/,substr($secondP,1),2); # $timeStamp=split(/),/,$timeStamp,1); $timeStamp=substr($timeStamp,0,-2); if ($debug>2) {print "artName: >".$artName."< timeStamp: >".$timeStamp."< \n";} if (0==length($artName)) { # empty, this is the case of matching art/cat names $artName=$catName; } else { $artName=StripLeadTrail(UnEscape($artName)); } my $sortKey=""; my $skHasComma=0; my $anHasComma=0; if (exists($is_catHash{$catName}) ) { if ($debug>1) { print "artName: >".$artName. "<\n timeStamp: >".$timeStamp. "<\n artKey: >".$artKey. "<\n catName: >".$catName."< \n"; print " ... one of our cats! \n"; } if (exists($collect->{$artKey}) ) { if ($debug>1) { print " ... and we have the article already\n"; }; $rec = $collect->{$artKey}; # get ref to existing one $inCats= $rec->{inCats}; # and to the arrays it carries $nameVersions = $rec->{nameVersions}; } else { $rec={}; # make an empty one $rec->{key}=$artKey; # uses same key $inCats=(); $nameVersions=(); } $inCats->{$catName}=1; $nameVersions->{$artName}=1; $rec->{'inCats'}=$inCats; $rec->{'nameVersions'}=$nameVersions; # put logic to handle making sure name of article for link is non comma $anHasComma= ( $artName =~/,/ ); my $artNameSave=$artName; if ($anHasComma) { # if article has comma flip it and save that as name $artNameSave=FlipComma($artName); } if (!(exists($rec->{artLink}))) { $rec->{artLink}=$artNameSave; } if ($debug>1) {print "\$artName: $artName \$artNameSave: $artNameSave \n"} # put logic for sort key here if (exists($rec->{sortKey})) { $sortKey=$rec->{sortKey}; if ($debug>1) {print "sortKey: $sortKey\n"; } if ($sortKey ne $artName) { # If the keys are the same do nothing $skHasComma= ( $sortKey =~/,/ ); if ($debug>1) {print "anHasComma: $anHasComma skHasComma: $skHasComma\n";} if ($anHasComma eq $skHasComma) { # if neither has a comma, or both have a comma take whichever one is earlier in the alphabet if ($sortKey gt $artName) { $rec->{sortKey}= $artName; } # else not needed because sortKey already earlier, leave it. } else { # If the new key has a comma in it, use that one, it's probably the sort key if ($anHasComma) { $rec->{sortKey}= $artName; } # else not needed, leave as is } if ($debug>1) {print "sortKey now is ".$rec->{sortKey}."\n"; } } # end of handling different keys } else { # we don't have it, save it away $rec->{sortKey}=$artName; # since it's new, the sort key is the name we found if ($debug>1) {print "added sortKey: $rec->{sortKey}\n"; } } # end if sortKey does/doesn't exist $collect->{$artKey}=$rec; } # end if category is one we care about } # end for (;;) (the read loop) if ($debug>0) { print "...collect: \n"; print Dumper($collect); } if ($debug>0) {print "finished parsing SQL\n"; } return $rc; } # end ParseSQL #---------------------------------------------------------------------------# # WriteTableHeader - create output table header #---------------------------------------------------------------------------# sub WriteTableHeader { # assumes that $tableH is open and valid print $tableH <<END_TABLEH; {| |valign=top| {| width="100%" border="1" cellpadding="2" cellspacing="0" style="margin: 1em 1em 1em 0; background: #f9f9f9; border: 1px #aaa solid; border-collapse: collapse; font-size: 85%;" |- !width=20%|Article !width=15%|Categories !width=7%|Assessed !width=7%|Status !width=5%|Uses Infobox !width=37%|Comments and Pending tasks !width=8%|Assessed by END_TABLEH return 0; } #---------------------------------------------------------------------------# # WriteTableSecBreak - create output table break between sections #---------------------------------------------------------------------------# sub WriteTableSecBreak { my $headChar=$_[0]; print $tableH "|-\n|colspan=\"7\" align=\"left\" style=\"background:white; font-size: 200%;" ." font-weight:bold; border-bottom:4px solid grey; \"| \n" ."====".$headChar."====\n"; return 0; } # end WriteTableSecBreak #---------------------------------------------------------------------------# # WriteTable - create output table # - sort the data structure by the sort keys (which are the lexical # (sometimes comma inverted) article names) ... these keys are inside the # structure # - using the sorted array of keys, iterate the hash in sort order # - every time the first letter of the key changes, write out a SecBreak #---------------------------------------------------------------------------# sub WriteTable { my $rc=0; if ($debug>2) { print" statting: ".$tableFileName."\n"; stat($tableFileName); print "Exists\n" if -e _; print "Readable\n" if -r _; print "Writable\n" if -w _; print "Executable\n" if -x _; print "Setuid\n" if -u _; print "Setgid\n" if -g _; print "Sticky\n" if -k _; print "Text\n" if -T _; print "Binary\n" if -B _; } open ($tableH, '>', $tableFileName) or die "can't open ".$tableFileName." for writing \n"; $rc=&WriteTableHeader(); if ($rc) { die "error building table header\n"; } # we want to create line pairs of the form # (with the pipe in col 1) # |- # |[[Abbey Road (album)]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| || # |- # |[[Anthology 1]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| || # # in sorted order # make an array of the keys to the hash # (the article keys, which are not in any particular alpha) my @keys = sort { $collect->{$a}->{sortKey} # custom sort spec, use the lexical key cmp # (which is embedded in the rec) $collect->{$b}->{sortKey} } keys %{$collect}; my $firstLet=chr(00); # has to be lower than any other character val! # iterate in sorted order foreach my $artKey ( @keys ) { $rec = $collect->{$artKey}; # get easy access to the record $inCats= $rec->{inCats}; # and to the category array it carries my $artLink=$rec->{artLink}; my $trialFirst=substr($rec->{sortKey},0,1); # get first char if ($trialFirst ne $firstLet) { $firstLet=$trialFirst; if ($debug>1) {print "Switching to new first letter: $firstLet \n";} &WriteTableSecBreak($firstLet); } # end if new first letter in lexical order my ($catStr,$catV,$catK); $catStr=""; while (($catK, $catV) = each %{$inCats}) { if ("" ne $catStr) { $catStr.="<br>"; } $catStr.="[[:category:".$catK."|]]"; } # loop through the categories we saw if ($debug>0) { print "key: ".$artKey." rec key ".$rec->{key}." article Link text ".$artLink."\n"; } print $tableH "|-\n||[[".$artLink."]]||".$catStr."||| ||" ."{{Wikipedia:WikiProject The Beatles/Article Classification/Unknown}}" ."||unknown|| || \n"; } # end of iteration through the hash in sorted order # finish off table print $tableH "\n|}"; return $rc; } #---------------------------------------------------------------------------# # Main routine - # process options # read in categories desired # build hash of articles by parsing SQL file # write out table file using hash #---------------------------------------------------------------------------# # main my $rc=0; # print "prior to getopts\n"; getopts('hvd:q:c:o:', \%options) or &Usage; # debug also d # print "post getopts, pre process\n"; &ProcessOptions(); if ($debug>1) { print "post process, pre read cat\n"; } $rc=&ReadCatFile(); if ($rc) { die "error reading category list\n"; } $rc=&ParseSQL(); if ($rc) { die "error reading SQL or building structure\n"; } $rc=&WriteTable(); if ($rc) { die "error building table\n"; } exit 0;