User:Lar/ClassificationTableGen/Backlev

From Wikipedia, the free encyclopedia

Perl code: This code generated User:Lar/Sandbox2 (version 6).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).

Updated as of ++Lar: t/c 05:06, 27 March 2006 (UTC)

#!/usr/bin/perl -w
#---------------------------------------------------------------------------#
# process files and generate a category table                       
# Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
# Adapted from stuff I cribbed from all over.            
# (c)Larry Pieniazek 2006. This library is free software; you can redistribute it 
# and/or modify it under the same terms as Perl itself.  
# additionally, can be redistributed and modified under GFDL or CC-SA as you choose 
# 
# Abstract:
#       This perlscript is designed to parse category SQL dumps from wikipedia
#       which are found here: http://download.wikimedia.org/enwiki/
#   For example the 23 March dump is called 
#         http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
#   
#       The parsing is to generate article classification tables such as those found at
#         http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
#
#   In addition to the dump (currently must have been converted to linefeed delimited tuples)
#       the other input is a list of categories of interest, one per line.
#
#---------------------------------------------------------------------------#
use strict;
use Data::Dumper;
use Getopt::Std;

# things we may want to use at some point
# use File::Spec::Functions;

#---------------------------------------------------------------------------#
# Subroutine prototypes:                                                    #
#---------------------------------------------------------------------------#

# setup

sub Usage;                              # print info message about how to use this
sub ProcessOptions;     # Process Command Line Options.

# utility

sub ScoreToBlank;               # underscores to blanks
sub BlankToScore;               # blanks to underscores
sub FlipComma;                  # reverse a reversed comma string. "Lennon, John" -> "John Lennon"
sub UnEscape;                   # remove escapes with a clever rexp

# general 

sub ReadCatFile;                # read the category file into the  catArray and is_catHash
sub ParseSQL;                   # parse the big SQL file and build the article data (hashref $collect)
sub WriteTable;                 # create the output
sub WriteTableHeader;           # used by above, create header
sub WriteTableSecBreak;         # used by above, create a section break (when the leading char changes)


# ------ option switches and related ---------

my(%options);       # hash of switches, values

# ----- logging NOT IMPLEMENTED YET (ever?) --

my($logging);       # Flag to denote we are writing to log.
my($log_dir);       # Log Directory.
my($lfh);           # Log File Handle.
my($LOG_FILE_NAME); # Name of Log File to be written -l value or default


# my($verbose);     # -v Flag to denote verbose messaging.
my($debug);         # -d Flag to denote REALLY verbose messaging.


my($sqlFileName);   # -q <file name of SQL file to parse> (or 'enwiki-20060303-categorylinks.sql')
my($catFileName);   # -c <file name of categories>        (or 'categoryList.txt')
my($tableFileName); # -o <table file to create>           (or 'tables.txt')

# ------ Data structures ---------------------

my $inCats=();          # what cats is the article in?
my $nameVersions=();    # what are the versions of the name (lex orders)
    
my $rec={};             # ref to one article's record
my $collect={};         # ref to all the articles keyed on the $artKey var
    
    
# what the data will look like
    
#    my $rec={
#       key => "178234",        # numeric key from first tuple value (article key, believed unique)
#       artLink => "link text"  # text to use for link not same as sort
#       sortKey => "sort text"  # sort text (what order should article come out)
#       inCats  => [@inCats],   # array of categories the article is in
#       nameVersions => [@nameVersions] # array of version of the name of the article
#                                               # this one may not be used for anything
#       };                                      # one article's record
#       
#    my %collect={
#       key => $rec
#       };                              # all the articles keyed on the $artKey var

# ------ work vars ----------------------------

my @catList;
my @catArray;
my %is_catHash;

# file handles

my $sqlH;
my $tableH;
my $cfH;

#---------------------------------------------------------------------------#
# Usage - Print Usage Information and exit.                                 
#---------------------------------------------------------------------------#
sub Usage {

  print <<END_USAGE;
  Usage: $0 [-h] [-v] [-d] [-q <sqlFile>] [-c <catFile>] [-o <tableFile>] 

  Switch meanings:
    -h --help    print this help message.
    -v --version print version message.
    -d <0|1|2|3>   debug:
        0: quiet
        1: Verbose Mode
        2: REALLY verbose mode
        3: Every frigging detail.
  File switches:  
    -q <file name of SQL file to parse>
        (or 'enwiki-20060303-categorylinks_sample.sql' by default)
    -c <file name of categories>        
        (or 'categoryList.txt' by default)
    -o <table file to create>           
        (or 'tables.txt' by default)

END_USAGE
  print "Status: 99\n";
  exit(99);
} # End of Usage.

sub Version {
    print "\nfilterCategories version 0.04 - 26 March 2006, Larry Pieniazek."
        ." \n -- released under GFDL and CC-SA -- \n\n"; 
    # really should print something else        
}

# this stuff isn't quite right at the moment

# required for getopts to support --help and --version
sub HELP_MESSAGE{
    &Usage();
}
# required for getopts to support --help and --version
sub VERSION_MESSAGE{
    &Version();     
} 

#---------------------------------------------------------------------------#
# ProcessOptions - Process Command Line Options.  
#---------------------------------------------------------------------------#
sub ProcessOptions {
    &Version if ($options{'v'});
    &Usage if ($options{'h'});   

    my %debugHash = ( 
        '0'=>"silent" ,
        '1'=>"normal trace",
        '2'=>"very chatty",
        '3'=>"insanely chatty" );
                                  
    if (defined $options{'d'}) {
        $debug=$options{'d'};
        if ($debugHash{$debug}) {
            print"...debug switch was ".$options{'d'}." giving setting: ".$debugHash{$debug}."\n"
                unless 0 == $options{'d'} ;  # if 0, then REALLY quiet
        } else {
            $debug=1;
            print"...debug switch was ".$options{'d'}." defaulting debug to 1 - normal trace\n";
        } # recognised option
                
        } else { # default, no switch
            $debug=1;
            print"...debug switch not found, defaulting debug to 1 - normal trace\n";                         
        }
        
        if (defined $options{'q'}) {
            $sqlFileName=$options{'q'};
        } else {
            $sqlFileName="enwiki-20060303-categorylinks_sample.sql";          
        }

        if (defined $options{'c'}) {
            $catFileName=$options{'c'};
        } else {
            $catFileName="categoryList.txt";          
        }

        if (defined $options{'o'}) {
            $tableFileName=$options{'o'};
        } else {
            $tableFileName="tables.txt";      
        }

} # End of ProcessOptions.

#---------------------------------------------------------------------------#
# ReadCatFile - read in categories to build article tracking tables for
#---------------------------------------------------------------------------#
sub ReadCatFile {

    my $rc=0;
        
#   $catFileName = $_[0];  # now set processOptions()

    if ($debug>2) {
        stat($catFileName);
        print "Exists\n" if -e _;
        print "Readable\n" if -r _;
        print "Writable\n" if -w _;
        print "Executable\n" if -x _;
        print "Setuid\n" if -u _;
        print "Setgid\n" if -g _;
        print "Sticky\n" if -k _;
        print "Text\n" if -T _;
        print "Binary\n" if -B _;
    }
        
    if (( -e $catFileName ) && ( -r $catFileName )) {
        if (!open $cfH, "<", $catFileName){ warn "can't open ".$catFileName."\n";  $rc=99; return $rc; }
    } else {
        print  "error with ".$catFileName." ... does not exist or not readable \n";
        $rc= 99;
        return $rc;
    }

    %is_catHash = ();

    if ($debug>0) {print "reading ".$catFileName."\n";}
    # @catList=<$cfH>;
        
    my $catListItem;
        
    for (;;) {
        undef $!;
        unless (defined( $catListItem = <$cfH> )) {
            die $! if $!;
            last; # reached EOF
        }
        chomp $catListItem;
        $catListItem=ScoreToBlank($catListItem);
        push  @catList, $catListItem;
                
        # set up searchable hash...
        $is_catHash{$catListItem} = 1;
        }
        
    if ($debug>0) {
        print "\nCategories to process: \n";
        for my $fe(@catList) {print( $fe."\n");};
        print "\n";
    }
        
    if ($debug>1) {
        print "\n\n... corresponding hash values: \n";
        while (my ($key, $value) = each %is_catHash) {
            print "$key = $value\n";
        }
        print "\n";
    } # end chatty trace

    $rc=0;
    return $rc; 
}

#---------------------------------------------------------------------------#
# ScoreToBlank - convert underscores to blanks
#---------------------------------------------------------------------------#
sub ScoreToBlank {
    my $str=$_[0];
    if ($debug>3) {print "ScoreToBlank \$str IN: $str\n";}
    $str=~ s/_/ /g;     
    if ($debug>3) {print "ScoreToBlank \$str OUT: $str\n";}
    return $str;
} # there

#---------------------------------------------------------------------------#
# BlankToScore - convert blanks to underscores
#---------------------------------------------------------------------------#
sub BlankToScore {
    my $str=$_[0];
    $str=~ s/ /_/g;     
    return $str;
} # and back again

#---------------------------------------------------------------------------#
# FlipComma - take a phrase with comma (and 1 blank) and flip it,
#    "Lennon, John" -> "John Lennon"
#---------------------------------------------------------------------------#
sub FlipComma {
    my $str=$_[0];
    my ($first,$second)= split(/, /,$str,2);
    if (length($second)>0) { # there is something there to flip
        $str=$second." ".$first;
    }
    return $str;                
} # round and round we go

#---------------------------------------------------------------------------#
# StripLeadTrail - strip leading s/^\s+// and trailing s/\s+$// blanks 
#---------------------------------------------------------------------------#
sub StripLeadTrail {
    my $str=$_[0];
    $str=~ s/^\s+//;
    $str=~      s/\s+$//;
    return $str;
} # and back again      
        
#---------------------------------------------------------------------------#
# UnEscape - remove escape chars unless they're escaped
#   this code lifted from John Alden's Escape Delimiters
#     http://search.cpan.org/src/JOHNA/Text-EscapeDelimiters-1.004/lib/Text/EscapeDelimiters.pm
# Text::EscapeDelimiters v1.004
# (c) John Alden 2005. This library is free software; you can redistribute it 
# and/or modify it under the same terms as Perl itself.
#---------------------------------------------------------------------------#
sub UnEscape {
    my($string) = $_[0];
    my $eseq = "\\";
    return $string unless($eseq); #no-op
        
    #Remove escape characters apart from double-escapes
    $string =~ s/\Q$eseq\E(?!\Q$eseq\E)//gs;

    #Fold double-escapes down to single escapes
    $string =~ s/\Q$eseq$eseq\E/$eseq/gs;

    return $string;
}


#---------------------------------------------------------------------------#
# ParseSQL - read through the SQL file and build the data structures 
# - read in one tuple at a time (currently one line but change to 
#   buffered read later)
# - for each tuple parse out the pieces we need
# - add or update record in $collect hash, recording category and lexical key
#   (if we find a comma reversed version of the article name, it's probably
#   a better lexical key than we have so take it.)
#   - update lexical key, article name, category seen
#   - possibly strip blanks, change _ to blanks, remove \ escapes, 
#     and reverse comma fields. (future: use list of articles with commas 
#     in their names as refinement)
#---------------------------------------------------------------------------#
sub ParseSQL {

    my $rc=0;
        
    if (( -e $sqlFileName ) && ( -r $sqlFileName )) {
        open ($sqlH, "<", $sqlFileName) or die "can't open ".$sqlFileName." for reading \n";
    } else {
        print  "error with ".$sqlFileName." ... does not exist or not readable \n";
        $rc=99;
        return $rc;
    }

    if ($debug>0) {print "reading ".$sqlFileName."\n"; }
        
    my $sqlLine;
    my $sqlLC=0;

    $Data::Dumper::Indent = 2;         # pretty print (3 is with array indices
    $Data::Dumper::Useqq = 1;          # print strings in double quotes
    $Data::Dumper::Pair = " : ";       # specify hash key/value separator
    $Data::Dumper::Purity = 1;         # fill in the holes for eval
    $Data::Dumper::Maxdepth = 3;       # no deeper than 3 refs down
    $Data::Dumper::Deepcopy = 1;       # deep copy    
        
    for (;;) {
        undef $!;
        unless (defined( $sqlLine = <$sqlH> )) {
            die $! if $!;
            last; # reached EOF
        }
        
        # we have to process lines that look like any of these 
        # (12731,'Catholics_not_in_communion_with_Rome','George Harrison',20060228150212),
        #    ordinary
        # (12731,'Deaths_by_lung_cancer','Harrison, George',20050904074730),
        #    sort order is different (the article name is probably the first 12731 that doesn't
        #    have a comma in the article name
        # (12731,'George_Harrison','',20060303000936),
        #    self ref... the category contains an article named the same thing
        # (2246703,'The_Beatles_songs','Don\'t Pass Me By',20050719071328),
        #    embedded escaped ' will screw up parse if not careful.
        
        # safe to process line as we got a line        
        
        $sqlLC++;
        if ($debug>2) { print "line ".$sqlLC." was ".$sqlLine."\n"; }
        
        chomp $sqlLine;
        my($firstP, $secondP) = split(/',/, $sqlLine,2);        
        if ($debug>2) { print "firstP: >".$firstP."< secondP: >".$secondP."< \n";}
        
        my($artKey, $catName) = split(/,'/,$firstP,2);
        $artKey=substr($artKey,1);
        $catName=ScoreToBlank($catName);
                
        if ($debug>2) { print "artKey: >".$artKey."< catName: >".$catName."< \n"; }
        
        my($artName, $timeStamp)=split(/',/,substr($secondP,1),2);
        # $timeStamp=split(/),/,$timeStamp,1);
        
        $timeStamp=substr($timeStamp,0,-2);
        if ($debug>2) {print "artName: >".$artName."< timeStamp: >".$timeStamp."< \n";}
        
        
        if (0==length($artName)) { # empty, this is the case of matching art/cat names
            $artName=$catName;
        } else {
            $artName=StripLeadTrail(UnEscape($artName));
        }
        
        my $sortKey="";
        my $skHasComma=0;
        my $anHasComma=0;

        if (exists($is_catHash{$catName}) ) {
            if ($debug>1) {
                print "artName: >".$artName.
                    "<\n  timeStamp: >".$timeStamp.
                    "<\n  artKey: >".$artKey.
                    "<\n  catName: >".$catName."< \n";
                print " ... one of our cats! \n";
                }
            if (exists($collect->{$artKey}) ) {
                if ($debug>1) { print "    ... and we have the article already\n"; };
                $rec = $collect->{$artKey};          # get ref to existing one
                $inCats= $rec->{inCats};             # and to the arrays it carries
                $nameVersions = $rec->{nameVersions};
            } else {
                $rec={};                                # make an empty one
                $rec->{key}=$artKey;         # uses same key
                $inCats=();                     
                $nameVersions=();
            }
            $inCats->{$catName}=1;
            $nameVersions->{$artName}=1;
            $rec->{'inCats'}=$inCats;
            $rec->{'nameVersions'}=$nameVersions;
                
            # put logic to handle making sure name of article for link is non comma
            $anHasComma= ( $artName =~/,/ );
            my $artNameSave=$artName;
            if ($anHasComma) { # if article has comma flip it and save that as name
                $artNameSave=FlipComma($artName);                       
            } 
            if (!(exists($rec->{artLink}))) {        
                $rec->{artLink}=$artNameSave;
            }
            if ($debug>1) {print "\$artName: $artName \$artNameSave: $artNameSave       \n"}
            # put logic for sort key here
            if (exists($rec->{sortKey})) {
                $sortKey=$rec->{sortKey};
                if ($debug>1) {print "sortKey: $sortKey\n"; }
                    
                if ($sortKey ne $artName) { # If the keys are the same do nothing
                    $skHasComma= ( $sortKey =~/,/ );                                    
                    if ($debug>1) {print "anHasComma: $anHasComma skHasComma: $skHasComma\n";}
                                                
                    if ($anHasComma eq $skHasComma) {
                        # if neither has a comma, or both have a comma take whichever one is earlier in the alphabet
                        if ($sortKey gt $artName) {
                            $rec->{sortKey}= $artName;
                        } # else not needed because sortKey already earlier, leave it.
                    } else {                                            
                    # If the new key has a comma in it, use that one, it's probably the sort key
                        if ($anHasComma) {
                            $rec->{sortKey}= $artName;
                        } # else not needed, leave as is
                    }
                    if ($debug>1) {print "sortKey now is ".$rec->{sortKey}."\n"; }
                } # end of handling different keys
            } else { # we don't have it, save it away
                $rec->{sortKey}=$artName;    # since it's new, the sort key is the name we found
                if ($debug>1) {print "added sortKey: $rec->{sortKey}\n"; }
            } # end if sortKey does/doesn't exist       
            $collect->{$artKey}=$rec;
        } # end if category is one we care about              
    } # end for (;;) (the read loop)

    if ($debug>0) {
        print "...collect: \n";               
        print Dumper($collect); 
    }
    if ($debug>0) {print "finished parsing SQL\n"; }

    return $rc;
} # end ParseSQL

#---------------------------------------------------------------------------#
# WriteTableHeader - create output table header
#---------------------------------------------------------------------------#

sub WriteTableHeader {
    # assumes that $tableH is open and valid
    print $tableH <<END_TABLEH;   
{| 
|valign=top|
{| width="100%" border="1" cellpadding="2" cellspacing="0" style="margin: 1em 1em 1em 0; background: #f9f9f9; border: 1px #aaa solid; border-collapse: collapse; font-size: 85%;"
|-
!width=20%|Article
!width=15%|Categories
!width=7%|Assessed
!width=7%|Status
!width=5%|Uses Infobox
!width=37%|Comments and Pending tasks
!width=8%|Assessed by
        
END_TABLEH

    return 0;
}

#---------------------------------------------------------------------------#
# WriteTableSecBreak - create output table break between sections
#---------------------------------------------------------------------------#
sub WriteTableSecBreak {
    my $headChar=$_[0];
    print $tableH "|-\n|colspan=\"7\" align=\"left\" style=\"background:white; font-size: 200%;"
                ." font-weight:bold; border-bottom:4px solid grey; \"| \n"
                ."====".$headChar."====\n";
    return 0;                                   
} # end  WriteTableSecBreak

#---------------------------------------------------------------------------#
# WriteTable - create output table
# -  sort the data structure by the sort keys (which are the lexical 
#   (sometimes comma inverted) article names) ... these keys are inside the
#   structure
# - using the sorted array of keys, iterate the hash in sort order
# - every time the first letter of the key changes, write out a SecBreak
#---------------------------------------------------------------------------#

sub WriteTable {
    my $rc=0;
        
    if ($debug>2) {
        print" statting: ".$tableFileName."\n";
        stat($tableFileName);
        print "Exists\n" if -e _;
        print "Readable\n" if -r _;
        print "Writable\n" if -w _;
        print "Executable\n" if -x _;
        print "Setuid\n" if -u _;
        print "Setgid\n" if -g _;
        print "Sticky\n" if -k _;
        print "Text\n" if -T _;
        print "Binary\n" if -B _;
    }

    open ($tableH, '>', $tableFileName) or die "can't open ".$tableFileName." for writing \n";
        
        
    $rc=&WriteTableHeader();
    if ($rc) { die "error building table header\n"; }
                
    # we want to create line pairs of the form  
    # (with the pipe in col 1)
    #   |-
    #   |[[Abbey Road (album)]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| || 
    #   |-
    #   |[[Anthology 1]]||[[:category:The Beatles albums|]]|| ||{{/Unknown}}||unknown|| || 
    # 
    # in sorted order
        
    # make an array of the keys to the hash 
    # (the article keys, which are not in any particular alpha)

    my @keys = sort {  $collect->{$a}->{sortKey}    # custom sort spec, use the lexical key
                    cmp                             # (which is embedded in the rec)
                        $collect->{$b}->{sortKey} } 
                           keys %{$collect};
    my $firstLet=chr(00); # has to be lower than any other character val!
    
    # iterate in sorted order
    foreach my $artKey ( @keys ) {
        $rec = $collect->{$artKey};     # get easy access to the record
        $inCats= $rec->{inCats};     # and to the category array it carries
                
        my $artLink=$rec->{artLink};
        
        my $trialFirst=substr($rec->{sortKey},0,1);  # get first char
        if ($trialFirst ne $firstLet) {
            $firstLet=$trialFirst;
            if ($debug>1) {print "Switching to new first letter: $firstLet \n";}
            &WriteTableSecBreak($firstLet);
        } # end if new first letter in lexical order
                
        my ($catStr,$catV,$catK);
        $catStr="";
        while (($catK, $catV) = each %{$inCats}) {
            if ("" ne $catStr) {
                $catStr.="<br>";
            }
            $catStr.="[[:category:".$catK."|]]";
        } # loop through the categories we saw
        if ($debug>0) { print "key: ".$artKey." rec key ".$rec->{key}." article Link text ".$artLink."\n"; }
        
        print $tableH "|-\n||[[".$artLink."]]||".$catStr."||| ||"
                ."{{Wikipedia:WikiProject The Beatles/Article Classification/Unknown}}"
                ."||unknown|| || \n";
    } # end of iteration through the hash in sorted order       

    # finish off table          
    print $tableH "\n|}"; 
    return $rc; 
                
}

#---------------------------------------------------------------------------#
# Main routine -
#       process options
#       read in categories desired
#       build hash of articles by parsing SQL file
#       write out table file using hash           
#---------------------------------------------------------------------------#
# main
    my $rc=0;
    # print "prior to getopts\n";
        
    getopts('hvd:q:c:o:', \%options) or &Usage; # debug also d

    # print "post getopts, pre process\n";
    &ProcessOptions();
                
    if ($debug>1) { print "post process, pre read cat\n";  }
        
    $rc=&ReadCatFile();
    if ($rc) { die "error reading category list\n"; }

    $rc=&ParseSQL();
    if ($rc) { die "error reading SQL or building structure\n"; }

    $rc=&WriteTable();
    if ($rc) { die "error building table\n"; }

exit 0;