User:Bo Lindbergh/dabalyze
This is a Perl script for finding links to disambiguation pages in Wikipedia by analyzing database dumps in the new XML format. It may not work properly in a non-Unix environment. Save it as "dabalyze" in a convenient directory. Instructions follow below.
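Once saved, make the script executable and run it from the directory containing the dump. A -p switch (recognised by the script; see $want_progress in the source) prints a simple progress indicator on standard error. Under a typical Unix shell, for example:

 chmod +x dabalyze
 ./dabalyze -p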
#! /usr/bin/perl

use strict;

# The namespaces to report on: the article namespace (empty prefix) and
# the Template namespace.  "cutoff" is the minimum number of incoming
# links a disambiguation page needs to appear in the report; "list"
# requests a sub-list of the linking pages under each entry.
my %interesting=
    ('' => {
         name => 'article',
         filename => 'articles.txt',
         cutoff => 100},
     'Template' => {
         name => 'template',
         filename => 'templates.txt',
         cutoff => 0,
         list => 1});

# Links whose target explicitly ends in "(disambiguation)" are
# intentional and are skipped.
my $exp_re=qr/\(disambiguation\)$/;

# The templates that mark a page as a disambiguation page.
my @templates=split(/\n/,<<__EOT__);
2LA
2LCdisambig
4LA
5LA
Albumdis
Dab
Dambig
Disam
Disamb
Disambig
Disambig-cleanup
Disambiguate
Disambiguation
Exp-dab
Geodis
Hndis
Hurricane disambig
Hurricanedis
Interstatedis
Listdis
LND
Miscdis
Nocatdab
Numberdis
Phrasedis
Rdab
Roadis
Songdis
Substadis
Tla-dab
TLAdisambig
TLAdisambiguation
Townshipdis
__EOT__

# Make the first letter of each template name case-insensitive, then
# build a regexp matching any invocation of any of the templates.
foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;
}
my $tmpl_re=join('|',reverse(sort(@templates)));
my $dab_re=qr/{{(?i:msg:)?\s*(?i:template\s*:\s*)?($tmpl_re)\s*}}/;

# Filled in from the <siteinfo> element during pass 1.
my($ns_re,%ns_canon);

my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;

# Read the dump one <page> element at a time, extract the title and
# the wikitext, and hand them to the supplied handler.
sub pageloop (&)
{
    my($handler)=@_;
    my($size);

    local $/="</page>\x0A";
    $size=-s PAGES;
    while (defined(my $page=<PAGES>)) {
        my($nstitle,$ns,$title);

        $page =~ /^\s*<page>/
            or last;
        ($nstitle)=($page =~ m{<title>([^<]+)</title>})
            or die "Can't find page title";
        if ($nstitle =~ /^($ns_re):(.+)$/) {
            $ns=$1;
            $title=$2;
        } else {
            $ns='';
            $title=$nstitle;
        }
        $page =~ m{</text>}
            or next;
        substr($page,$-[0])='';
        $page =~ /<text xml:space="preserve">/
            or die "Can't find start of text for page $nstitle";
        substr($page,0,$+[0])='';
        $handler->($nstitle,$ns,$title,$page);
        if ($want_progress) {
            my $progress=int(tell(PAGES)/$size*1000);

            if ($progress!=$last_progress) {
                $last_progress=$progress;
                printf STDERR "\r0.%.3u",$progress;
            }
        }
    }
    if ($want_progress) {
        print STDERR "\r";
    }
}

# Normalise a link target in place: collapse whitespace, resolve
# subpage links, canonicalise the namespace prefix, and capitalise the
# first letter of the title.
sub mungtarget ($$$ )
{
    my(undef,$source,$sub)=@_;

    for my $target ($_[0]) {
        $target =~ tr/\t\n\r/ /;
        $target =~ s/^ +//;
        $target =~ s/ +$//;
        $target =~ s/ {2,}/ /g;
        if ($sub && $target =~ m{^/}) {
            $target=$source.$target;
        } elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
            $target=$2;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
            $target=$ns_canon{lc($1)}.":".$target;
        } elsif ($target =~ /^:*(.+)$/i) {
            $target=$1;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
        } else {
            # a malformed link, usually empty brackets
        }
    }
}

my(%dab,%redir,@circular);

# Pass 1: extract the namespace names from the <siteinfo> element,
# find all disambiguation pages, and build the redirect table.
sub pass1 ()
{
    print STDERR "Analysis pass 1\n";
    {
        my($siteinfo,@namespaces);

        local $/="</siteinfo>\x0A";
        $siteinfo=<PAGES>;
        @namespaces=
            $siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
        $ns_re=join('|',map(quotemeta($_),reverse(sort(@namespaces))));
        foreach my $ns (@namespaces) {
            $ns_canon{lc($ns)}=$ns;
        }
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my $sub=$interesting{$ns}->{subpages};

            if ($ns eq '' && $text =~ $dab_re) {
                $dab{$nstitle}=1;
            }
            if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
                my($target);

                $target=$1;
                mungtarget($target,$nstitle,$sub);
                # Follow any existing redirect chain from the proposed
                # target; if it leads back here, recording this
                # redirect would close a cycle.
                while ($target ne $nstitle) {
                    my($newtarget);

                    $newtarget=$redir{$target};
                    last unless defined($newtarget);
                    $target=$newtarget;
                }
                if ($target eq $nstitle) {
                    push(@circular,$nstitle);
                } else {
                    $redir{$nstitle}=$target;
                }
            }
        }
    };
    # Flatten redirect chains so that every source points directly at
    # its final target.
    foreach my $target (keys(%redir)) {
        my(@chain);

        for (;;) {
            my $newtarget=$redir{$target};

            last unless defined($newtarget);
            push(@chain,$target);
            $target=$newtarget;
        }
        pop(@chain);
        foreach my $source (@chain) {
            $redir{$source}=$target;
        }
    }
    print STDERR " ".keys(%dab)." total disambiguation pages\n";
    print STDERR "\n";
}

my %stats=map { ($_,{}); } keys(%interesting);
my %lists=map { ($_,{}); } grep { $interesting{$_}->{list}; } keys(%interesting);

# Pass 2: count the links to disambiguation pages, resolving redirects
# and counting each source page at most once per target.
sub pass2 ()
{
    my(%linked);

    print STDERR "Analysis pass 2\n";
    {
        # Skip the <siteinfo> element this time around.
        local $/="</siteinfo>\x0A";
        <PAGES>;
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my($stats,$lists,$sub);

            $stats=$stats{$ns};
            $lists=$lists{$ns};
            $sub=$interesting{$ns}->{subpages};
            if ($stats) {
                my(%seen);

                while ($text =~ /\[\[([^\]\|]+)/g) {
                    my($target,$final);

                    $target=$1;
                    mungtarget($target,$nstitle,$sub);
                    next if $target =~ $exp_re;
                    $final=$redir{$target};
                    $final=$target unless defined($final);
                    if ($dab{$final} && !$seen{$final}++) {
                        $linked{$final}=1;
                        $stats->{$final}++;
                        if ($lists) {
                            push(@{$lists->{$final}},$nstitle);
                        }
                    }
                }
            }
        }
    };
    print STDERR " ".keys(%linked)." linked disambiguation pages\n";
    foreach my $ns (sort(keys(%stats))) {
        print STDERR (" ".keys(%{$stats{$ns}})." in the ".
                      $interesting{$ns}->{name}." namespace\n");
    }
    print STDERR "\n";
}

# Generate a wikitext link to a page, taking care to bypass redirects
# and to handle titles that ordinary wikilinks can't express.
sub wikilink ($ )
{
    my($target)=@_;

    if (exists($redir{$target})) {
        "[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
    } elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
        "[{{SERVER}}{{localurl:$target}} $target]";
    } elsif ($target =~ m{^/}) {
        "[[:$target]]";
    } else {
        "[[$target]]";
    }
}

# Write the per-namespace reports and the list of circular redirects.
sub report ()
{
    print STDERR "Report generation\n";
    # Make wikilink() treat circular redirects as redirects.
    foreach my $target (@circular) {
        $redir{$target}=$target;
    }
    while (my($ns,$stats)=each(%stats)) {
        my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
        my $lists=$lists{$ns};
        my @nstitles=sort { $stats->{$b}<=>$stats->{$a} || $a cmp $b; }
            grep { $stats->{$_}>=$cutoff; } keys(%{$stats});
        my $total=0;

        open(REPORT,'>',$filename)
            or die "Can't create $filename: $!";
        binmode(REPORT);
        # Start with a UTF-8 byte-order mark.
        print REPORT "\xEF\xBB\xBF";
        foreach my $nstitle (@nstitles) {
            $total+=$stats->{$nstitle};
        }
        print REPORT "Total link count: $total\n";
        foreach my $nstitle (@nstitles) {
            print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
                          " [[Special:Whatlinkshere/",$nstitle,"|links]]\n");
            if ($lists) {
                foreach my $source (sort(@{$lists->{$nstitle}})) {
                    print REPORT "#* ",wikilink($source),"\n";
                }
            }
        }
        close(REPORT);
        print STDERR " ".@nstitles." entries written to $filename\n";
    }
    if (@circular) {
        @circular=sort(@circular);
        open(REPORT,'>','circular.txt')
            or die "Can't create circular.txt: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $target (@circular) {
            print REPORT "* ",wikilink($target),"\n";
        }
        close(REPORT);
        print STDERR " ".@circular." entries written to circular.txt\n";
    } else {
        unlink('circular.txt');
    }
}

open(PAGES,'<','pages_current.xml')
    or die "Can't open pages_current.xml: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();
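To illustrate what the template detection matches, here is a minimal self-contained sketch that builds and applies $dab_re the same way the script does, but with a shortened template list; the sample wikitext is invented for demonstration:

#! /usr/bin/perl
use strict;

# Shortened template list for demonstration; the real script uses the
# full list above.
my @templates=qw(Disambig Dab Geodis);
foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;   # "Disambig" -> "[Dd]isambig"
}
my $tmpl_re=join('|',reverse(sort(@templates)));
my $dab_re=qr/{{(?i:msg:)?\s*(?i:template\s*:\s*)?($tmpl_re)\s*}}/;

# Invented sample wikitext.
my $text="'''Mercury''' may refer to:\n*a planet\n*an element\n{{disambig}}\n";
print "marked as a disambiguation page\n" if $text =~ $dab_re;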
- Input
- The script expects to find the file "pages_current.xml" in the current directory. You can get this by downloading and uncompressing http://download.wikimedia.org/wikipedia/en/pages_current.xml.bz2
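- For reference, the script does not use a full XML parser; it only looks for a few landmarks in each page record. A simplified, hypothetical record looks like this, with unrelated elements elided:

 <page>
   <title>Mercury</title>
   ...
   <text xml:space="preserve">'''Mercury''' may refer to:
 ...
 {{disambig}}</text>
   ...
 </page>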
- Output
- The script generates two text files named "articles.txt" and "templates.txt" in the current directory. The first contains a list of disambiguation pages linked to by articles, suitable for inclusion in Wikipedia:Disambiguation pages with links. The second contains a list of disambiguation pages linked to by templates; this is intended for a hypothetical sub-project concentrating on the template namespace. Note that the files use UTF-8 encoding (each begins with a byte-order mark); any text editor you use for copying and pasting into Wikipedia must be able to handle that.
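- Judging from the report code, each entry in these files is a numbered wikitext list item of the following shape, where the page name and link count are invented for illustration; in templates.txt each entry is additionally followed by "#*" sub-items naming the linking pages:

 # [[Mercury]]: 142 [[Special:Whatlinkshere/Mercury|links]]
 #* [[Template:Planets]]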
- Since the script has to detect circular redirects anyway, it generates a list of them in the file "circular.txt" whenever any are found; when there are none, a stale copy of that file from an earlier run is deleted instead.
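- The cycle check itself is simple: before recording a new redirect, the script follows the existing chain from the proposed target, and if the chain comes back to the page being processed, the redirect is circular. A standalone sketch of the same logic, with invented page names:

 #! /usr/bin/perl
 use strict;

 my %redir=('A' => 'B', 'B' => 'C');   # existing redirects
 my $source='C';                       # page being processed...
 my $target='A';                       # ...which redirects to A

 # Follow the existing chain from the proposed target.
 while ($target ne $source) {
     my $newtarget=$redir{$target};
     last unless defined($newtarget);
     $target=$newtarget;
 }
 print "circular redirect\n" if $target eq $source;   # A -> B -> C -> A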
- Diagnostics
- A successful run generates diagnostic output similar to the following:
Analysis pass 1
 41868 total disambiguation pages

Analysis pass 2
 30385 linked disambiguation pages
 30369 in the article namespace
 880 in the template namespace

Report generation
 514 entries written to articles.txt
 880 entries written to templates.txt
 100 entries written to circular.txt
These figures are from the database dump dated 2005-10-20; the total running time was about 32 minutes on an 867 MHz PowerPC G4.