Wikipedia:Duplicated sections/script

# Hot pipes (turn on output autoflush)
$| = 1;

# This script expects entries.txt to be a database dump that has been
# pre-processed to put each page on a line by itself.
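#
# Each line is assumed to hold the fields of one page record written as
# a Perl list literal, roughly like this (illustrative only, not taken
# from a real dump):
#
#   12345, 0, 'Example_article', 'Intro.\n== History ==\nText.\n== History ==\nText.\n',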

# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes.  Not using the dupHeaders() filter will
# cause it to take probably about 5 hours or more.

# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org.  It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.
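#
# Usage: run the script from a working directory containing
# data/entries.txt; the report is written to todo/duplicate-chunks.txt,
# and a copy sorted by percentage to todo/duplicate-chunks-sorted.txt.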

use strict;

main();

sub main
{

    my ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk, $line,
        $cur_namespace_name, $i, $j, @tokens, $printed, $chain);

    unless (-d "./todo")
    {
        mkdir "./todo";
    }

    open (ENTRIES, "<data/entries.txt")
        || die "Cannot read data/entries.txt";
    open (DUPHEAD, ">todo/duplicate-chunks.txt")
        || die "Cannot write todo/duplicate-chunks.txt";

    while (<ENTRIES>)
    {
        if (++$j % 100 == 0)
        {
            print STDERR $j."\r";
        }

        $line = $_;
        
        eval("\@tokens = $line");
                
        ($cur_id, $cur_namespace, $cur_title, $cur_text, @junk)
            = @tokens;

        unless (dupHeaders($cur_text) == 1)
        {
            next;
        }

        # Map the namespace number from the dump to the prefix used when
        # building a wikilink to the page.  Image: and Category: get a
        # leading colon so the link points at the page instead of
        # embedding the image or categorizing the report page.
        my %namespace_names =
            (-2 => "Media:",           -1 => "Special:",
              0 => "",                  1 => "Talk:",
              2 => "User:",             3 => "User_talk:",
              4 => "Wikipedia:",        5 => "Wikipedia_talk:",
              6 => ":Image:",           7 => "Image_talk:",
              8 => "MediaWiki:",        9 => "MediaWiki_talk:",
             10 => "Template:",        11 => "Template_talk:",
             12 => "Help:",            13 => "Help_talk:",
             14 => ":Category:",       15 => "Category_talk:");

        $cur_namespace_name = $namespace_names{$cur_namespace};
        $cur_namespace_name = "" unless defined $cur_namespace_name;

        # Remove leading and trailing single-quote characters.
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;
        # Remove leading and trailing whitespace
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        # Flatten the dump's literal "\n" sequences and collapse runs of
        # whitespace so the text splits cleanly into words.
        $cur_text =~ s/\\n/ /g;
        $cur_text =~ s/\s+/ /g;

        my (%chains, @chunks, $i, $per, $numberRepeated);

        @chunks = split (" ", $cur_text);
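
        # The loop below counts the overlapping three-word "chains" in
        # the page; a chain that occurs more than once is treated as a
        # sign of duplicated text.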
        
        while (@chunks > 3)
        {
            $chain = $chunks[-1]." ".$chunks[-2]." ".$chunks[-3];
            $chains{$chain}++;
            pop(@chunks);

            # Note: pop from the rear is a bjillion times more
            # efficient than unloading manually from the front.

            $i++;
        }

#       print DUPHEAD "* [[".$cur_namespace_name.$cur_title."]] $i\n";

        $printed = 0;

        foreach $chain (keys(%chains))
        {
            if ($chains{$chain} > 1)
            {
                if ($printed == 0)
                {
                    print DUPHEAD "* [[".$cur_namespace_name.$cur_title."]]";
                    $printed = 1;
                }
#               print DUPHEAD $chains{$chain}.": ".$chain."\n";
                $numberRepeated++;
            }
        }
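
        # Pages with at least one repeated chain get a single summary
        # line in the report, for example (numbers purely illustrative):
        #
        #   * [[Some_page]] 37% repeated - 12 out of 32 triplets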

        if ($printed == 1)
        {
            $per = int(($numberRepeated / $i) * 100);
            print DUPHEAD " ${per}% repeated - $numberRepeated out of $i triplets\n";
        }

    }
    close (ENTRIES);
    close (DUPHEAD);
}


# dupHeaders($text): return 1 if the page text contains at least one
# section header line that appears more than once, 0 otherwise.  Used
# above as a cheap pre-filter so that the expensive chain counting only
# runs on pages likely to contain duplicated sections.
sub dupHeaders
{
    my ($text, %headers, $line);
    
    $text = $_[0];
    
    unless ($text =~ m/=/)
    {
        # No headers means no duplicate headers
        return (0);
    }

    # Turn the dump's literal "\n" sequences into real newlines so the
    # text can be split into lines.
    $text =~ s/\\n/\n/g;
    
    foreach $line (split ("\n", $text))
    {
        if ($line =~ m/^\s*\=/)
        {
            $headers{$line}++;          
        }
    }
    
    foreach $line (keys(%headers))
    {
        if ($headers{$line} > 1)
        {
            # Found a duplicated header
            return(1);
        }
    }

    # Didn't return, so must not have found any duplicate headers
    return(0);
}


# Sort the report so that pages with the highest percentage of repeated
# triplets (field 3) come first.
print `sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt`;