Wikipedia:Shortpages/How to update

From Wikipedia, the free encyclopedia

After downloading a current-pages database dump for the English Wikipedia, I use the following commands:

  • mkdir data; mkdir todo
  • gunzip -c dl/20050909_pages_current.xml.gz | perl ../scripts/parse-entries2.pl >! data/entries.txt
  • perl ../scripts/shortpages.pl

The two Perl scripts are shown below, respectively.

-- Beland 06:54, 5 October 2005 (UTC)


# parse-entries2.pl

use strict;

main();

sub main
{
    my ($text, $title);

    $/ = "</page>";

    while (<>)
    {
        $_ =~ m%^(.*?)<revision>(.*?)</revision>.*$%s;
        $title = $1;
        $text = $2;

        $title =~ s%^.*?<title>(.*?)</title>.*?$%$1%s;
        $title =~ s/ /_/g;
        $title =~ s/\&/\&/g;
        $title =~ s/\>/>/g;
        $title =~ s/\</</g;

        if ($text =~ m%<text xml:space="preserve" />%)
        {
            $text = "";
        }
        else
        {
            $text =~ s%^.*<text xml:space="preserve">(.*?)</text>.*$%$1%s;
        }
        $text =~ s/\t/\\t/g;
        $text =~ s/\n/\\n/g;
        $text =~ s/\&/\&/g;
        $text =~ s/\>/>/g;
        $text =~ s/\</</g;

        print $title."\t".$text."\n";
    }
}


# shortpages.pl

use strict;

main();

sub main
{

    my ($title, $text, $i);

    open (ENTRIES, "<data/entries.txt");
    open (SHORT, ">todo/shortpages.txt");
    open (SHORTSTUB, ">todo/shortstubs.txt");
    while (<ENTRIES>)
    {
        $_ =~ m/^(.*?)\t(.*)$/;
        $title = $1;
        $text = $2;

        # Protect!
        $text =~ s%</nowiki>%%g;

        # Remove leading and trailing whitespace
        $title =~ s/^\s*//;
        $title =~ s/\s*$//;
        # Uppercase title
        $title = ucfirst($title);
        # Underscores, please
        $title =~ s/ /_/g;
        
        # Exclude all namespaces except Article, 
        # Portal, Wikipedia, and Help
        if (($title =~ m/^\w+_talk:/)
            or ($title =~ m/^Media:/)
            or ($title =~ m/^Special:/)
            or ($title =~ m/^Talk:/)
            or ($title =~ m/^User:/)
            or ($title =~ m/^Image:/)
            or ($title =~ m/^MediaWiki:/)
            or ($title =~ m/^Template:/)
            or ($title =~ m/^Category:/)
            )
        {
            next;
        }

        if (length ($text) < 100)
        {

            if (($text =~ m/\{\{copyvio/)
                or ($text =~ m/^\s*\#\s*redirect.*?\s*\[\[.*?\]\]/i)
                or ($text =~ m/\{\{deletedpage\}\}/)
                or ($text =~ m/\{\{Deletedpage\}\}/)
                or ($text =~ m/\{\{deletedPage\}\}/)
                or ($text =~ m/\{\{DeletedPage\}\}/)
                or ($text =~ m/\{\{deletedarticle\}\}/)
                or ($text =~ m/\{\{disambig\}\}/)
                or ($text =~ m/\{\{rfd\}\}/)
                )
            {
                next;
            }
            
            if ($text =~ m/\-*stub\}\}/)
            {
                print SHORTSTUB "<tr><td>".sprintf("%02d", length ($text))."</td><td>[[$title]]</td><td>$text</td></tr>\n";
                #print "SHORTSTUB ".length ($text)." [[$title]] $text\n";

            }
            else
            {
                print SHORT "<tr><td>".sprintf("%02d", length ($text))."</td><td>[[$title]]</td><td>$text</td></tr>\n";
                #print "SHORT ".length ($text)." [[$title]] $text\n";

            }
        }
        

        if ($i++ % 10000 == 0)
        {
            print STDERR $i - 1 ."\r";
        }
        
    }
    close (ENTRIES);
    close (SHORT);
    close (SHORTSTUB);
}

print `cat ./todo/shortpages.txt | sort -n > ./todo/shortpages-sorted.txt`;
print `cat ./todo/shortstubs.txt | sort -n > ./todo/shortstubs-sorted.txt`;

unlink ("./todo/shortpages.txt");
unlink ("./todo/shortstubs.txt");


The old method is to run the following SQL commands on a database dump. This does not remove stubs.

DROP TABLE IF EXISTS temp_sizesmall;

CREATE TABLE temp_sizesmall (UNIQUE KEY `s_id` (`s_id`))
SELECT 
        cur_title AS s_title, 
        cur_id AS s_id, 
        cur_text AS s_text,  
        length(cur_text) AS s_size,
        cur_namespace AS s_namespace,
        cur_is_redirect AS s_is_redirect
FROM cur
WHERE LENGTH(cur_text)<251
LIMIT 1000000;

DELETE 
FROM temp_sizesmall
WHERE s_is_redirect=1;

DELETE
FROM temp_sizesmall
WHERE s_namespace<>0;

ALTER TABLE temp_sizesmall DROP COLUMN s_namespace;
ALTER TABLE temp_sizesmall DROP COLUMN s_is_redirect;

SELECT 
CONCAT(
        '|-\n|', 
        s_size, 
        '||[[', 
        REPLACE(s_title,'_',' '), 
        ']]||', 
        LEFT((REPLACE(REPLACE(REPLACE(REPLACE(s_text,'\n',' '),'\r',' '),'   ',' '),'&','&amp;')),100), 
        '') 
AS List
INTO OUTFILE 'wp_smallpages.txt'      #change it to the drive/path you need
FROM temp_sizesmall
WHERE s_text NOT LIKE '%{{disambig}}%'
AND s_text NOT LIKE '%{{disambig}}%'
AND s_text NOT LIKE '%{{copyvio1}}%'
AND s_text NOT LIKE '%{{copyvio%'
AND s_size>0
AND s_text NOT LIKE '%{{List_of_people%'
ORDER BY s_size, Lower(s_title)
LIMIT 170