Wikipedia:Shortpages/How to update
From Wikipedia, the free encyclopedia
After downloading a current-pages database dump for the English Wikipedia, I use the following commands:
- mkdir data; mkdir todo
- gunzip -c dl/20050909_pages_current.xml.gz | perl ../scripts/parse-entries2.pl >! data/entries.txt
- perl ../scripts/shortpages.pl
The two Perl scripts, parse-entries2.pl and shortpages.pl, are shown below in that order.
-- Beland 06:54, 5 October 2005 (UTC)
# parse-entries2.pl
#
# Reads a MediaWiki XML page dump from the files named on the command line
# (or from standard input when none are given) and prints one line per page
# to the currently selected output handle:
#
#     <title> TAB <text> NEWLINE
#
# Spaces in the title become underscores. Tabs and newlines inside the page
# text are written as the two-character sequences "\t" and "\n", so every
# page occupies exactly one output line.

use strict;
use warnings;

# Run only when executed as a script, so the file can also be loaded with
# require/do without consuming any input.
main() unless caller;

# Returns a copy of $string with the XML entities &lt; &gt; &quot; and &amp;
# replaced by the characters they stand for.
sub _decode_entities {
    my ($string) = @_;

    $string =~ s/&lt;/</g;
    $string =~ s/&gt;/>/g;
    $string =~ s/&quot;/"/g;

    # &amp; must be decoded last; otherwise "&amp;lt;" (a literal "&lt;" in
    # the page) would be decoded twice and come out as "<".
    $string =~ s/&amp;/&/g;

    return $string;
}

# Entry point: converts every <page> element read via <> into one output line.
# Takes no arguments and returns nothing.
sub main {
    # Read one <page>...</page> element per record. local keeps the change
    # from leaking into any other code that reads files.
    local $/ = '</page>';

    PAGE:
    while (my $record = <>) {
        # Split the record into the part before the revision (which holds
        # the title) and the revision body. Fragments without a revision,
        # such as the closing </mediawiki> tag after the last page, are
        # skipped instead of producing an empty output line.
        my ($header, $revision)
            = $record =~ m{\A(.*?)<revision>(.*?)</revision>}s
            or next PAGE;

        my ($title) = $header =~ m{<title>(.*?)</title>}s
            or next PAGE;
        $title =~ tr/ /_/;
        $title = _decode_entities($title);

        # An empty page is serialised as a self-closing <text/> element.
        my $text = q{};
        if ($revision !~ m{<text xml:space="preserve" />}) {
            if ($revision =~ m{<text xml:space="preserve">(.*?)</text>}s) {
                $text = $1;
            }
            else {
                # No recognisable <text> element: fall back to the raw
                # revision body rather than dropping the page.
                $text = $revision;
            }
        }

        # Escape the characters that would break the one-line-per-page,
        # tab-separated output format, then decode the XML entities.
        $text =~ s/\t/\\t/g;
        $text =~ s/\n/\\n/g;
        $text = _decode_entities($text);

        print "$title\t$text\n";
    }

    return;
}
# shortpages.pl
#
# Reads data/entries.txt (one "title TAB text" line per page) and writes an
# HTML table row for every page whose text is shorter than 100 characters.
# Pages carrying a stub template go to todo/shortstubs-sorted.txt, all other
# short pages go to todo/shortpages-sorted.txt. Both files are sorted with
# the external sort(1) utility.

use strict;
use warnings;

my $ENTRIES_FILE = 'data/entries.txt';
my $SHORT_FILE   = './todo/shortpages.txt';
my $STUB_FILE    = './todo/shortstubs.txt';
my $MAX_LENGTH   = 100;

main();

_sort_and_remove($SHORT_FILE, './todo/shortpages-sorted.txt');
_sort_and_remove($STUB_FILE,  './todo/shortstubs-sorted.txt');

# Scans the entries file and writes the unsorted row files.
# Dies if any of the three files cannot be opened or closed.
sub main {
    open my $entries, '<', $ENTRIES_FILE
        or die "Cannot read $ENTRIES_FILE: $!\n";
    open my $short, '>', $SHORT_FILE
        or die "Cannot write $SHORT_FILE: $!\n";
    open my $short_stub, '>', $STUB_FILE
        or die "Cannot write $STUB_FILE: $!\n";

    my $processed = 0;

    ENTRY:
    while (my $line = <$entries>) {
        # Lines without a tab are malformed; skip them rather than reuse
        # stale capture values.
        my ($title, $text) = $line =~ m/^(.*?)\t(.*)$/
            or next ENTRY;

        # Protect!
        # NOTE(review): as written this deletes every closing nowiki tag
        # from the text. The replacement part may have been lost when this
        # script was copied from a rendered wiki page - confirm the intent.
        $text =~ s{</nowiki>}{}g;

        # Remove leading and trailing whitespace
        $title =~ s/^\s*//;
        $title =~ s/\s*$//;

        # Uppercase title
        $title = ucfirst $title;

        # Underscores, please
        $title =~ tr/ /_/;

        # Exclude all namespaces except Article, Portal, Wikipedia, and Help
        next ENTRY
            if $title =~ m/^(?:\w+_talk|Media|Special|Talk|User|Image|MediaWiki|Template|Category):/;

        if (length($text) < $MAX_LENGTH) {
            # Copyright violations, redirects, deleted-page placeholders,
            # disambiguation pages and redirects-for-deletion are short on
            # purpose, so they are not reported.
            next ENTRY
                if $text =~ m/\{\{copyvio/
                or $text =~ m/^\s*\#\s*redirect.*?\s*\[\[.*?\]\]/i
                or $text =~ m/\{\{[Dd]eleted[Pp]age\}\}/
                or $text =~ m/\{\{deletedarticle\}\}/
                or $text =~ m/\{\{disambig\}\}/
                or $text =~ m/\{\{rfd\}\}/;

            # The zero-padded length comes first so that the rows sort by
            # length.
            my $row = sprintf "<tr><td>%02d</td><td>[[%s]]</td><td>%s</td></tr>\n",
                length($text), $title, $text;

            # Any template whose name ends in "stub" marks a stub.
            if ($text =~ m/stub\}\}/) {
                print {$short_stub} $row;
            }
            else {
                print {$short} $row;
            }
        }

        # Progress indicator: counts entries that were not skipped above.
        if ($processed++ % 10000 == 0) {
            print STDERR $processed - 1, "\r";
        }
    }

    close $entries    or die "Cannot close $ENTRIES_FILE: $!\n";
    close $short      or die "Cannot close $SHORT_FILE: $!\n";
    close $short_stub or die "Cannot close $STUB_FILE: $!\n";

    return;
}

# Sorts $unsorted into $sorted with "sort -n" and then deletes $unsorted.
# The list form of system() runs sort directly, without a shell.
sub _sort_and_remove {
    my ($unsorted, $sorted) = @_;

    system('sort', '-n', '-o', $sorted, $unsorted) == 0
        or die "sort failed for $unsorted (status $?)\n";

    unlink $unsorted
        or warn "Cannot remove $unsorted: $!\n";

    return;
}
The older method was to run the following SQL commands against a database dump. This method does not filter out stubs.
# Lists the shortest non-redirect pages in the main namespace as wiki table
# rows. Written for MySQL (CREATE TABLE ... SELECT, INTO OUTFILE).

# Scratch table holding every page whose text is shorter than 251 characters.
DROP TABLE IF EXISTS temp_sizesmall;

CREATE TABLE temp_sizesmall (UNIQUE KEY `s_id` (`s_id`))
SELECT cur_title       AS s_title,
       cur_id          AS s_id,
       cur_text        AS s_text,
       LENGTH(cur_text) AS s_size,
       cur_namespace   AS s_namespace,
       cur_is_redirect AS s_is_redirect
FROM cur
WHERE LENGTH(cur_text) < 251
LIMIT 1000000;

# Keep only non-redirect pages in namespace 0, then drop the columns that
# were needed only for that filtering.
DELETE FROM temp_sizesmall WHERE s_is_redirect = 1;
DELETE FROM temp_sizesmall WHERE s_namespace <> 0;
ALTER TABLE temp_sizesmall DROP COLUMN s_namespace;
ALTER TABLE temp_sizesmall DROP COLUMN s_is_redirect;

# Emit one wiki table row per page: size, linked title, and the first 100
# characters of the text with line breaks flattened to spaces.
# NOTE(review): the two innermost-but-last REPLACE calls were no-ops as
# published (' ' -> ' ' and '&' -> '&'), which looks like damage from HTML
# rendering. They are restored here as double-space collapsing and
# '&' -> '&amp;' escaping - confirm against the original script.
# NOTE(review): the trailing '' argument adds nothing; it may originally
# have held markup that was lost the same way.
SELECT CONCAT(
           '|-\n|', s_size,
           '||[[', REPLACE(s_title, '_', ' '), ']]||',
           LEFT(
               REPLACE(
                   REPLACE(
                       REPLACE(
                           REPLACE(s_text, '\n', ' '),
                           '\r', ' '),
                       '  ', ' '),
                   '&', '&amp;'),
               100),
           ''
       ) AS List
INTO OUTFILE 'wp_smallpages.txt'   # change it to the drive/path you need
FROM temp_sizesmall
# The published version listed the {{disambig}} condition twice; the exact
# duplicate is dropped here.
WHERE s_text NOT LIKE '%{{disambig}}%'
  AND s_text NOT LIKE '%{{copyvio1}}%'
  AND s_text NOT LIKE '%{{copyvio%'
  AND s_size > 0
  AND s_text NOT LIKE '%{{List_of_people%'
ORDER BY s_size, LOWER(s_title)
LIMIT 170;