Wikipedia:Shortpages/How to update

After downloading a current-pages database dump for the English Wikipedia, I use the following commands:
 * mkdir data; mkdir todo
 * gunzip -c dl/20050909_pages_current.xml.gz | perl ../scripts/parse-entries2.pl >! data/entries.txt
 * perl ../scripts/shortpages.pl

The two Perl scripts are shown below, respectively.

-- Beland 06:54, 5 October 2005 (UTC)


 * 1) parse-entries2.pl

use strict;

main;

sub main {   my ($text, $title);

$/ = " ";

while (<>) {	$_ =~ m%^(.*?) (.*?) .*$%s; $title = $1; $text = $2;

$title =~ s%^.*? (.*?) .*?$%$1%s; $title =~ s/ /_/g; $title =~ s/\&amp;/\&/g; $title =~ s/\&gt;/>/g; $title =~ s/\&lt;/%) {	   $text = ""; }	else {	   $text =~ s%^.*(.*?) .*$%$1%s; }	$text =~ s/\t/\\t/g; $text =~ s/\n/\\n/g; $text =~ s/\&amp;/\&/g; $text =~ s/\&gt;/>/g; $text =~ s/\&lt;/todo/shortpages.txt"); open (SHORTSTUB, ">todo/shortstubs.txt"); while () {	$_ =~ m/^(.*?)\t(.*)$/; $title = $1; $text = $2;

# Protect! $text =~ s% %%g;

# Remove leading and trailing whitespace $title =~ s/^\s*//; $title =~ s/\s*$//; # Uppercase title $title = ucfirst($title); # Underscores, please $title =~ s/ /_/g; # Exclude all namespaces except Article, # Portal, Wikipedia, and Help if (($title =~ m/^\w+_talk:/)           or ($title =~ m/^Media:/)            or ($title =~ m/^Special:/)            or ($title =~ m/^Talk:/)            or ($title =~ m/^User:/)            or ($title =~ m/^Image:/)            or ($title =~ m/^MediaWiki:/)            or ($title =~ m/^Template:/)            or ($title =~ m/^Category:/)	    ) {           next; }

if (length ($text) < 100) {

if (($text =~ m/\{\{copyvio/)		or ($text =~ m/^\s*\#\s*redirect.*?\s*\[\[.*?\]\]/i)		or ($text =~ m/\{\{deletedpage\}\}/)		or ($text =~ m/\{\{Deletedpage\}\}/)		or ($text =~ m/\{\{deletedPage\}\}/)		or ($text =~ m/\{\{DeletedPage\}\}/)		or ($text =~ m/\{\{deletedarticle\}\}/)		or ($text =~ m/\{\{disambig\}\}/)		or ($text =~ m/\{\{rfd\}\}/)		) {		next; }	   if ($text =~ m/\-*stub\}\}/) {		print SHORTSTUB " ".sprintf("%02d", length ($text))."  $title   $text   \n"; #print "SHORTSTUB ".length ($text)." $title $text\n";

}	   else {		print SHORT " ".sprintf("%02d", length ($text))."  $title   $text   \n"; #print "SHORT ".length ($text)." $title $text\n";

}	}

if ($i++ % 10000 == 0) {	   print STDERR $i - 1 ."\r"; }   }    close (ENTRIES); close (SHORT); close (SHORTSTUB); }

print `cat ./todo/shortpages.txt | sort -n > ./todo/shortpages-sorted.txt`; print `cat ./todo/shortstubs.txt | sort -n > ./todo/shortstubs-sorted.txt`;

unlink ("./todo/shortpages.txt"); unlink ("./todo/shortstubs.txt");

The old method is to run the following SQL commands on a database dump. This does not remove stubs.

DROP TABLE IF EXISTS temp_sizesmall;

CREATE TABLE temp_sizesmall (UNIQUE KEY `s_id` (`s_id`)) SELECT cur_title AS s_title, cur_id AS s_id, cur_text AS s_text, length(cur_text) AS s_size, cur_namespace AS s_namespace, cur_is_redirect AS s_is_redirect FROM cur WHERE LENGTH(cur_text)<251 LIMIT 1000000;

DELETE FROM temp_sizesmall WHERE s_is_redirect=1;

DELETE FROM temp_sizesmall WHERE s_namespace<>0;

ALTER TABLE temp_sizesmall DROP COLUMN s_namespace; ALTER TABLE temp_sizesmall DROP COLUMN s_is_redirect;

SELECT CONCAT(       '|-\n|',         s_size,         '||',         REPLACE(s_title,'_',' '),         '|| ',         LEFT((REPLACE(REPLACE(REPLACE(REPLACE(s_text,'\n',' '),'\r',' '),'   ',' '),'&amp;','&amp;amp;')),100),         ' ') AS List INTO OUTFILE 'wp_smallpages.txt'     #change it to the drive/path you need FROM temp_sizesmall WHERE s_text NOT LIKE '%%' AND s_text NOT LIKE '%%' AND s_text NOT LIKE '%%' AND s_text NOT LIKE '%{{copyvio%' AND s_size>0 AND s_text NOT LIKE '%{{List_of_people%' ORDER BY s_size, Lower(s_title) LIMIT 170