Wikipedia:Shortpages/How to update
This page is currently inactive and is retained for historical reference. Either the page is no longer relevant or consensus on its purpose has become unclear. To revive discussion, seek broader input via a forum such as the village pump.
After downloading a current-pages database dump for the English Wikipedia, I use the following commands:
- mkdir data; mkdir todo
- gunzip -c dl/20050909_pages_current.xml.gz | perl ../scripts/ >! data/entries.txt
- perl ../scripts/
The two Perl scripts are shown below, respectively.
-- Beland 06:54, 5 October 2005 (UTC)
#!/usr/bin/perl
#
# Convert a MediaWiki current-pages XML dump into one line per page:
#
#     Title_with_underscores<TAB>wikitext
#
# Tabs and newlines inside the wikitext are written as the two-character
# sequences \t and \n, so every page occupies exactly one output line.
#
# Usage: gunzip -c pages_current.xml.gz | perl SCRIPT > data/entries.txt

use strict;
use warnings;

main();

# Read <page> records and print one "title<TAB>text" line for each.
#
# Parameters (both optional, so a bare main() keeps working):
#   $in  - input filehandle; defaults to ARGV (files named on the command
#          line, or standard input when there are none).
#   $out - output filehandle; defaults to STDOUT.
sub main {
    my ($in, $out) = @_;
    $in  = \*ARGV   unless defined $in;
    $out = \*STDOUT unless defined $out;

    # Read one record per closing </page> tag instead of one per line.
    # Localized so the change cannot leak into code that runs after main().
    local $/ = "</page>";

    while (my $page = <$in>) {
        # Whatever trails the last </page> is not a page and has no
        # <revision>; skip it instead of emitting an empty bogus entry.
        next unless $page =~ m%^(.*?)<revision>(.*?)</revision>%s;
        my ($header, $text) = ($1, $2);

        # The title lives in the part of the record before <revision>.
        # Without a title the entry would be unusable, so skip the record.
        next unless $header =~ m%<title>(.*?)</title>%s;
        my $title = $1;
        $title =~ s/ /_/g;
        $title = _decode_xml_entities($title);

        if ($text =~ m%<text xml:space="preserve" />%) {
            # Self-closing <text/> element: the page is empty.
            $text = "";
        }
        else {
            # Keep only the content of the <text> element.
            $text =~ s%^.*<text xml:space="preserve">(.*?)</text>.*$%$1%s;
        }

        # Escape the characters that delimit fields and records in the
        # output, then undo the XML escaping.
        $text =~ s/\t/\\t/g;
        $text =~ s/\n/\\n/g;
        $text = _decode_xml_entities($text);

        print {$out} $title, "\t", $text, "\n";
    }

    return;
}

# Return $string with the XML character entities &lt; &gt; &quot; &amp;
# replaced by the characters they stand for.
sub _decode_xml_entities {
    my ($string) = @_;

    $string =~ s/&lt;/</g;
    $string =~ s/&gt;/>/g;
    $string =~ s/&quot;/"/g;

    # &amp; must be decoded last: decoding it first would turn the escaped
    # text "&amp;lt;" into "&lt;" and then, wrongly, into "<".
    $string =~ s/&amp;/&/g;

    return $string;
}
#!/usr/bin/perl
#
# List short pages from data/entries.txt.
#
# Input:  data/entries.txt, one "title<TAB>text" line per page.
# Output: todo/shortpages-sorted.txt and todo/shortstubs-sorted.txt, each
#         holding one HTML table row per page whose text is shorter than
#         SHORT_LIMIT characters, sorted with "sort -n". Pages whose text
#         contains "stub}}" go to the stubs file, all others to the pages
#         file.

use strict;
use warnings;

# Pages with fewer characters of text than this are reported.
use constant SHORT_LIMIT => 100;

main();
sort_and_remove('./todo/shortpages.txt', './todo/shortpages-sorted.txt');
sort_and_remove('./todo/shortstubs.txt', './todo/shortstubs-sorted.txt');

# Scan data/entries.txt and write the unsorted row files
# todo/shortpages.txt and todo/shortstubs.txt. Dies if a file cannot be
# opened or a written file cannot be closed cleanly.
sub main {
    open(my $entries, '<', 'data/entries.txt')
        or die "Cannot read data/entries.txt: $!\n";
    open(my $short, '>', 'todo/shortpages.txt')
        or die "Cannot write todo/shortpages.txt: $!\n";
    open(my $shortstub, '>', 'todo/shortstubs.txt')
        or die "Cannot write todo/shortstubs.txt: $!\n";

    my $seen = 0;
    while (my $line = <$entries>) {
        # Progress indicator on STDERR, once per 10000 entries read.
        # Counted at the top of the loop so skipped entries count too.
        print STDERR "$seen\r" if $seen % 10000 == 0;
        $seen++;

        # Skip malformed lines rather than reusing stale captures.
        next unless $line =~ m/^(.*?)\t(.*)$/;
        my ($title, $text) = ($1, $2);

        # Strip closing </nowiki> tags from the text (the original comment
        # here read "Protect!").
        # NOTE(review): presumably the rows are wrapped in <nowiki> when
        # pasted on-wiki - confirm.
        $text =~ s%</nowiki>%%g;

        # Normalize the title: trim whitespace, capitalize the first
        # letter, and use underscores instead of spaces.
        $title =~ s/^\s+//;
        $title =~ s/\s+$//;
        $title = ucfirst($title);
        $title =~ s/ /_/g;

        # An entry without a title cannot be linked to.
        next if $title eq '';

        # Exclude all namespaces except Article, Portal, Wikipedia,
        # and Help.
        next if $title =~ m/^(?:\w+_talk|Media|Special|Talk|User|Image|MediaWiki|Template|Category):/;

        next unless length($text) < SHORT_LIMIT;
        next if _is_exempt($text);

        my $row = sprintf("<tr><td>%02d</td><td>[[%s]]</td><td>%s</td></tr>\n",
                          length($text), $title, $text);

        if ($text =~ m/stub\}\}/) {
            print {$shortstub} $row;
        }
        else {
            print {$short} $row;
        }
    }

    close($entries);
    # Buffered write errors only surface at close, so check it.
    close($short)     or die "Error writing todo/shortpages.txt: $!\n";
    close($shortstub) or die "Error writing todo/shortstubs.txt: $!\n";

    return;
}

# Return true if $text marks a page that is short on purpose and must not
# be listed: copyright-violation notices, redirects, deleted-page
# placeholders, disambiguation pages, and redirects up for deletion.
sub _is_exempt {
    my ($text) = @_;

    return 1 if $text =~ m/\{\{copyvio/;
    return 1 if $text =~ m/^\s*\#\s*redirect.*?\s*\[\[.*?\]\]/i;
    # Covers deletedpage, Deletedpage, deletedPage and DeletedPage.
    return 1 if $text =~ m/\{\{[dD]eleted[pP]age\}\}/;
    return 1 if $text =~ m/\{\{deletedarticle\}\}/;
    return 1 if $text =~ m/\{\{disambig\}\}/;
    return 1 if $text =~ m/\{\{rfd\}\}/;

    return 0;
}

# Sort $unsorted numerically into $sorted, then delete $unsorted.
# The unsorted file is removed only after sort succeeded, so a failed
# sort never destroys the only copy of the results.
sub sort_and_remove {
    my ($unsorted, $sorted) = @_;

    # List form: no shell is involved and the exit status is checked.
    system('sort', '-n', '-o', $sorted, $unsorted) == 0
        or die "sort of $unsorted failed (status $?); unsorted file kept\n";

    unlink($unsorted) or warn "Cannot remove $unsorted: $!\n";

    return;
}
The old method was to run the following SQL commands on a database dump. It does not remove stubs from the results.
-- List the 170 shortest pages in namespace 0 that are not redirects,
-- written as wiki-table rows ("|-" / "|size||[[title]]||text") to a file.
-- Works on the `cur` table of a database dump.

DROP TABLE IF EXISTS temp_sizesmall;

-- Working copy of every page whose text is shorter than 251 characters.
CREATE TABLE temp_sizesmall (UNIQUE KEY `s_id` (`s_id`))
SELECT cur_title       AS s_title,
       cur_id          AS s_id,
       cur_text        AS s_text,
       LENGTH(cur_text) AS s_size,
       cur_namespace   AS s_namespace,
       cur_is_redirect AS s_is_redirect
FROM cur
WHERE LENGTH(cur_text) < 251
LIMIT 1000000;

-- Keep only non-redirect pages in namespace 0, then drop the two columns
-- that were needed only for this filtering.
DELETE FROM temp_sizesmall
WHERE s_is_redirect = 1
   OR s_namespace <> 0;

ALTER TABLE temp_sizesmall
    DROP COLUMN s_namespace,
    DROP COLUMN s_is_redirect;

-- One wiki-table row per page: size, linked title (underscores shown as
-- spaces), and the first 100 characters of the text with line breaks
-- flattened to spaces.
-- NOTE(review): the two innermost-but-last REPLACE calls (' ' -> ' ' and
-- '&' -> '&') are no-ops as written; their arguments were probably altered
-- when this page was rendered - confirm the intended values before relying
-- on them.
SELECT CONCAT('|-\n|', s_size, '||[[', REPLACE(s_title, '_', ' '), ']]||',
              LEFT((REPLACE(REPLACE(REPLACE(REPLACE(s_text, '\n', ' '), '\r', ' '), ' ', ' '), '&', '&')), 100),
              '') AS List
INTO OUTFILE 'wp_smallpages.txt' #change it to the drive/path you need
FROM temp_sizesmall
WHERE s_text NOT LIKE '%{{disambig}}%'
  -- '%{{copyvio%' also covers {{copyvio1}}.
  AND s_text NOT LIKE '%{{copyvio%'
  AND s_size > 0
  -- "_" is a single-character wildcard in LIKE, so this matches both
  -- "List_of_people" and "List of people".
  AND s_text NOT LIKE '%{{List_of_people%'
ORDER BY s_size, LOWER(s_title)
LIMIT 170;