# User:Whobot/code2

use strict; use Time::HiRes;


# The following may be helpful in debugging character encoding
# problems.

# use utf8;
# use encoding 'utf8';

use LWP::UserAgent; use HTTP::Cookies; use HTTP::Request::Common qw(POST); use HTML::Entities; print "\n";
# Initialization


# LWP::UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.

# --- Global initialization ---
# One shared user agent for the whole run; the cookie jar persists the
# login session across invocations (autosaved to cookies.whobot.txt).
$::ua = LWP::UserAgent->new(timeout => 300); $::ua->agent("Whobot Wisebot/0.1"); $::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.whobot.txt", autosave => 1)); $::ua->cookie_jar->load;

# Hot pipes: autoflush STDOUT so progress messages appear immediately.
$| = 1;


# Commented-out smoke test (posts a line to the sandbox, then dies):
# test;
# sub test {
#     my ($target, $text, $editTime, $startTime, $token);
#     $target = "Wikipedia:Sandbox";
#     ($text, $editTime, $startTime, $token) = getPage($target);
#     print $text;
#     $text .= "\nEat my electrons! -- Whobot\n";
#     print "---\n";
#     postPage ($target, $editTime, $startTime, $token, $text, "Test 008");
#     die ("Test complete.");
# }
#     postPage ($target, $editTime, $startTime, $token, $text, "Test 008");
#     die ("Test complete.");
# }

interpretCommand(@ARGV);

# interpretCommand($command, @arguments)
#
# Central command dispatcher.  Each recognized command's branch first
# rejects surplus arguments (log + die), then delegates to the worker
# sub.  READ_COMMANDS (or an empty command) reads further commands from
# input, unwraps "[[Page]]" wiki links (and "[[From]] -> [[To]]" pairs)
# into plain underscore-joined arguments, and recurses.  Dies on any
# unrecognized command.
#
# NOTE(review): wiki mangling has collapsed multi-line code onto single
# lines throughout this file; several lines below that begin with "#"
# contain code that was live in the original.  Confirm against an
# intact copy before running.
sub interpretCommand {

my ($command, @arguments, $i, $line, $argument, @newArguments,	$from, $to, $page, $pageCopy);

($command, @arguments) = @_;

# Strip a leading "* " so commands pasted from wiki bullet lists work.
$command =~ s/\*\s*//;

# Log and echo what we are about to do.
# NOTE(review): `date /t` is a Windows shell command; it produces
# nothing useful on Unix-like systems.
myLog(`date /t`); myLog ($command.": ".join(" ", @arguments)."\n"); print `date /t`; print $command.": ".join(" ", @arguments)."\n";

# Dispatch chain, one elsif per command.
if ($command eq "POST_STDIN") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to POST_STDIN.\n"); die ("Too many arguments to POST_STDIN.\n"); }	postSTDIN($arguments[0],$arguments[1]); }   elsif ($command eq "POST_STDIN_NULLOK") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to POST_STDIN.\n"); die ("Too many arguments to POST_STDIN.\n"); }	$::nullOK = "yes"; postSTDIN($arguments[0],$arguments[1]); $::nullOK = "no"; }   elsif ($command eq "MOVE_CONTENTS") {	if ($arguments[2] ne "") {	   if (($arguments[3] eq "")		and ($arguments[1] eq "->")) {		moveCategoryContents($arguments[0],$arguments[2]); return; }	   else {		myLog ("Too many arguments to MOVE_CONTENTS.\n"); die ("Too many arguments to MOVE_CONTENTS.\n"); }	}	moveCategoryContents($arguments[0],$arguments[1],"no","yes"); }   elsif ($command eq "MOVE_CONTENTS_INCL_CATS") {	if ($arguments[2] ne "") {	   if (($arguments[3] eq "")		and ($arguments[1] eq "->")) {		moveCategoryContents($arguments[0],$arguments[2],"yes","yes"); return; }	   else {		myLog ("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n"); die ("Too many arguments to MOVE_CONTENTS_INCL_CATS.\n"); }	}	moveCategoryContents($arguments[0],$arguments[1],"yes","yes"); }   elsif ($command eq "REMOVE_X_FROM_CAT") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to REMOVE_X_FROM_CAT.\n"); die ("Too many arguments to REMOVE_X_FROM_CAT.\n"); }	removeXFromCat($arguments[0],$arguments[1],""); }   elsif ($command eq "DEPOPULATE_CAT") {	if ($arguments[1] ne "") {	   myLog ("Too many arguments to DEPOPULATE_CAT.\n"); die ("Too many arguments to DEPOPULATE_CAT.\n"); }	depopulateCat($arguments[0]); }   elsif ($command eq "PRINT_WIKITEXT") {	if ($arguments[1] ne "") {	   myLog ("Too many arguments to PRINT_WIKITEXT.\n"); die ("Too many arguments to PRINT_WIKITEXT.\n"); }	printWikitext($arguments[0]); }   elsif ($command eq "ADD_CFD_TAG") {	if ($arguments[1] ne "") {	   myLog ("Too many arguments to ADD_CFD_TAG.\n"); die ("Too many 
arguments to ADD_CFD_TAG.\n"); }	addCFDTag($arguments[0]); }

elsif ($command eq "ADD_CFDU_TAG") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to ADD_CFDU_TAG.\n"); die ("Too many arguments to ADD_CFDU_TAG.\n"); }	addCFDUTag($arguments[0],$arguments[1],""); }   elsif ($command eq "REMOVE_CFD_TAG") {	if ($arguments[1] ne "") {	   myLog ("Too many arguments to REMOVE_CFD_TAG.\n"); die ("Too many arguments to REMOVE_CFD_TAG.\n"); }	removeCFDTag($arguments[0]); }   elsif ($command eq "ADD_TO_CAT") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to ADD_TO_CAT.\n"); die ("Too many arguments to ADD_TO_CAT.\n"); }	addToCat($arguments[0],$arguments[1],""); }   elsif ($command eq "ADD_TO_CAT_NULL_OK") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to ADD_TO_CAT_NULL_OK.\n"); die ("Too many arguments to ADD_TO_CAT_NULL_OK.\n"); }	$::nullOK = "yes"; addToCat($arguments[0],$arguments[1],""); $::nullOK = "no"; }   elsif ($command eq "TRANSFER_TEXT") {	if ($arguments[2] ne "") {	   myLog ("Too many arguments to TRANSFER_TEXT.\n"); die ("Too many arguments to TRANSFER_TEXT.\n"); }	transferText($arguments[0], $arguments[1]); }   # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET. elsif ($command eq "CHANGE_CATEGORY") {	if ($arguments[3] ne "") {	   myLog ("Too many arguments to CHANGE_CATEGORY.\n"); die ("Too many arguments to CHANGE_CATEGORY.\n"); }	changeCategory($arguments[0], $arguments[1], $arguments[2]); }   elsif ($command eq "CLEANUP_DATE") {	if ($arguments[0] ne "") {	   myLog ("Too many arguments to CLEANUP_DATE.\n"); die ("Too many arguments to CLEANUP_DATE.\n"); }	cleanupDate; }   elsif ($command eq "OPENTASK_UPDATE") {	if ($arguments[0] ne "") {	   myLog ("Too many arguments to OPENTASK_UPDATE.\n"); die ("Too many arguments to OPENTASK_UPDATE.\n"); }	opentaskUpdate; }   # DON'T USE THE BELOW COMMAND; IT'S NOT IMPLEMENTED PROPERLY YET. #elsif ($command eq "ENFORCE_CATEGORY_REDIRECTS_CHECK") #{   #	enforceCategoryRedirects("no"); #}
# NOTE(review): commented-out LIST_TO_CAT_CHECK branch, mangled by wiki
# formatting (the "* N)" prefixes were added by the wiki renderer).
# * 1)    elsif ($command eq "LIST_TO_CAT_CHECK")
# * 2) 	if ($arguments[2] ne "")
# * 3) 	   myLog ("Too many arguments to LIST_TO_CAT_CHECK.\n");
# * 4) 	   die ("Too many arguments to LIST_TO_CAT_CHECK.\n");
# * }
# * 1) 	listToCat($arguments[0], $arguments[1], "no");
# * 2)    }
# * 1) 	listToCat($arguments[0], $arguments[1], "no");
# * 2)    }

# This command is for remedial cleanup only. #elsif ($command eq "INTERWIKI_LOOP") #{   #	interwikiLoop; #}

elsif ($command eq "ENFORCE_CATEGORY_INTERWIKI") {	if ($arguments[1] ne "") {	   myLog ("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n"); die ("Too many arguments to ENFORCE_CATEGORY_INTERWIKI.\n"); }	enforceCategoryInterwiki($arguments[0]); }

# NOTE(review): "while ()" below was presumably "while (<STDIN>)" before
# wiki mangling stripped the angle-bracket readline expression.
elsif ($command eq "STOP") {	myLog ("Stopped."); die ("Stopped."); }   elsif (($command eq "READ_COMMANDS")	   or ($command eq "")) {	while () {	   $line = $_;
# NOTE(review): commented-out ENFORCE_CFD branch (broken by changes on
# WP:CFD, per the original comment).
# * 1) Broken due to recent changes on WP:CFD
# * 2)    elsif ($command eq "ENFORCE_CFD")
# * 3) 	enforceCFD;
# * 4)    }
# * 1)    }

# Guard against a command stream that re-invokes READ_COMMANDS.
if ($line =~ m/READ_COMMANDS/) {		myLog ("interpretCommands: Infinite loop!"); die ("interpretCommands: Infinite loop!"); }

# Skip blank lines; trim trailing whitespace and a leading wiki bullet.
if ($line =~ m/^\s*$/) {		next; }	   $line =~ s/\s+$//s; $line =~ s/\*\s*//;

# "[[From]] -> [[To]]" lines become ($command, $from, $to) with spaces
# turned into underscores; otherwise each [[Page]] link is unwrapped in
# place and the whole line re-dispatched word by word.
if ($line =~ m/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]/) {		$line =~ s/\[\[:?(.*?)\]\] -> \[\[:?(.*?)\]\]//; $from = $1; $to = $2; $line =~ s/\s*$//; $from =~ s/ /_/g; $to =~ s/ /_/g; interpretCommand($line, $from, $to); }	   else {		while ($line =~ m/\[\[:?(.*?)\]\]/) {		   $line =~ m/\[\[:?(.*?)\]\]/; $page = $1; $pageCopy = $page; $page =~ s/ /_/g; $line =~ s/\[\[:?$pageCopy\]\]/$page/; }		interpretCommand(split (" ", $line)); }

# Rate-limit between commands (read-only checks excepted).
unless ($line =~ m/TRANSFER_TEXT_CHECK/) {		limit; }	}	myLog ("Execution complete.\n"); print ("Execution complete.\n"); }   else {	myLog ("Unrecognized command '".$command."': ".join(" ", @arguments)."\n"); die ("Unrecognized command '".$command."': ".join(" ", @arguments)); } }
# * 1) 	   unless (($line =~ m/TRANSFER_TEXT_CHECK/) or
# * 2) 	            ($line =~ m/ENFORCE_CATEGORY_INTERWIKI/))

sub limit {
    # Rate-limiting to avoid hosing the wiki server.
    # Min 30 sec unmarked, min 10 sec marked; the delay may be raised
    # elsewhere (retry) when server load is heavy.
    #
    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change the delay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################
    $::speedLimit = 10 if ($::speedLimit < 10);

    # Count down on STDERR, one second per tick ($::speedLimit + 1 ticks).
    for (my $remaining = $::speedLimit; $remaining >= 0; $remaining--) {
        sleep (1);
        print STDERR "Sleeping $remaining seconds...\r";
    }
    # Wipe the countdown line.
    print STDERR "                                   \r";
}

# postSTDIN($articleName, $comment)
#
# Replace the page $articleName with text read from standard input and
# post it with edit summary $comment ("Automated post" if empty).  Dies
# on empty input.  The POST_STDIN_NULLOK command wraps this call with
# $::nullOK = "yes" so getPage tolerates a not-yet-existing page.
# Example: perl pearle.pl POST_STDIN User:Whobot/categories-alpha "Update from 13 Oct 2004 database dump"
sub postSTDIN {   my ($text, $articleName, $comment, $editTime, $startTime, $junk, $token);

$articleName = $_[0]; $comment = $_[1];

#urlSafe($articleName);

# NOTE(review): "while ()" is an infinite loop as written; the original
# presumably read "while (<STDIN>)" and the angle-bracket readline was
# stripped by wiki HTML mangling.  Confirm against an intact copy.
while () {	$text .= $_; }

# Fetch the page only for its edit timestamps and token; the current
# text ($junk) is discarded and fully replaced.
if ($text =~ m/^\s*$/) {	myLog ("postSTDIN: Null input.\n"); die ("postSTDIN: Null input.\n"); }   ($junk, $editTime, $startTime, $token) = getPage($articleName);

if ($comment eq "") {	$comment = "Automated post"; }   postPage ($articleName, $editTime, $startTime, $token, $text, $comment); }

sub addToCat {
    # Add a page to a category, posting the edit with a WP:CFD summary.
    # Usage: perl pearle.pl ADD_TO_CAT Page_name Category:Category_name sortkey
    #
    # $_[0] - page to edit
    # $_[1] - category to add (Category:...)
    # $_[2] - optional sort key
    my ($page, $cat, $key) = @_;

    #urlSafe($page); #urlSafe($cat);

    # Fetch current wikitext plus the timestamps/token needed to post.
    my ($body, $editStamp, $startStamp, $csrf) = getPage($page);

    my $summary = "Add ${cat} per WP:CFD";

    # Delegate the actual text surgery; bail out quietly on failure.
    my ($outcome, $newBody) = addCatToText($cat, $body, $key, $page);
    return unless $outcome eq "success";

    postPage ($page, $editStamp, $startStamp, $csrf, $newBody, $summary);
}

# myLog($message)
#
# Append $message verbatim to the bot's log file (whobot-log.txt in the
# current directory).  Dies if the file cannot be opened or closed.
# Fixed: use a lexical filehandle and 3-arg open instead of the bareword
# LOG / 2-arg open, and check close() so buffered write errors surface.
sub myLog {
    my ($message) = @_;
    open (my $log, '>>', "whobot-log.txt") || die "Could not append to log!";
    print {$log} $message;
    close ($log) || die "Could not append to log!";
}

# getPage($target)
#
# Fetch the edit form for page $target and return the 4-tuple
# ($text, $editTime, $startTime, $token): the current wikitext, the
# hidden wpEdittime/wpStarttime stamps, and the wpEditToken required to
# post the page back.  Dies on a null target, on lost login, on null
# text/time (unless $::nullOK eq "yes"), and on unhandled HTTP errors;
# 500/502/503 responses are retried via retry().
#
# NOTE(review): wiki mangling collapsed multi-line code onto single
# lines; several lines below beginning with "#" contain code (the
# timing capture, the request construction) that was live in the
# original.  Confirm against an intact copy before running.
sub getPage {

my ($target, $request, $response, $reply, $text, $text2,	$editTime, $startTime, $attemptStartTime, $attemptFinishTime,	$token);

$target = $_[0];

if ($target =~ m/^\s*$/) {	myLog("getPage: Null target."); die("getPage: Null target."); }

# urlSafe ($target);

# Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time;

# Create a request-object print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n"; myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit"); $response = $::ua->request($request);

if ($response->is_success) {	$reply = $response->content;

# Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time; retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime)); # This detects whether or not we're logged in. unless ($reply =~ m%My talk%) {	   # We've lost our identity. myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n"); die ("Wiki server is not recognizing me (1).\n"); }

# Extract the wikitext from the edit form.
# NOTE(review): this regex lost its HTML delimiters (presumably the
# <textarea ...>...</textarea> markers) to wiki mangling; as written it
# matches almost nothing useful.  Restore from an intact copy.
$reply =~ m%(.*?) %s; $text = $1;

# Edit timestamp from the hidden wpEdittime form field.
$reply =~ m/value="(\d+)" name="wpEdittime"/; $editTime = $1;

# Added 22 Aug 2005 to correctly handle articles that have # been undeleted $reply =~ m/value="(\d+)" name="wpStarttime"/; $startTime = $1;

# Added 9 Mar 2005 after recent software change. $reply =~ m/value="(\w+)" name="wpEditToken"/; $token = $1; ###

# Sanity checks: refuse to proceed on empty text or timestamp unless
# the caller has explicitly allowed it via $::nullOK.
if (($text =~ m/^\s*$/)	   and ($::nullOK ne "yes")) {	   myLog ("getPage($target): Null text!\n"); myLog "\n---\n$reply\n---\n"; die ("getPage($target): Null text!\n"); }

if (($editTime =~ m/^\s*$/)	   and ($::nullOK ne "yes")) {	   myLog ("getPage($target): Null time!\n"); myLog "\n---\n$reply\n---\n"; die ("getPage($target): Null time!\n"); }

# NOTE(review): garbled line - originally a check for stray markup in
# $text followed by an HTML::Entities::decode_entities($text) call;
# parts of both are missing here.
if (($text =~ m/>/) or	   ($text =~ m/ " ) etc	# This function is from HTML::Entities.	decode_entities($text);

# This may or may not actually work $::ua->cookie_jar->save;

return ($text, $editTime, $startTime, $token); }    else {	myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n"); print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/)	   or ($response->status_line =~ m/^502/)	    or ($response->status_line =~ m/^503/)) {	   return(retry("getPage", @_)); }	else {	   # Unhandled HTTP response die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n"); }   } }

# postPage($pageName, $editTime, $startTime, $editToken, $textToPost,
#          $summaryEntry, $minor)
#
# Submit $textToPost as the new content of $pageName via the MediaWiki
# edit form, using the timestamps and token previously obtained from
# getPage (this is how MediaWiki detects edit conflicts).  $summaryEntry
# is truncated to 200 chars; "yes" in $minor marks the edit minor.
# Returns the response body on success.  Dies on deleted-article
# conflicts, unexpected status/Location, and edit conflicts; retries via
# retry() on 500/502/503.
#
# NOTE(review): wiki mangling collapsed multi-line code onto single
# lines; lines below beginning with "#" may contain code that was live
# (or deliberately commented) in the original - notably the
# "$::speedLimit = 60 * 10;" buried in the debug comment.  Confirm
# against an intact copy.
sub postPage {   my ($request, $response, $pageName, $textToPost, $summaryEntry,	$editTime, $startTime, $actual, $expected, $attemptStartTime,	$attemptFinishTime, $date, $editToken, $minor);

$pageName = $_[0]; $editTime = $_[1]; $startTime = $_[2]; $editToken = $_[3]; $textToPost = $_[4]; $summaryEntry = $_[5]; # Max 200 chars! $minor = $_[6];

$summaryEntry = substr($summaryEntry, 0, 200);

if ($pageName eq "") {	myLog ("postPage: Empty pageName.\n"); die ("postPage: Empty pageName.\n"); }

if ($summaryEntry eq "") {	$summaryEntry = "Automated editing."; }   # Monitor server responsiveness $attemptStartTime = Time::HiRes::time;

# Build the form POST; the two variants differ only in wpMinoredit.
if ($minor eq "yes") {	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpMinoredit => "on", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis }   else {	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis, wpMinoredit }

# ---   ## If posts are failing, you can uncomment the below to see what ## HTTP request is being made. # myLog($request->as_string); # print $request->as_string;	$::speedLimit = 60 * 10; # print $::ua->request($request)->as_string; # ---

myLog("POSTing..."); print "POSTing..."; # Pass request to the user agent and get a response back $response = $::ua->request($request); myLog("POSTed.\n"); print "POSTed.\n";

# Server asks for confirmation when recreating a deleted article.
if ($response->content =~ m/Please confirm that really want to recreate this article./) {	myLog ($response->content."\n"); die ("Deleted article conflict! See log!"); }

# Check the outcome of the response if (($response->is_success) or ($response->is_redirect)) {	# Monitor server responsiveness $attemptFinishTime = Time::HiRes::time; retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

# A successful save normally answers "302 Moved Temporarily"; plain
# "200 OK" is also tolerated.
$expected = "302 Moved Temporarily"; $actual = $response->status_line; if (($expected ne $actual)	   and ($actual ne "200 OK")) {	   myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n"); myLog ($request->as_string); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n");

die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n"); }

# The redirect Location should point back at the saved page.
$expected = "http://en.wikipedia.org/wiki/${pageName}"; $expected =~ s/\'/%27/g; $expected =~ s/\*/%2A/g; # $expected = urlEncode($expected);

$actual = $response->headers->header("Location");

if (($expected ne $actual) 	   and !(($actual eq "") and ($response->status_line eq "200 OK"))) { 	   myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n"); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n"); die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n"); }

if ($response->content =~ m/Edit conflict/) {	   myLog ("Edit conflict on '$pageName' at '$editTime'!\n"); die ("Edit conflict on '$pageName' at '$editTime'!\n"); }

$::ua->cookie_jar->save; return ($response->content); }   else {	$date = `date /t`; $date =~ s/\n//g; myLog ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");

# 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/)	   or ($response->status_line =~ m/^502/)	    or ($response->status_line =~ m/^503/)) {	   print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n"; return(retry("postPage", @_)); }	else {	   # Unhandled HTTP response die ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n"); }   } }

sub urlSafe {
    # Historical validator: dies (after logging) if $_[0] contains any
    # character outside a conservative whitelist.  No longer called in
    # normal operation, because LWP::UserAgent and HTTP::Request handle
    # character escaping themselves.
    my ($candidate) = @_;
    my $leftover = $candidate;

    # Strip every whitelisted character; anything left is unsafe.
    # & may not be included in this list!
    $leftover =~ s%[\p{IsWord}\w\-,\(\):\/\'\.\;\!]*%%g;

    return if $leftover eq "";

    myLog ("urlSafe: Bad character in ${candidate}: '${leftover}'\n");
    die ("urlSafe: Bad character in ${candidate}: '${leftover}'\n");
}

# moveCategoryContents($categoryFrom, $categoryTo, $includeCategories,
#                      $includeSortkey)
#
# Move the text of one category page to another (via transferText) and
# re-categorize every member article - and, when $includeCategories eq
# "yes", every subcategory - from $categoryFrom to $categoryTo.  Both
# names may be given as "[[:Category:...]]" links; dies unless both
# start with "Category:".
# Usage: perl pearle.pl MOVE_CONTENTS_INCL_CATS Category:From_here Category:To_here
#
# NOTE(review): wiki mangling fused comments with the statements that
# followed them; the "# Subcategory transfer ..." and "# Article
# transfer ..." lines below contain code that was live in the original.
sub moveCategoryContents {   my (@articles, $categoryFrom, $categoryTo, $article, $status,	@subcats, $includeCategories, $subcat, @junk, $sortkey,	$includeSortkey);

# -- INITIALIZATION --

$categoryFrom = $_[0]; $categoryTo = $_[1]; $includeCategories = $_[2]; $includeSortkey = $_[3];

# Unwrap [[:Category:...]] link syntax and normalize spaces to
# underscores.
if ($categoryFrom =~ m/^\[\[:(Category:.*?)\]\]/) {	$categoryFrom =~ s/^\[\[:(Category:.*?)\]\]/$1/; $categoryFrom =~ s/\s+/_/g; }

if ($categoryTo =~ m/^\[\[:(Category:.*?)\]\]/) {	$categoryTo =~ s/^\[\[:(Category:.*?)\]\]/$1/; $categoryTo =~ s/\s+/_/g; }

$categoryFrom =~ s/^\[\[://; $categoryTo =~ s/^\[\[://; $categoryFrom =~ s/\]\]$//; $categoryTo =~ s/\]\]$//;

unless (($categoryFrom =~ m/^Category:/) and	   ($categoryTo =~ m/^Category:/)) {	myLog ("moveCategoryContents: Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n"); die ("moveCategoryContents: Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n"); }

# Move the category page's own text first.
transferText ($categoryFrom, $categoryTo);

# Subcategory transfer if ($includeCategories eq "yes") {	@subcats = getSubcategories($categoryFrom); foreach $subcat (@subcats) {	   if ($subcat =~ m/^\s*$/) {		next; }

$subcat = urlDecode($subcat); print "changeCategory($subcat, $categoryFrom, $categoryTo) c\n"; myLog "changeCategory($subcat, $categoryFrom, $categoryTo) c\n"; changeCategory($subcat, $categoryFrom, $categoryTo); limit; }   }

# Article transfer @articles = getCategoryArticles($categoryFrom);

# Iterate in reverse order (original forward loop kept below for
# reference, mangled into a bullet by the wiki renderer).
foreach $article (reverse(@articles)) {	if ($article =~ m/^\s*$/) {	   next; }
# * 1)    foreach $article (@articles)

$article = urlDecode($article); print "changeCategory($article, $categoryFrom, $categoryTo) a\n"; myLog "changeCategory($article, $categoryFrom, $categoryTo) a\n"; changeCategory($article, $categoryFrom, $categoryTo); limit; } }

# depopulateCat($category)
#
# Remove every subcategory and every article from $category (via
# removeXFromCat) without adding them anywhere else.  The name may be
# given as a "[[:Category:...]]" link; dies unless it starts with
# "Category:".
# Usage: perl pearle.pl DEPOPULATE_CAT Category:To_be_depopulated
#
# NOTE(review): wiki mangling fused the "# Remove all ..." comments with
# the statements that followed them; those lines contain code that was
# live in the original.
sub depopulateCat #($category); {   my (@articles, $category, $article, $status, @subcats, $subcat, @junk);

$category = $_[0];

# Unwrap [[:Category:...]] link syntax; spaces become underscores.
if ($category =~ m/^\[\[:(Category:.*?)\]\]/) {	$category =~ s/^\[\[:(Category:.*?)\]\]/$1/; $category =~ s/\s+/_/g; }

unless ($category =~ m/^Category:/)

{	myLog ("depopulateCat: Are you sure '$category' is a category?\n"); die ("depopulateCat: Are you sure '$category' is a category?\n"); }

# Remove all subcategories @subcats = getSubcategories($category); foreach $subcat (@subcats) {	$subcat = urlDecode($subcat);

print "removeXFromCat($subcat, $category) c\n"; myLog "removeXFromCat($subcat, $category) c\n"; ($status, @junk) = removeXFromCat($subcat, $category); unless ($status == 0) {	   myLog ("Status: $status\n"); print "Status: $status\n"; }   }

# Remove all articles @articles = getCategoryArticles($category); foreach $article (reverse(@articles)) #foreach $article (@articles) {	$article = urlDecode($article);

print "removeXFromCat($article, $category) a\n"; myLog "removeXFromCat($article, $category) a\n"; ($status, @junk) = removeXFromCat($article, $category); unless ($status == 0) {	   myLog ("Status: $status\n"); print "Status: $status\n"; }   } }

# removeXFromCat($articleName, $category)
#
# Remove the [[Category:...]] link for $category from $articleName and
# post the edit with a WP:CFD summary.  Handles both space and
# underscore spellings of the category name and optional whitespace
# after "Category:".
# Usage: perl pearle.pl REMOVE_X_FROM_CAT Article_name Category:Where_the_article_is
#
# Returns: 1 if the page is not in the category (after an optional
# "mostly null" edit, see below), 2 if the page is a redirect, or
# (0, $sortkey) on success.
#
# Fixed: error messages wrongly said "addToCat:" (copy-paste from that
# sub), and the underscore-spelling fallback sort-key match discarded
# its capture instead of assigning it to $sortkey.
sub removeXFromCat {

my ($text, $articleName, $category, $editTime, $startTime, $comment,
    $sortkey, @junk, $token, $categoryUnd, $categoryHuman);

$articleName = $_[0]; $category = $_[1];
#urlSafe($articleName); #urlSafe($category);

unless ($category =~ m/^Category:\w+/) {
    myLog ("removeXFromCat: Bad format on category.\n");
    die ("removeXFromCat: Bad format on category.\n");
}

($text, $editTime, $startTime, $token) = getPage($articleName);
$comment = "Removed ${category} per WP:CFD";

# Convert underscores to spaces; this is human-readable.
$category =~ s/_/ /g;
$categoryHuman = $category;

# Turn the category name into a regex: allow whitespace after
# "Category:" and escape the metacharacters that appear in real
# category names.  $categoryUnd is the underscore-spelled variant.
$category =~ s/^Category://;
$category = "Category:\\s*".$category;
$category =~ s%\(%\\(%g;
$category =~ s%\)%\\)%g;
$category =~ s%\'%\\\'%g;
$categoryUnd = $category;
$categoryUnd =~ s/ /_/g;

unless (($text =~ m/\[\[\s*${category}\s*\]\]/is)
    or ($text =~ m/\[\[\s*${category}\s*\|.*?\]\]/is)
    or ($text =~ m/\[\[\s*${categoryUnd}\s*\]\]/is)
    or ($text =~ m/\[\[\s*${categoryUnd}\s*\|.*?\]\]/is)) {
    print "removeXFromCat: $articleName is not in '$category'.\n";
    myLog ("removeXFromCat: $articleName is not in '$category'.\n");

    ### TEMPORARY ###
    # Uncomment-era workaround kept from the original: make remove
    # attempts trigger a null edit, useful when a template's category
    # changed but member articles will not move until edited.
    ($text, @junk) = fixCategoryInterwiki($text);
    postPage ($articleName, $editTime, $startTime, $token, $text,
              "Mostly null edit to actually remove from ${categoryHuman}", "yes");
    limit;
    ### TEMPORARY ###
    return(1);
}

if ($text =~ m/^\s*\#REDIRECT/is) {
    print "removeXFromCat: $articleName is a redirect!\n";
    myLog ("removeXFromCat: $articleName is a redirect!\n");
    return(2);
}

# Capture the sort key, trying the space spelling first and falling
# back to the underscore spelling (bug fix: the fallback previously
# discarded its capture).
if ($text =~ m/\[\[\s*${category}\s*\|\s*(.*?)\]\]/is) {
    $sortkey = $1;
}
if ($sortkey eq "") {
    $sortkey = $1 if $text =~ m/\[\[\s*${categoryUnd}\s*\|\s*(.*?)\]\]/is;
}

# Remove the page from the category and any trailing newline.
$text =~ s/\[\[\s*${category}\s*\|?(.*?)\]\]\n?//isg;
$text =~ s/\[\[\s*${categoryUnd}\s*\|?(.*?)\]\]\n?//isg;

($text, @junk) = fixCategoryInterwiki($text);

postPage ($articleName, $editTime, $startTime, $token, $text, $comment);
return(0, $sortkey);
}

# printWikitext($target)
#
# Fetch a page and save its wikitext to ./<target>; also save the
# rendered HTML to ./<target>.html via wget.
# Usage: perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get
# Warning: Saves to a file in the current directory with the same name
# as the article, plus another file with the .html extension.
#
# Fixed: the wikitext file was opened with an unchecked 2-arg bareword
# open; now a checked 3-arg open with a lexical filehandle.
sub printWikitext {
    my ($editTime, $startTime, $text, $target, $token);

    $target = $_[0];

    # Unwrap [[...]] link syntax.
    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;

    ($text, $editTime, $startTime, $token) = getPage($target);

    # Save the wikicode version to a file.
    # NOTE(review): $target comes from the command line and is used both
    # as a filename and, below, inside a shell command - do not feed
    # this untrusted input.
    open (my $wikitext, '>', "./${target}")
        or die "printWikitext: Could not write ./${target}: $!\n";
    print {$wikitext} $text;
    close ($wikitext)
        or die "printWikitext: Could not close ./${target}: $!\n";

    # Save the HTML version to a file.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}

# getCategoryArticles($target)
#
# Fetch the rendered page for category $target and return the list of
# member article names scraped from the HTML.  Dies unless $target
# starts with "Category:", on lost login, and on unhandled HTTP errors;
# retries via retry() on 500/502/503.
#
# NOTE(review): wiki mangling collapsed multi-line code onto single
# lines; lines below beginning with "#" (timing capture, request
# construction) contain code that was live in the original, and the
# member-extraction block near the end lost its HTML scraping regex
# entirely ("m%cookie_jar->save;" is several fused lines with the
# middle missing).  Restore from an intact copy before running.
sub getCategoryArticles {   my ($target, $request, $response, $reply, $articles, $article,	@articles, $attemptStartTime, $attemptFinishTime);

$target = $_[0];

#urlSafe ($target);

unless ($target =~ m/^Category:/)

{	myLog ("getCategoryArticles: Are you sure '$target' is a category?\n"); die ("getCategoryArticles: Are you sure '$target' is a category?\n"); }

# Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time;

# Create a request-object print "GET http://en.wikipedia.org/wiki/${target}\n"; myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = $::ua->request($request);

if ($response->is_success) {	# Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time; retry ("success", "getCategoryArticles", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

$reply = $response->content;

# This detects whether or not we're logged in. unless ($reply =~ m%My talk%) {	   # We've lost our identity. myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n"); die ("Wiki server is not recognizing me (2).\n"); }

# Trim the reply down to the "Articles in category" section and scrape
# the member links (regex mangled - see NOTE above).
$articles = $reply; $articles =~ s%^.*? Articles in category.*? %%s; $articles =~ s% .*?$%%s; @articles = $articles =~ m%cookie_jar->save; return @articles; }    else {	myLog ("getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n");

# 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/)	   or ($response->status_line =~ m/^502/)	    or ($response->status_line =~ m/^503/)) {	   print "getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"; return(retry("getCategoryArticles", @_)); }	else {	   # Unhandled HTTP response die ("getCategoryArticles($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n"); }   } }

# getSubcategories($target)
#
# Fetch the rendered page for category $target and return the list of
# subcategory names scraped from the "Subcategories" section of the
# HTML; returns an empty list when that section is absent.  Dies unless
# $target starts with "Category:", on lost login, and on unhandled HTTP
# errors; retries via retry() on 500/502/503.
#
# NOTE(review): same wiki mangling as getCategoryArticles - fused
# comment/code lines and a scraping regex whose middle is missing
# ("m%cookie_jar->save;").  Also, the retry() fallback near the end
# passes the label "getCategoryArticles" rather than "getSubcategories"
# - looks like a copy-paste slip; confirm against an intact copy.
sub getSubcategories {   my ($target, $request, $response, $reply, $subcats, $subcat,	@subcats, $attemptStartTime, $attemptFinishTime);

$target = $_[0];

#urlSafe ($target);

unless ($target =~ m/^Category:/)

{	myLog ("getSubcategories: Are you sure '$target' is a category?\n"); die ("getSubcategories: Are you sure '$target' is a category?\n"); }

# Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time;

# Create a request-object print "GET http://en.wikipedia.org/wiki/${target}\n"; myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = $::ua->request($request);

if ($response->is_success) {	# Monitor wiki server responsiveness $attemptFinishTime = Time::HiRes::time; retry ("success", "getSubcategories", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));

$reply = $response->content;

# This detects whether or not we're logged in. unless ($reply =~ m%My talk%) {	   # We've lost our identity. myLog ("Wikipedia is not recognizing me (3).\n---\n${reply}\n---\n"); die ("Wikipedia is not recognizing me (3).\n"); }

$subcats = $reply;

# Isolate the "Subcategories" section; bail out (empty list) when the
# category has no subcategories.
if ($subcats =~ m%^.*? Subcategories (.*?) Articles in category.*? .*?$%s) {	   $subcats =~ s%^.*? Subcategories (.*?) Articles in category.*? .*?$%$1%s; }	else {	   return ; }

# Scrape the subcategory links (regex mangled - see NOTE above).
@subcats = $subcats =~ m%cookie_jar->save; return @subcats; }    else {	myLog ("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server if (($response->status_line =~ m/^500/)	   or ($response->status_line =~ m/^502/)	    or ($response->status_line =~ m/^503/)) {	   print "getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n".$response->content."\n"; return(retry("getCategoryArticles", @_)); }	else {	   # Unhandled HTTP response die ("getSubcategories($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/wiki/${target}\n"); }   } }

# addCFDTag($category)
#
# Tag a category page as nominated for deletion/renaming.  Skips the
# edit (with a log message) when the page already carries a cfd/cfm/cfr
# tag or is a redirect.  $::nullOK is set around getPage so a
# not-yet-existing category page is tolerated.
# Usage: perl pearle.pl ADD_CFD_TAG Category:Category_name
#
# NOTE(review): the tag-insertion line below is fused into a comment by
# wiki mangling, and the tag text itself (presumably "{{cfd}}") was
# stripped - as written nothing would be prepended.  Restore from an
# intact copy before running.
sub addCFDTag {   my ($text, $category, $editTime, $startTime, $comment, $catTmp, @junk, $token);

$category = $_[0];

#urlSafe($category);

unless ($category =~ m/^Category:\w+/) {	myLog ("addCFDTag: Bad format on category.\n"); die ("addCFDTag: Bad format on category.\n"); }

$::nullOK = "yes"; ($text, $editTime, $startTime, $token) = getPage($category); $::nullOK = "no";

$comment = "Nominated for deletion or renaming";

# Already tagged?  Don't double-tag.
if (($text =~ m/\{\{cfd\}\}/is) or	($text =~ m/\{\{cfm/is) or	($text =~ m/\{\{cfr/is)) {	print "addCFDTag: $category is already tagged.\n"; myLog ("addCFDTag: $category is already tagged.\n"); return; }

if ($text =~ m/^\s*\#REDIRECT/is) {	print "addCFDTag: $category is a redirect!\n"; myLog ("addCFDTag: $category is a redirect!\n"); return; }

# Add the CFD tag to the beginning of the page. $text = "\n".$text;

($text, @junk) = fixCategoryInterwiki($text);

postPage ($category, $editTime, $startTime, $token, $text, $comment); }

# addCFDUTag($category, $stuff)
#
# Like addCFDTag, but for the CFDU (deletion/renaming with extra
# parameter $stuff) variant.  Skips the edit when the page already
# carries a cfd/cfm/cfr tag or is a redirect; $::nullOK is set around
# getPage so a missing category page is tolerated.
# Usage: perl pearle.pl ADD_CFDU_TAG Category:Category_name
#
# NOTE(review): as in addCFDTag, the tag-insertion line is fused into a
# comment by wiki mangling and the tag text (presumably a {{cfdu...}}
# template using $stuff) was stripped.  Restore from an intact copy.
sub addCFDUTag {   my ($text, $category, $editTime, $startTime, $comment, $catTmp, @junk, $token, $stuff);

$category = $_[0]; $stuff = $_[1];

urlSafe($category); #urlSafe($stuff);

unless ($category =~ m/^Category:\w+/) {	myLog ("addCFDUTag: Bad format on category.\n"); die ("addCFDUTag: Bad format on category.\n"); }

$::nullOK = "yes"; ($text, $editTime, $startTime, $token) = getPage($category); $::nullOK = "no";

$comment = "Nominated for deletion or renaming"; #$comment = "Test edit";

# Already tagged?  Don't double-tag.
if (($text =~ m/\{\{cfd\}\}/is) or	($text =~ m/\{\{cfm/is) or	($text =~ m/\{\{cfr/is)) {	print "addCFDUTag: $category is already tagged.\n"; myLog ("addCFDUTag: $category is already tagged.\n"); return; }

if ($text =~ m/^\s*\#REDIRECT/is) {	print "addCFDUTag: $category is a redirect!\n"; myLog ("addCFDUTag: $category is a redirect!\n"); return; }

# Add the CFDU tag to the beginning of the page. $text = "\n".$text; # $text = $stuff;

($text, @junk) = fixCategoryInterwiki($text);

postPage ($category, $editTime, $startTime, $token, $text, $comment); }

# removeCFDTag($category)
#
# Remove the cfd/cfr/cfm/cfdu/cfru tag from a category page after the
# deletion discussion closes.  Skips the edit when the page is not
# tagged or is a redirect; $::nullOK is set around getPage so a missing
# category page is tolerated.
# Usage: perl pearle.pl REMOVE_CFD_TAG Category:Category_name
#
# NOTE(review): the tag-removal line below is fused into a comment by
# wiki mangling, and the first substitution lost its pattern (probably
# "{{cfd}}") - as written, "s/\s*//gis" would strip ALL whitespace from
# the page.  Do not run without restoring from an intact copy.
sub removeCFDTag {   my ($text, $category, $editTime, $startTime, $comment, $catTmp, @junk, $token);

$category = $_[0];

#urlSafe($category);

unless ($category =~ m/^Category:\w+/) {	myLog ("removeCFDTag: Bad format on category.\n"); die ("removeCFDTag: Bad format on category.\n"); }

$::nullOK = "yes"; ($text, $editTime, $startTime, $token) = getPage($category); $::nullOK = "no";

$comment = "De-listed from Categories for deletion";

# Nothing to do if the page carries no CFD-family tag.
unless (($text =~ m/\{\{cfd\}\}/is) or	   ($text =~ m/\{\{cfm/is) or	    ($text =~ m/\{\{cfr/is)) {	print "removeCFDTag: $category is not tagged.\n"; myLog ("removeCFDTag: $category is not tagged.\n"); return; }

if ($text =~ m/^\s*\#REDIRECT/is) {	print "removeCFDTag: $category is a redirect!\n"; myLog ("removeCFDTag: $category is a redirect!\n"); return; }

# Remove the CFD tag. $text =~ s/\s*//gis; $text =~ s/\{\{cfr.*?\}\}\s*//is; $text =~ s/\{\{cfm.*?\}\}\s*//is; $text =~ s/\{\{cfdu.*?\}\}\s*//is; $text =~ s/\{\{cfru.*?\}\}\s*//is;

($text, @junk) = fixCategoryInterwiki($text);

postPage ($category, $editTime, $startTime, $token, $text, $comment); }


# perl pearle.pl TRANSFER_TEXT Category:From_here Category:To_there
#
# Note that this code is called automatically whenever moving a
# category, so you probably don't need to call it yourself from the
# command line.

# transferText: merge the descriptive text of one category page into
# another, as part of moving/renaming a category per WP:CFD.
#
# Usage: perl pearle.pl TRANSFER_TEXT Category:From_here Category:To_there
# (Also called automatically whenever a category is moved.)
#
# Arguments: $_[0] = source category, $_[1] = destination category.
# Side effects: posts updated wikicode to one or both pages.
sub transferText {

    my ($source, $destination, $sourceText, $destinationText,
        $sourceTime, $destinationTime, @sourceCategories,
        @destinationCategories, $category, $lastCategory,
        $sourceTextOrig, $destinationTextOrig, $comment, $sourceHuman,
        $destinationHuman, $noMergeFlag, $sourceToken,
        $destinationToken, $junk, $sourceStartTime,
        $destinationStartTime);

    $source = $_[0];
    $destination = $_[1];
    $comment = "Cleanup per WP:CFD (moving $source to $destination)";

    # Make human-readable versions of these variables for use in edit summaries
    $sourceHuman = $source;
    $sourceHuman =~ s/_/ /g;
    $destinationHuman = $destination;
    $destinationHuman =~ s/_/ /g;

    unless (($source =~ m/^Category:/) and ($destination =~ m/^Category:/)) {
        myLog ("transferText: Are you sure these are categories? ".$source."/".$destination."\n");
        die ("transferText: Are you sure these are categories? ".$source."/".$destination."\n");
    }

    ($sourceText, $sourceTime, $sourceStartTime, $sourceToken) = getPage($source);

    # Avoid double runs!
    # This text must be the same as that which is implanted below, and
    # it should be an HTML comment, so that it's invisible.
    if ($sourceText =~ m/<\!--WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG-->/) {
        return;
    }

    # Strip CFD-family tags and surrounding whitespace from the source.
    # NOTE(review): the first substitution has an empty pattern (s///),
    # which in Perl silently reuses the last successful pattern; its
    # original pattern appears lost in transcription -- confirm before
    # running.
    $sourceTextOrig = $sourceText;
    $sourceText =~ s///;
    $sourceText =~ s/\{\{cfr.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfm.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfdu.*?\}\}\s*//is;
    $sourceText =~ s/\{\{cfru.*?\}\}\s*//is;
    $sourceText =~ s/^\s+//s;
    $sourceText =~ s/\s+$//s;

    # The destination may legitimately not exist yet.
    $::nullOK = "yes";
    ($destinationText, $destinationTime, $destinationStartTime, $destinationToken) = getPage($destination);
    $::nullOK = "no";

    # Same cleanup for the destination (same empty-pattern caveat).
    $destinationTextOrig = $destinationText;
    $destinationText =~ s///;
    $destinationText =~ s/\{\{cfm.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfr.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfdu.*?\}\}\s*//is;
    $destinationText =~ s/\{\{cfru.*?\}\}\s*//is;
    $destinationText =~ s/^\s+//s;
    $destinationText =~ s/\s+$//s;

    # To help keep things straight when we're in a loop.
    print STDOUT "\n\n";

    # Three cases: only the destination has text, only the source has
    # text, or both do (merge bodies and de-duplicate category tags).
    # NOTE(review): the replacement source text below was presumably
    # wrapped in the WHOBOT-MOVE-CATEGORY-CONTENTS-SRC-FLAG HTML comment
    # checked above; the comment markup appears lost in transcription.
    if (($sourceText eq "") and ($destinationText ne "")) {
        # The HTML comment must be the same as that above.
        $sourceText = "\nThis category has been moved to $destinationHuman. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on WP:CFD.\n\n";
    }
    elsif (($sourceText ne "") and ($destinationText eq "")) {
        $destinationText = $sourceText;
        # The HTML comment must be the same as that above.
        $sourceText = "\nThis category has been moved to $destinationHuman. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on WP:CFD.\n\n";
    }
    elsif (($sourceText ne "") and ($destinationText ne "")) {
        # Collect category tags from both pages, strip them from the
        # bodies, concatenate the bodies, then re-append the merged,
        # de-duplicated category list.
        @sourceCategories = $sourceText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs;
        @destinationCategories = $destinationText =~ m/\[\[\s*(Category:.*?)\s*\]\]/gs;

        $sourceText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs;
        $sourceText =~ s/^\s+//s;
        $sourceText =~ s/\s+$//s;
        $destinationText =~ s/\[\[\s*(Category:.*?)\s*\]\]\s*//gs;
        $destinationText =~ s/^\s+//s;
        $destinationText =~ s/\s+$//s;

        $destinationText = $sourceText."\n".$destinationText;
        $destinationText =~ s/^\s+//s;
        $destinationText =~ s/\s+$//s;
        # NOTE(review): appending "\n${category}" without "[[...]]" link
        # brackets looks like the brackets were lost in transcription.
        foreach $category (sort (@sourceCategories, @destinationCategories)) {
            if ($category eq $lastCategory) {
                next;   # skip duplicates (list is sorted)
            }
            $destinationText .= "\n${category}";
            $lastCategory = $category;
        }
        # The HTML comment must be the same as that above.
        $sourceText = "\nThis category has been moved to $destinationHuman. Any remaining articles and subcategories will soon be moved by a bot unless otherwise noted on WP:CFD.\n\n";
    }

    # Collapse whitespace-only lines.
    $sourceText =~ s/\n\s+\n/\n\n/sg;
    $destinationText =~ s/\n\s+\n/\n\n/sg;

    # You may need to futz with this, depending on the templates
    # currently in use.
    # NOTE(review): "m/\{\{cfru|/" and "m/\{\{cfdu|/" contain an empty
    # alternation, so they match ANY string and this sanity check never
    # fires; the template names in the error messages were lost in
    # transcription as well.  Confirm against the original source.
    unless (($sourceTextOrig =~ m/\{\{cfd/)
            or ($sourceTextOrig =~ m/\{\{cfr/)
            or ($sourceTextOrig =~ m/\{\{cfru|/)
            or ($sourceTextOrig =~ m/\{\{cfdu|/)
            or ($sourceTextOrig =~ m/\{\{cfm/)) {
        print STDOUT "FATAL ERROR: $source was not tagged, , , , or !\n";
        myLog("FATAL ERROR: $source was not tagged, , , , or !\n");
        die("FATAL ERROR: $source was not tagged, , , , or !\n");
    }

    if (($sourceText eq $sourceTextOrig) and ($destinationText eq $destinationTextOrig)) {
        print STDOUT "No changes for $source and $destination.\n";
        return;
    }

    if ($destinationTextOrig =~ m/^\s*$/) {
        print "No merging was required from $source into $destination.\n";
        $noMergeFlag = "yes";
    }

    unless ($noMergeFlag eq "yes") {
        $destinationText .= "\n";
    }

    # Make sure category and interwiki links conform to style guidelines.
    ($destinationText, $junk) = fixCategoryInterwiki($destinationText);

    # If we did have to change things around, print the changes and post
    # them to the wiki.
    if ($sourceText ne $sourceTextOrig) {
        unless ($noMergeFlag eq "yes") {
            print STDOUT "SOURCE FROM:\n%%%${sourceTextOrig}%%%\nSOURCE TO:\n%%%${sourceText}%%%\n";
        }
        postPage ($source, $sourceTime, $sourceStartTime, $sourceToken, $sourceText, $comment);
    }

    if ($destinationText ne $destinationTextOrig) {
        unless ($noMergeFlag eq "yes") {
            print STDOUT "DESTINATION FROM:\n%%%${destinationTextOrig}%%%\nDESTINATION TO:\n%%%${destinationText}%%%\n";
        }
        postPage ($destination, $destinationTime, $destinationStartTime, $destinationToken, $destinationText, $comment);
    }
}

# urlDecode: translate %HH escapes in an HTTP URL-encoded string back to
# the native character set.
#
# Arguments: $_[0] - the encoded string
# Returns:   the decoded string (characters outside %HH escapes pass
#            through unchanged; '+' is NOT converted to space here)
sub urlDecode {
    my ($input) = @_;

    # Each %HH pair (HH = exactly two hex digits) becomes chr(hex(HH)).
    # Fixed: the original class "[a-f|A-F|0-9]" contained literal '|'
    # characters (alternation bars have no meaning inside a character
    # class), so a stray '|' could be mistaken for a hex digit.
    $input =~ s/\%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

    return ($input);
}

# urlEncode: translate from the native character set to HTTP URL
# encoding, using Wikipedia's (possibly non-standard) set of characters
# that are left unencoded.
#
# Arguments: $_[0] - the raw string
# Returns:   the encoded string (space becomes '+', unsafe characters
#            become %HH with uppercase hex)
sub urlEncode {
    my ($char, $input, $output);

    $input = $_[0];
    $output = "";   # initialize so empty input returns "" without warnings

    foreach $char (split("", $input)) {
        # The below exclusions should conform to Wikipedia practice
        # (possibly non-standard): letters, digits, and - _ . * / :
        # Fixed: the original class "[a-z|A-Z|0-9|...]" contained literal
        # '|' characters, which wrongly let '|' pass through unencoded.
        if ($char =~ m/[a-zA-Z0-9\-_\.\*\/:]/) {
            $output .= $char;
        }
        elsif ($char eq " ") {
            $output .= "+";
        }
        else {
            # %HH where HH is the character's hex code, zero-padded and
            # uppercase.  Fixed: the original "%%%x" emitted a single
            # digit for code points below 0x10 (e.g. "\n" -> "%a").
            $output .= sprintf("%%%02X", ord($char));
        }
    }

    return ($output);
}

# changeCategory: move one article from one category to another.
#
# Usage: perl pearle.pl CHANGE_CATEGORY Article_name Category:From Category:To
#
# Removes the Category:From tag (preserving any sort key it finds) and
# appends Category:To, then posts the result.  If the article is not in
# Category:From, optionally performs a null edit to flush template-based
# category membership.
sub changeCategory {

    my ($articleName, $categoryFrom, $categoryTo, $editTime, $startTime, $text,
        $comment, $catTmp, $sortkey, $token, $junk, $categoryFromUnd);

    $articleName = $_[0];
    $categoryFrom = $_[1];
    $categoryTo = $_[2];
    #urlSafe($articleName);
    #urlSafe($categoryFrom);
    #urlSafe($categoryTo);

    # NOTE(review): these error messages say "moveCategoryContents" but
    # this sub is changeCategory -- looks like a copy/paste slip.
    unless (($categoryFrom =~ m/^Category:/) and ($categoryTo =~ m/^Category:/)) {
        myLog ("moveCategoryContents: Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
        die ("moveCategoryContents: Are you sure these are categories? ".$categoryFrom."/".$categoryTo."\n");
    }

    if ($articleName =~ m/^\s*$/) {
        myLog("changeCategory: Null target.");
        die("changeCategory: Null target.");
    }

    ($text, $editTime, $startTime, $token) = getPage($articleName);
    $comment = "Recat per WP:CFD ${categoryFrom} to ${categoryTo}";

    # --- Start the removing part ---

    # Convert underscore to spaces; this is human-readable.
    $categoryFrom =~ s/_/ /g;

    # Turn $categoryFrom into a regex fragment: allow whitespace after
    # "Category:" and escape special characters.
    $categoryFrom =~ s/^Category://;
    $categoryFrom = "Category:\\s*".$categoryFrom;
    $categoryFrom =~ s%\(%\\(%g;
    $categoryFrom =~ s%\)%\\)%g;
    $categoryFrom =~ s%\'%\\\'%g;

    # Underscore variant of the same pattern (wikicode treats _ and
    # space as equivalent).
    $categoryFromUnd = $categoryFrom;
    $categoryFromUnd =~ s/ /_/g;

    unless (($text =~ m/\[\[\s*${categoryFrom}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFrom}\s*\|.*?\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\]\]/is)
            or ($text =~ m/\[\[\s*${categoryFromUnd}\s*\|.*?\]\]/is)) {
        myLog ("changeCategory.r: $articleName is not in '$categoryFrom'.\n");
        my ($nullEditFlag);

        # Set this to "yes" if you want mass category change attempts
        # to trigger null edits automatically.  You should check the
        # category later to see if everything worked or not, to see if
        # any templates should be changed.  The below will add a small
        # amount of unnecessary server load to try the null edits if
        # template changes haven't already been made.
        $nullEditFlag = "yes";

        if ($nullEditFlag eq "yes") {
            myLog ("changeCategory: Attempting null edit on $articleName.\n");
            print "changeCategory: Attempting null edit on $articleName.\n";
            nullEdit($articleName);
            return;
        }
        else {
            print "###${text}###\n";
            die ("changeCategory.r: $articleName is not in '$categoryFrom'.\n");
        }
    }

    if ($text =~ m/^\s*\#REDIRECT/is) {
        myLog ("changeCategory.r: $articleName is a redirect!\n");
        die ("changeCategory.r: $articleName is a redirect!\n");
    }

    # We're lazy and don't fully parse the document to properly check
    # for escaped category tags, so there may be some unnecssary
    # aborts from the following, but they are rare and easily
    # overridden by manually editing the page in question.
    # NOTE(review): the opening "<nowiki>" of this pattern appears lost
    # in transcription -- confirm against the original source.
    if ($text =~ m/ .*?category.*?<\/nowiki>/is) {
        myLog ("changeCategory.r: $articleName has a dangerous nowiki tag!\n");
        die ("changeCategory.r: $articleName has a dangerous nowiki tag!\n");
    }

    # Capture the sort key, if any.
    # NOTE(review): the underscore-variant match below never assigns its
    # capture to $sortkey, so a sort key found that way is discarded.
    $text =~ m/\[\[\s*${categoryFrom}\s*\|\s*(.*?)\]\]/is;
    $sortkey = $1;
    if ($sortkey eq "") {
        $text =~ m/\[\[\s*${categoryFromUnd}\s*\|\s*(.*?)\]\]/is;
    }

    # Remove the page from the category and any trailing newline.
    $text =~ s/\[\[\s*${categoryFrom}\s*\|?(.*?)\]\]\n?//isg;
    $text =~ s/\[\[\s*${categoryFromUnd}\s*\|?(.*?)\]\]\n?//isg;

    # --- Start the adding part ---

    # Remove any newlines at the end of the document.
    $text =~ s/\n*$//s;

    # Build a regex for the destination category: _ and spaces are
    # equivalent and may be intermingled in wikicode.
    $catTmp = $categoryTo;
    $catTmp =~ s/Category:\s*/Category:\\s*/g;
    $catTmp =~ s/_/[_ ]/g;
    $catTmp =~ s%\(%\\\(%g;
    $catTmp =~ s%\)%\\\)%g;
    $catTmp =~ s%\.%\\\.%g;
    if (($text =~ m/(\[\[\s*${catTmp}\s*\|.*?\]\])/is)
        or ($text =~ m/(\[\[\s*${catTmp}\s*\]\])/is)) {
        myLog ("changeCategory.a: $articleName is already in '$categoryTo'.\n");
        print "\n1: '${1}'\n";
        print "\ncattmp: '${catTmp}'\n";
        print "changeCategory.a: $articleName is already in '$categoryTo'.\n";

        ## It's generally OK to merge it in, so don't do this:
        # die "changeCategory.a: $articleName is already in '$categoryTo'.\n";
        # return;
    }
    elsif ($text =~ m/^\s*\#REDIRECT/is) {
        print "changeCategory.a: $articleName is a redirect!\n";
        myLog ("changeCategory.a: $articleName is a redirect!\n");
        return;
    }
    else {
        # Convert underscore to spaces; this is human-readable.
        $categoryTo =~ s/_/ /g;
        # Add the category on a new line.
        # NOTE(review): the "[[...]]" link brackets around the appended
        # tag (and the combined category|sortkey form) appear lost in
        # transcription -- as written this appends bare text.
        if ($sortkey eq "") {
            $text .= "\n${categoryTo}";
        }
        else {
            $text .= "\n${sortkey}";
        }
    }

    # --- Post-processing ---
    ($text, $junk) = fixCategoryInterwiki($text);
    postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes");
}

# listToCat: compare the membership of one or more list pages with one
# or more categories.
#
# This function is not yet finished.  Right now it simply compares the
# membership of a given list and a given category.  Eventually, it is
# intended to be used to convert lists into categories.  This is not
# yet authorized behavior.
#
# Arguments: $_[0] = ";"-separated list page names,
#            $_[1] = ";"-separated category names,
#            $_[2] = "yes" to implement (currently only suppresses the
#                    diff banner; nothing is actually converted).
sub listToCat {
    my ($lists, $cats, $list, $cat, $listText, @junk, @articlesInList,
        @articlesInCat, %articlesInCat, $article, $implement);

    $lists = $_[0];
    $cats = $_[1];
    $implement = $_[2];

    if ($implement ne "yes") {
        print "Diffing membership of '$lists' and '$cats'\n";
    }

    # Gather wiki links from each list page.
    foreach $list (split(";", $lists)) {
        # Strip surrounding [[...]] (and a leading ':') from the name.
        $list =~ s/^\[\[:?//;
        $list =~ s/\]\]$//;

        ($listText, @junk) = getPage($list);
        # NOTE(review): the two substitutions below appear to have lost
        # their delimiting markup (probably <nowiki>/HTML-comment tags)
        # in transcription; as written they delete " .*? " spans.
        # Confirm against the original source.
        $listText =~ s% .*? %%gis;
        $listText =~ s% .*? %%gis;

        @articlesInList = (@articlesInList, $listText =~ m%\[\[(.*?)\]\]%sg);
        sleep 1;   # be polite to the server
    }

    # Gather members of each category.
    foreach $cat (split(";", $cats)) {
        $cat =~ s/^\[\[:?//;
        $cat =~ s/\]\]$//;
        $cat =~ s/^:Category/Category/;

        @articlesInCat = (@articlesInCat, getCategoryArticles($cat));
        sleep 1;
    }

    # Index category members by decoded name.
    foreach $article (@articlesInCat) {
        $article = urlDecode ($article);
        $articlesInCat{$article} = 1;
        # print "In cat: $article\n";
    }

    # Cross off list members found in the categories; report the rest.
    foreach $article (@articlesInList) {
        $article =~ s/\s+/_/gs;   # canonicalize whitespace to underscores
        $article =~ s/\|.*$//;    # strip display text from piped links
        if (exists $articlesInCat{$article}) {
            # print "OK: $article\n";
            delete $articlesInCat{$article};
        }
        else {
            print "Only in list(s): $article\n";
        }
    }

    # Whatever survived the loop above is in a category but no list.
    foreach $article (sort(keys(%articlesInCat))) {
        print "Only in cat(s): $article\n";
    }
}

# shellfix: paranoia filter for strings interpolated into shell
# commands.  Backslash-escapes certain shell metacharacters, then dies
# if any character outside a whitelist remains.
#
# Arguments: $_[0] - the candidate string
# Returns:   the (possibly escaped) string; dies if unsafe characters
#            are present.
sub shellfix {
    my ($string, $stringTmp);

    $string = $_[0];
    # NOTE(review): as written this escapes a metacharacter only when it
    # is immediately followed by literal "' -- the quote characters were
    # probably meant to be inside the character class (i.e. also escaped
    # characters).  Confirm against the original source.
    $string =~ s/([\*\?\!\(\)\&\>\<])\"\'/\\$1/g;

    # Delete every whitelisted character; anything left over is unsafe.
    # (The lone "Å" at the start of the class looks like a transcription
    # or encoding artifact -- verify.)
    $stringTmp = $string;
    $stringTmp =~ s/[Å\p{IsWord}[:alpha:][:digit:]\*,:_.\'\"\)\(\?\-\/\&\>\<\!]//g;

    if ($stringTmp ne "") {
        die ("\nUnsafe character(s) in '${string}': '$stringTmp'\n");
    }

    return $string;
}

# enforceCategoryRedirects: report articles still filed under redirected
# categories, and where they should move to.
#
# You will not be able to use this function; it requires a dataset
# processed by scripts which have not been included.  (It's not
# finished, anyway -- it only prints proposed moves.)
sub enforceCategoryRedirects {
    my ($implementActually, $line, $lineTmp, $articlesToMove,
        $article, $flatResults, $entry, $contents, $catTo, $lineTmp2);

    $implementActually = $_[0];

    # Every category marked as a category redirect, from the offline dump.
    $flatResults = `cat data/reverse-category-links-sorted.txt | grep ^Category:Wikipedia_category_redirects`;
    foreach $line (split("\n", $flatResults)) {
        $line =~ s/^Category:Wikipedia_category_redirects <\- //;

        $lineTmp = shellfix($line);
        $lineTmp2 = $lineTmp;
        $lineTmp2 =~ s/^Category://;

        if ($line =~ m/^Category/) {
            # Articles still filed under the redirected category.
            # NOTE(review): "type" is the Windows equivalent of "cat",
            # mixed here with Unix-style pipelines above -- verify which
            # platform this was meant to run on.
            $articlesToMove = `type data/reverse-category-links-sorted.txt | grep ^${lineTmp}`;

            if ($articlesToMove eq "") {
                next;
            }
            print "ATM: $articlesToMove\n";

            # Pull the redirect page's stored wikicode from the dump
            # (namespace 14 = Category) to find the target category.
            $entry = `egrep \"^\\([0-9]+,14,'$lineTmp2'\" data/entries-categoryredirect.txt `;
            $entry =~ m/^\([0-9]+,14,'$lineTmp2','(.*?)',/;
            $contents = $1;

            $contents =~ m/\{\{categoryredirect\|(.*?)\}\}/;
            $catTo = $1;
            $catTo = ":Category:".$catTo;
            $catTo =~ s/_/ /g;

            # Human-readable form of the source category for the report.
            $lineTmp = $line;
            $lineTmp =~ s/^Category/:Category/i;
            $lineTmp =~ s/_/ /g;

            # Report only; does not actually move anything yet.
            foreach $article (split("\n", $articlesToMove)) {
                print "ARTICLE: $article\n";
                print "LINE: $line\n";
                $article =~ s/^$line <\- //;
                print "* Move $article from $lineTmp to $catTo\n";
            }
        }
    }
}

# retry: a call to this recursive function handles any retries
# necessary to wait out network or server problems.  It's a bit of a
# hack.
#
# Two calling conventions:
#   retry("success", $opName, $responseTimeSec)
#       -- record a successful call, reset the backoff, and tune the
#          global edit speed limit from observed post response times.
#   retry($opName, @args)
#       -- after a failure: sleep with escalating delays, then re-invoke
#          the named operation with @args and return its result.
#
# Shared globals: $::retryDelay (current backoff step) and
# $::speedLimit (minimum seconds between edits, consumed elsewhere).
sub retry {

    my ($callType, @args, $i, $normalDelay, $firstRetry,
        $secondRetry, $thirdRetry);

    ($callType, @args) = @_;

    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change $normalDelay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################

    # HTTP failures are usually an indication of high server load.
    # The retry settings here are designed to give human editors
    # priority use of the server, by allowing it ample recovering time
    # when load is high.

    # Time to wait before retry on failure, in seconds
    $normalDelay = 10;      # Normal interval between edits is 10 seconds
    $firstRetry = 60;       # First delay on fail is 1 minute
    $secondRetry = 60 * 10; # Second delay on fail is 10 minutes
    $thirdRetry = 60 * 60;  # Third delay on fail is 1 hour

    # SUCCESS CASE
    # e.g. retry ("success", "getPage", "0.23");
    if ($callType eq "success") {
        myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
        $::retryDelay = $normalDelay;

        # Adapt the global edit speed limit to observed post times.
        if ($args[0] eq "postPage") {
            # If the response time is greater than 20 seconds...
            if ($args[1] > 20) {
                print "Wikipedia is very slow. Increasing minimum wait to 10 min...\n";
                myLog("Wikipedia is very slow. Increasing minimum wait to 10 min...\n");
                $::speedLimit = 60 * 10;
            }
            # If the response time is between 10 and 20 seconds...
            elsif ($args[1] > 10) {
                print "Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n";
                myLog("Wikipedia is somewhat slow. Setting minimum wait to 60 sec...\n");
                $::speedLimit = 60;
            }
            # If the response time is less than 10 seconds...
            else {
                if ($::speedLimit > 10) {
                    print "Returning to normal minimum wait time.\n";
                    myLog("Returning to normal minimum wait time.\n");
                    $::speedLimit = 10;
                }
            }
        }
        return;
    }

    # FAILURE CASES -- escalate the backoff one step per consecutive
    # failure, keyed off the previous $::retryDelay value.
    # e.g. retry ("getPage", "George_Washington")
    elsif (($::retryDelay == $normalDelay) or ($::retryDelay == 0)) {
        print "First retry for ".$args[0]."\n";
        myLog("First retry for ".$args[0]."\n");
        $::retryDelay = $firstRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $firstRetry) {
        print "Second retry for ".$args[0]."\n";
        myLog("Second retry for ".$args[0]."\n");
        $::retryDelay = $secondRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $secondRetry) {
        print "Third retry for ".$args[0]."\n";
        myLog("Third retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;
        $::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $thirdRetry) {
        print "Nth retry for ".$args[0]."\n";
        myLog("Nth retry for ".$args[0]."\n");
        $::retryDelay = $thirdRetry;   # backoff caps at one hour
        $::speedLimit = 60 * 10;
    }
    else {
        die ("retry: Internal error - unknown delay factor '".$::retryDelay."'\n");
    }

    # DEFAULT TO FAILURE CASE HANDLING -- visible countdown while sleeping.
    $i = $::retryDelay;
    while ($i >= 0) {
        sleep (1);
        print STDERR "Waiting $i seconds for retry...\r";
        $i--;
    }
    print "                                     \r";

    # DO THE ACTUAL RETRY
    if ($callType eq "getPage") {
        return(getPage(@args));
    }
    elsif ($callType eq "postPage") {
        return(postPage(@args));
    }
    elsif ($callType eq "getCategoryArticles") {
        return(getCategoryArticles(@args));
    }
    elsif ($callType eq "getSubcategories") {
        return(getSubcategories(@args));
    }
    elsif ($callType eq "getURL") {
        return(getURL(@args));
    }
    else {
        myLog ("retry: Unknown callType: $callType\n");
        die ("retry: Unknown callType: $callType\n");
    }
}

# enforceCFD: compare the contents of Category:Categories_for_deletion
# with WP:CFD and its /resolved and /unresolved subpages, reporting any
# category that is in the deletion category but not listed on WP:CFD.
#
# Usage: perl pearle ENFORCE_CFD
#
# Known limitations: broken by recent changes which list all
# nominations on subpages; also only checks the first 200 members of
# the category, due to recent changes which paginate in 200-page
# blocks.
sub enforceCFD {
    my (@subcats, $subcat, $cfd, $editTime, $startTime, $token, $cfdU, $cfdR);

    @subcats = getSubcategories("Category:Categories_for_deletion");

    ($cfd, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion");
    ($cfdU, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/unresolved");
    ($cfdR, $editTime, $startTime, $token) = getPage("Wikipedia:Categories_for_deletion/resolved");

    # Normalize line breaks and underscores to single spaces so that
    # page-name comparisons are whitespace-insensitive.
    $cfd =~ s/[\r\n_]/ /g;
    $cfd =~ s/\s+/ /g;
    $cfdU =~ s/[\r\n_]/ /g;
    $cfdU =~ s/\s+/ /g;
    $cfdR =~ s/[\r\n_]/ /g;
    $cfdR =~ s/\s+/ /g;

    foreach $subcat (@subcats) {
        $subcat =~ s/[\r\n_]/ /g;
        $subcat =~ s/\s+/ /g;
        $subcat = urlDecode ($subcat);

        # Fixed: quote the category name with \Q...\E.  Names may
        # contain regex metacharacters ('(', ')', '.', '?'), which were
        # previously interpolated as live regex syntax and could
        # mismatch or abort with an invalid pattern.
        unless ($cfd =~ m/\Q$subcat\E/) {
            print "$subcat is not in WP:CFD";
            if ($cfdR =~ m/\Q$subcat\E/) {
                print " (listed on /resolved)";
            }
            if ($cfdU =~ m/\Q$subcat\E/) {
                print " (listed on /unresolved)";
            }
            print "\n";
        }
    }
}

# addCatToText: internal helper that inserts a category tag into a
# page's wikicode, unless the page already carries it or is a redirect.
#
# Arguments: $_[0] = category ("Category:Name"), $_[1] = page text,
#            $_[2] = sort key (accepted but not used here),
#            $_[3] = article name (for log messages only).
# Returns ("success", $newText) on insertion, or ("fail", $text) when
# the tag is already present or the page is a redirect.  Dies on a
# badly formatted category name.
sub addCatToText {
    my ($category, $text, $sortkey, $articleName) = @_;

    unless ($category =~ m/^Category:\w+/) {
        myLog ("addCatToText: Bad format on category.\n");
        die ("addCatToText: Bad format on category.\n");
    }

    # Build a pattern matching the tag as it may appear in wikicode:
    # optional whitespace after "Category:", underscores and spaces
    # interchangeable, and regex metacharacters escaped.
    my $pattern = $category;
    $pattern =~ s{Category:\s*}{Category:\\s*}g;
    $pattern =~ s{_}{[_ ]}g;
    $pattern =~ s{\(}{\\\(}g;
    $pattern =~ s{\)}{\\\)}g;
    $pattern =~ s{\.}{\\\.}g;

    # Already tagged (with or without a sort key)?  Report and bail.
    if (($text =~ m/(\[\[\s*${pattern}\s*\|.*?\]\])/is)
        or ($text =~ m/(\[\[\s*${pattern}\s*\]\])/is)) {
        print "addCatToText: $articleName is already in '$category'.\n";
        myLog ("addCatToText: $articleName is already in '$category'.\n");
        print "\n1: '${1}'\n";
        print "\ncattmp: '${pattern}'\n";
        return("fail", $text);
    }

    # Never tag a redirect page.
    if ($text =~ m/^\s*\#REDIRECT/is) {
        print "addCatToText: $articleName is a redirect!\n";
        myLog ("addCatToText: $articleName is a redirect!\n");
        return("fail", $text);
    }

    # Convert underscores to spaces; this is the human-readable form.
    $category =~ s/_/ /g;

    # Append the category on its own line, then let fixCategoryInterwiki
    # move it to its conventional position.
    $text .= "\n$category";
    my ($tidied, $ignored) = fixCategoryInterwiki($text);
    return ("success", $tidied);
}


### THIS ROUTINE IS CURRENTLY UNUSED ###
#
# It will probably not be useful to you, anyway, since it requires
# pre-processed database dumps which are not included in Whobot.

# getPageOffline: fetch a page's wikicode from a pre-processed offline
# database dump instead of the live wiki.
#
# Arguments: $_[0] - page title (spaces or underscores)
# Returns:   ($text, "junk") -- the second element stands in for the
#            metadata that the online getPage returns.
sub getPageOffline {
    my ($target, $result, $targetTmp);

    $target = $_[0];

    # Must run the following before using this function, from 200YMMDD/data:
    # cat entries.txt | perl ../../scripts/rewrite-entries.pl > entries-simple.txt
    # Even after this pre-processing, this routine is incredibly slow.
    # Set up and use MySQL instead if you care about speed.

    # Titles are stored with underscores, not spaces.
    $target =~ s/\s/_/g;

    # Double escape the tab, once for Perl, once for the shell
    # -P means "treat as Perl regexp" (yay!)
    $targetTmp = shellfix($target);
    $result = `grep -P '^${targetTmp}\\t' /home/beland/wikipedia/20050107/data/matches2.txt`;
    $result =~ s/^${target}\t//;
    #    $result = `grep -P '^${target}\\t' /home/beland/wikipedia/20050107/data/entries-simple.txt`;

    # Stored newlines are escaped as "\n"; restore them.
    $result =~ s/\\n/\n/g;

    return ($result, "junk");
}


# --- CATEGORY AND INTERWIKI STYLE CLEANUP ROUTINES ---

# perl pearle.pl INTERWIKI_LOOP
#
# This command is for remedial cleanup only, and so is probably not
# useful anymore.  This loop takes input of the form
# "ArticleName\tBodyText\n{repeat...}" on STDIN and runs
# enforceCategoryInterwiki on each entry.
sub interwikiLoop {
    my ($article, $text, @junk, $enforceCategoryInterwikiCalls);

    # NOTE(review): the empty "while ()" condition appears to have lost
    # its "<STDIN>" in transcription -- the body reads $_, which
    # "while (<STDIN>)" would set implicitly.  Confirm against the
    # original source.
    while () {
        # Skip blank lines.
        if ($_ =~ m/^\s*$/) {
            next;
        }
        ($article, $text, @junk) = split ("\t", $_);
        # Body text arrives with newlines escaped as "\n"; restore them.
        $text =~ s/\\n/\n/g;
        enforceCategoryInterwiki($article, $text);

        # Progress indicator (overwrites itself via \r).
        $enforceCategoryInterwikiCalls++;
        print STDOUT "\r interwikiLoop iteration ".$enforceCategoryInterwikiCalls;

    }
}


# perl pearle.pl ENFORCE_CATEGORY_INTERWIKI Article_name
#
# This function is for both internal and external use.  From the
# command line, use it to tidy up a live page's category and interwiki
# tags, specifying only the name of the page.  It can also be used by
# interwikiLoop, which supplies the full text on its own.  It will post
# any changes to the live wiki that involve anything more than
# whitespace changes.
#
# This function also does {{msg:...}} -> {{...}} conversion, so that
# the article parsing algorithm can be recycled.

# enforceCategoryInterwiki: tidy up a page's category and interwiki
# tags via fixCategoryInterwiki, and post the result if anything more
# than whitespace changed.
#
# Arguments: $_[0] = article name;
#            $_[1] = optional full page text.  When omitted ("online"
#            mode) the page is fetched with getPage and any fix is
#            posted back; when supplied (as by interwikiLoop), changed
#            pages are appended to a fixme file instead.
sub enforceCategoryInterwiki {

    my ($articleName, $text, $editTime, $startTime, $textOrig, @newLines, $line,
        $textCopy, $textOrigCopy, $message, @junk, $diff, $token,
        $online);

    $articleName = $_[0];
    myLog("enforceCategoryInterwiki($articleName)\n");
    $text = $_[1];

    # No text supplied => online mode: fetch the page ourselves.
    $online = 0;
    if ($text eq "") {
        $online = 1;
        ($text, $editTime, $startTime, $token) = getPage($articleName);
    }

    $textOrig = $text;

    ($text, $message) = fixCategoryInterwiki($text);

    if (substantiallyDifferent($text, $textOrig)) {
        @newLines = split ("\n", $text);
        $textCopy = $text;
        $textOrigCopy = $textOrig;

        # Produce a diff of the change for the log via temp files.
        # NOTE(review): predictable /tmp names keyed on $$ -- fine on a
        # trusted single-user machine, unsafe on a shared one.
        open (ONE, ">/tmp/article1.$$");
        print ONE $textOrig;
        close (ONE);

        open (TWO, ">/tmp/article2.$$");
        print TWO $text;
        close (TWO);

        $diff = `diff /tmp/article1.$$ /tmp/article2.$$`;
        unlink("/tmp/article1.$$");
        unlink("/tmp/article2.$$");

        myLog("*** $articleName - $message\n");
        myLog("*** DIFF FOR $articleName\n");
        myLog($diff);
        if ($online == 0) {
            # Isolate changed files for later runs
            open (FIXME, ">>./fixme.interwiki.txt.$$");
            $text =~ s/\t/\\t/g;
            $text =~ s/\n/\\n/g;
            print FIXME $articleName."\t".$text."\n";
            close (FIXME);
        }

        myLog($articleName." changed by fixCategoryInterwiki: $message\n");
        print STDOUT $articleName." changed by fixCategoryInterwiki: $message\n";

        if ($online == 1) {
            postPage ($articleName, $editTime, $startTime, $token, $text, $message, "yes");
        }
    }
    else {
        print STDOUT "--- No change for ${articleName}.\n";
        myLog ("--- No change for ${articleName}.\n");
        ### TEMPORARY ###
        ### Uncomment this line if you want category changes to
        ### trigger null edits.  This is useful if you have have
        ### changed the category on a template, but due to a bug this
        ### does not actually move member articles until they are
        ### edited.
        # postPage ($articleName, $editTime, $startTime, $token, $textOrig, "null edit", "yes");
        ### TEMPORARY ###
    }
}

# substantiallyDifferent: compare two strings, ignoring all whitespace.
#
# Arguments: $_[0], $_[1] - the two strings
# Returns:   true if they differ by anything more than whitespace,
#            false otherwise.
sub substantiallyDifferent {
    # Fixed: the original used $a and $b, which are sort()'s package
    # globals -- declaring them as lexicals can break any enclosing
    # sort comparator.  Plain lexicals avoid that footgun.
    my ($first, $second) = @_;

    # Operate on local copies (already copies via list assignment), with
    # every whitespace character removed before comparison.
    $first  =~ s/\s//g;
    $second =~ s/\s//g;

    return ($first ne $second);
}

sub fixCategoryInterwiki {
 * 1) Given some wikicode as input, this function will tidy up the
 * 2) category and interwiki links and return the result and a comment
 * 3) suitable for edit summaries.

my ($input, @segmentNames, @segmentContents, $langlist, $i,	$message, $output, $flagForReview, $interwikiBlock,	$categoryBlock, $flagError, $bodyBlock, $contents, $name,	@interwikiNames, @interwikiContents, @categoryNames,	@categoryContents, @bodyNames, @bodyContents, $bodyFlag,	@bottomNames, @bottomContents, @segmentNamesNew,	@segmentContentsNew, $lastContents, @stubContents,	@stubNames, $stubBlock, $msgFlag);

$input = $_[0];

# The algorithm here is complex. The general idea is to split the # page in to segments, each of which is assigned a type, and then # to rearrange, consolidate, and frob the segments as needed.

# Start with one segment that includes the whole page. @::segmentNames = ("bodyText"); @::segmentContents = ($input);

# Recognize and tag certain types of segments. The order of   # processing is very important.

metaTagInterwiki("nowiki", "^(.*?)(\s* .*? \s*)"); metaTagInterwiki("comment", "^(.*?)(<!.*?>\\n?)"); metaTagInterwiki("html", "^(.*?)(<.*?>\\n?)"); metaTagInterwiki("category", "^(.*?)(\\[\\[\\s*Category\\s*:\\s*.*?\\]\\]\\n?)"); $langlist = `type langlist`; $langlist =~ s/^\s*//s; $langlist =~ s/\s*$//s; $langlist =~ s/\n/\|/gs; $langlist .= "|minnan|zh\-cn|zh\-tw|nb"; metaTagInterwiki("interwiki", "^(.*?)(\\[\\[\\s*($langlist)\\s*:\\s*.*?\\]\\]\\n?)"); metaTagInterwiki("tag", "^(.*?)(\{\{.*?\}\})");

# Allow category and interwiki segments to be followed by HTML # comments only (plus any intervening whitespace).

$i = 0; while ($i < @::segmentNames) {	$name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; # ->  conversion if (($name eq "tag") and	   ($contents =~ m/^/)) {	   $msgFlag = 1; $contents =~ s/^//; }

if (($name eq "category") or ($name eq "interwiki")) {	   if (!($contents =~ m/\n/) and ($::segmentNames[$i+1] eq "comment")) {		push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents.$::segmentContents[$i+1]); $i += 2; next; }	   if (!($contents =~ m/\n/) 		and ($::segmentNames[$i+1] eq "bodyText")		and ($::segmentContents[$i+1] =~ m/^\s*$/)		and !($::segmentContents[$i+1] =~ m/^\n$/)		and ($::segmentNames[$i+2] eq "comment")		) {		push (@segmentNamesNew, $name); push (@segmentContentsNew, 		     $contents.$::segmentContents[$i+1].$::segmentContents[$i+2]); $i += 3; next; }
 * 1) DEBUG		print "AAA - ".$contents.$::segmentContents[$i+1]);
 * 1) DEBUG		print "BBB".$contents.$::segmentContents[$i+1].$::segmentContents[$i+2]);

# Consolidate with any following whitespace if (($::segmentNames[$i+1] eq "bodyText")		and ($::segmentContents[$i+1] =~ m/^\s*$/)		) {		push (@segmentNamesNew, $name); push (@segmentContentsNew, 		     $contents.$::segmentContents[$i+1]); $i += 2; next; }	}	push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); $i++; }   # Clean up results @::segmentNames = @segmentNamesNew; @::segmentContents = @segmentContentsNew; @segmentContentsNew = ; @segmentNamesNew = ;

# Move category and interwiki tags that precede the body text (at   # the top of the page) to the bottom of the page.

$bodyFlag = 0; foreach $i (0 ... @::segmentNames-1) {	$name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; if ($bodyFlag == 1) {	   push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); }	elsif (($name eq "category") or ($name eq "interwiki")) {	   push (@bottomNames, $name); push (@bottomContents, $contents); }	else {	   push (@segmentNamesNew, $name); push (@segmentContentsNew, $contents); $bodyFlag = 1; }   }    # Clean up results @::segmentNames = (@segmentNamesNew, @bottomNames); @::segmentContents = (@segmentContentsNew, @bottomContents); @segmentContentsNew = ; @segmentNamesNew = ; @bottomNames = ; @bottomContents = ;

# Starting at the bottom of the page, isolate category, interwiki, # and body text. If categories or interwiki links are mixed with # body text, flag for human review.

### DEBUG ### # foreach $i (0 ... @::segmentNames-1) # {   #  print "---$i ".$::segmentNames[$i]."---\n"; # print "%%%".$::segmentContents[$i]."%%%\n"; # }   ### DEBUG ###

### DEBUG ### #my ($first); #$first = 1; ### DEBUG ###

$bodyFlag = 0; $flagForReview = 0; foreach $i (reverse(0 ... @::segmentNames-1)) {	$name = $::segmentNames[$i]; $contents = $::segmentContents[$i]; if (($name eq "category") and ($bodyFlag == 0)) {	   # Push in reverse @categoryNames = ($name, @categoryNames); @categoryContents = ($contents, @categoryContents); next; }	elsif (($name eq "interwiki") and ($bodyFlag == 0)) {	   # Push in reverse @interwikiNames = ($name, @interwikiNames); @interwikiContents = ($contents, @interwikiContents); next; }	elsif (($bodyFlag == 0)	      and ($name eq "tag") 	       and (($contents =~ m/\{\{[ \w\-]+[\- ]?stub\}\}/) or		    ($contents =~ m/\{\{[ \w\-]+[\- ]?stub\|.*?\}\}/))) {	   ### IF THIS IS A STUB TAG AND WE ARE STILL $bodyFlag == 0, ### THEN ADD THIS TO $stubBlock!

# Canonicalize by making into s/^\{\{\s*msg:(.*?)\}\}/\{\{$1\}\}/i; # Push in reverse @stubNames = ($name, @stubNames); @stubContents = ($contents, @stubContents); next; }	elsif (($name eq "category") or ($name eq "interwiki")) # bodyFlag implicitly == 1 {	   if ($flagForReview == 0) {		$flagForReview = 1; $lastContents =~ s/^\s*//s; $lastContents =~ s/\s*$//s; $flagError = substr ($lastContents, 0, 30); }	   # Drop down to push onto main body stack. }

# Handle this below instead. ## Skip whitespace #if (($contents =~ m/^\s*$/s) and ($bodyFlag == 0)) #{	#   next; #}

# Delete these comments if (($bodyFlag == 0) and ($name == "comment")) {	   if (		($contents =~ m//i) or		($contents =~ m//i) or		($contents =~ m//i) or		($contents =~ m//i) or		($contents =~ m//i) or		($contents =~ m//i)		) {		### DEBUG ### #print STDOUT ("YELP!\n"); #		#foreach $i (0 ... @bodyNames-1) #{		#   print "---$i ".$bodyNames[$i]."---\n"; #   print "%%%".$bodyContents[$i]."%%%\n"; #}		#		#print STDOUT ("END-YELP!"); ### DEBUG ###

next; }	}

# Push onto main body stack (in reverse). @bodyNames = ($name, @bodyNames); @bodyContents = ($contents, @bodyContents); ### DEBUG ### #if (($flagForReview == 1) and ($first == 1)) #{	#   $first = 0; #   print "\@\@\@${lastContents}\#\#\#\n"; #}	### DEBUG ###

# This should let tags mixed in with the category and # interwiki links (not comingled with body text) bubble up. unless (($contents =~ m/^\s*$/s) or ($name eq "tag")) {	   $bodyFlag = 1; }

$lastContents = $contents; }   ### DEBUG ### ### DEBUG ###
 * 1)    foreach $i (0 ... @bodyNames-1)
 * 2)        print "---$i ".$bodyNames[$i]."---\n";
 * 3) 	print "%%%".$bodyContents[$i]."%%%\n";
 * 4)    }
 * 5)    foreach $i (0 ... @categoryNames-1)
 * 6)        print "---$i ".$categoryNames[$i]."---\n";
 * 7) 	print "^^^".$categoryContents[$i]."^^^\n";
 * 8)    }
 * 9)    foreach $i (0 ... @interwikiNames-1)
 * 10)        print "---$i ".$interwikiNames[$i]."---\n";
 * 11) 	print "&&&".$interwikiContents[$i]."&&&\n";
 * 12)    }
 * 1)        print "---$i ".$interwikiNames[$i]."---\n";
 * 2) 	print "&&&".$interwikiContents[$i]."&&&\n";
 * 3)    }

# Assemble body text, category, interwiki, and stub arrays into strings foreach $i (0 ... @bodyNames-1) {	$name = $bodyNames[$i]; $contents = $bodyContents[$i]; $bodyBlock .= $contents; }   foreach $i (0 ... @categoryNames-1) {	$name = $categoryNames[$i]; $contents = $categoryContents[$i]; # Enforce style conventions $contents =~ s/\[\[category\s*:\s*/\[\[Category:/i; # Enforce a single newline at the end of each category line. $contents =~ s/\s*$//; $categoryBlock .= $contents."\n"; }   foreach $i (0 ... @interwikiNames-1) {	$name = $interwikiNames[$i]; $contents = $interwikiContents[$i]; # Canonicalize minnan to zh-min-nan, since that's what's in # the officially distributed langlist. $contents =~ s/^\[\[minnan:/\[\[zh-min-nan:/;

# Canonicalize zh-ch, Chinese (simplified) and zn-tw, Chinese # (traditional) to "zh"; the distinction is being managed # implicitly by software now, not explicitly in wikicode. $contents =~ s/^\[\[zh-cn:/\[\[zh:/g; $contents =~ s/^\[\[zh-tw:/\[\[zh:/g;

# Canonicalize nb to no	$contents =~ s/^\[\[nb:/\[\[no:/g;

# Canonicalize dk to da	$contents =~ s/^\[\[dk:/\[\[da:/g;

# Enforce a single newline at the end of each interwiki line. $contents =~ s/\s*$//; $interwikiBlock .= $contents."\n"; }   foreach $i (0 ... @stubNames-1) {	$name = $stubNames[$i]; $contents = $stubContents[$i]; # Enforce a single newline at the end of each stub line. $contents =~ s/\s*$//; $contents =~ s/^\s*//; $stubBlock .= $contents."\n"; }

# Minimize interblock whitespace $bodyBlock =~ s/^\s*//s; $bodyBlock =~ s/\s*$//s; $categoryBlock =~ s/^\s*//s; $categoryBlock =~ s/\s*$//s; $interwikiBlock =~ s/^\s*//s; $interwikiBlock =~ s/\s*$//s; $stubBlock =~ s/^\s*//s; $stubBlock =~ s/\s*$//s;

# Assemble the three blocks into a single string, flagging for # human review if necessary. $output = ""; if ($bodyBlock ne "") {	$output .= $bodyBlock."\n\n"; }   if (($flagForReview == 1) 	and !($input =~ m/\{\{interwiki-category-check/)	and !($input =~ m/\{\{split/)	and !($input =~ m/\[\[Category:Pages for deletion\]\]/)) {

$output .= "\n\n"; }   if ($categoryBlock ne "") {	$output .= $categoryBlock."\n"; }   if ($interwikiBlock ne "") {	$output .= $interwikiBlock."\n"; }   if ($stubBlock ne "") {	$output .= $stubBlock."\n"; }
 * 1) 	$output .= "\n".$interwikiBlock."\n";

if ($input ne $output) {	$message = "Minor category, interwiki, or template style cleanup"; if ($flagForReview == 1) {	   $message = "Flagged for manual review of category/interwiki style"; }	if ($msgFlag == 1) {	   $message .= ";  ->  conversion for MediaWiki 1.5+ compatibility"; }   }    else {	$message = "No change"; }   return($output, $message); }


 * 1) sub displayInterwiki
 * 2)    my ($i);
 * 3)    ## THIS FUNCTION CANNOT BE CALLED DUE TO SCOPING; YOU MUST MANUALLY
 * 4)    ## COPY THIS TEXT INTO fixCategoryInterwiki.  IT IS ONLY USEFUL
 * 5)    ## FOR DIAGNOSTIC PURPOSES.
 * 6)    foreach $i (0 ... @::segmentNames-1)
 * 7) 	print "---$i ".$::segmentNames[$i]."---\n";
 * 8) 	print "%%%".$::segmentContents[$i]."%%%\n";
 * 9)    }
 * }
 * 1) 	print "%%%".$::segmentContents[$i]."%%%\n";
 * 2)    }
 * }

sub metaTagInterwiki {
    # A subroutine of fixCategoryInterwiki: scans the global segment list
    # and splits each "bodyText" segment wherever $pattern matches, tagging
    # the matched portion with $tag (which indicates its type, e.g.
    # "category" or "interwiki").
    #
    # $pattern must contain two capture groups: $1 = leading body text,
    # $2 = the meta text to be tagged.  Operates on the globals
    # @::segmentNames / @::segmentContents in place.  Sorry for the
    # global variables.
    my ($tag, $pattern, $i, $meta, $body, @segmentNamesNew,
        @segmentContentsNew, $name, $contents, $bodyText);

    $tag     = $_[0];
    $pattern = $_[1];

    foreach $i (0 ... @::segmentNames-1) {
        $name     = $::segmentNames[$i];
        $contents = $::segmentContents[$i];

        # Segments that are not body text pass through unchanged.
        unless ($name eq "bodyText") {
            push (@segmentNamesNew, $name);
            push (@segmentContentsNew, $contents);
            next;
        }

        # Repeatedly peel "body text followed by meta text" off the front
        # of this segment until the pattern no longer matches.
        while (1) {
            if ($contents =~ m%$pattern%is) {
                $bodyText = $1;
                $meta     = $2;
                if ($bodyText ne "") {
                    push (@segmentNamesNew, "bodyText");
                    push (@segmentContentsNew, $bodyText);
                }
                push (@segmentNamesNew, $tag);
                push (@segmentContentsNew, $meta);
                # Remove the consumed text and rescan the remainder.
                $contents =~ s/\Q${bodyText}${meta}\E//s;
            }
            else {
                # No more matches; keep whatever body text remains.
                if ($contents ne "") {
                    push (@segmentNamesNew, $name);
                    push (@segmentContentsNew, $contents);
                }
                last;
            }
        }
    }

    @::segmentNames    = @segmentNamesNew;
    @::segmentContents = @segmentContentsNew;
    # BUG FIX: these two assignments read "@segmentContentsNew = ;" in the
    # previous copy, which is a syntax error (the empty list was lost).
    # Clearing the lexicals here is redundant but harmless.
    @segmentContentsNew = ();
    @segmentNamesNew    = ();
}

sub nullEdit {
    # Perform a "null edit": fetch an article and immediately save it back
    # unchanged, which forces MediaWiki to refresh its cached metadata.
    my ($page) = @_;

    my $banner = "nullEdit($page)\n";
    print $banner;
    myLog ($banner);

    my ($wikitext, $editStamp, $startStamp, $editToken) = getPage($page);
    postPage ($page, $editStamp, $startStamp, $editToken, $wikitext, "null edit");
}

sub cleanupDate {
    # Walk every article in Category:Wikipedia cleanup (alphabetically) and
    # date-stamp its cleanup tag, skipping non-article namespaces.
    my @pending = sort(getCategoryArticles ("Category:Wikipedia cleanup"));

    foreach my $title (@pending) {
        # Leave project, template, user, and talk pages alone.
        next if ($title =~ m/^Wikipedia:/)
             or ($title =~ m/^Template:/)
             or ($title =~ m/^User:/)
             or ($title =~ m/talk:/i);

        cleanupDateArticle($title);
        limit;
    }
}

sub cleanupDateArticle #($target)
{
    # Determine when the cleanup tag was added to $target by walking its
    # edit history backwards, then replace the undated tag with a dated one
    # ("cleanup-date|Month Year").  Articles carrying a section-specific
    # cleanup template just get a null edit and are otherwise left alone.
    my (@result, $link, $currentMonth, $currentYear, $junk, $line,
        $month, $year, $found, $lineCounter, $target);

    # Section-level cleanup templates: if any is present, the whole-article
    # dating logic does not apply.  (These were previously seven duplicated
    # if-blocks with identical bodies; collapsed into one data-driven loop.)
    my @sectionTags = ("sectionclean", "Sect-Cleanup", "section cleanup",
                       "sectcleanup", "cleanup-section", "cleanup-list",
                       "cleanup-nonsense");

    # Whole-article cleanup templates and their common aliases.
    my @cleanupTags = ("cleanup", "clean", "CU", "cu", "cleanup-quality", "tidy");

    $target = $_[0];
    print "cleanupDateArticle($target)\n";

    # parseHistory returns "Month Year link" strings, newest first.
    @result = parseHistory($target);
    ($currentMonth, $currentYear, $junk) = split(" ", $result[0]);
    $found = "";

    foreach $line (@result) {
        $lineCounter++;
        ($month, $year, $link) = split(" ", $line);

        # Only examine the oldest revision within each month.
        if (($month eq $currentMonth)
            and ($year eq $currentYear)) {
            next;
        }

        # NOTE: a checkForTag("protected", ...) skip used to live here.  It
        # was removed because it produced false positives on articles that
        # were protected at the end of last month but no longer are.  The
        # correct thing to do is to check if an article is CURRENTLY
        # protected by fetching the current version, but this seems like a
        # waste of network resources.

        # Section-specific tags: null-edit and bail out.
        foreach my $sectionTag (@sectionTags) {
            if (checkForTag($sectionTag, $link) eq "yes") {
                print "$target has \n";
                myLog("$target has \n");
                nullEdit($target);
                return;
            }
        }

        # If this revision still carries a cleanup tag, keep walking back.
        # (checkForTag caches the fetched revision, so these repeated calls
        # cost one network fetch per revision.)
        if (grep { checkForTag($_, $link) eq "yes" } @cleanupTags) {
            $currentMonth = $month;
            $currentYear = $year;
            next;
        }
        else {
            # First revision without the tag: the tag was added in the
            # month/year of the newer revision examined just before it.
            $found = "Tag added $currentMonth $currentYear\n";
            last;
        }
    }

    if ($found eq "") {
        # History exhausted without finding a tagless revision.
        if ($lineCounter < 498) {
            # Short history: the tag has been there since the oldest revision.
            $found = "Tag added $currentMonth $currentYear\n";
        }
        else {
            # The 500-revision history page limit was reached; give up loudly.
            myLog("Unable to determine when tag was added to $target.\n");
            die("Unable to determine when tag was added to $target.\n");
        }
    }

    if ($found ne "") {
        # Swap the first matching undated tag for a dated one; if none of
        # the substitutions fire, null-edit so the article leaves the queue.
        changeTag("cleanup", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("tidy", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("CU", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("cu", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("cleanup-quality", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || changeTag("clean", "cleanup-date\|${currentMonth} ${currentYear}", $target)
            || nullEdit($target);
    }
}

sub changeTag {
    # Replace an undated maintenance template in an article with another
    # template, then repost the page.  Returns 1 on success, 0 when the
    # article is skipped (tag absent, or redirect throttle path).
    # NOTE(review): several message strings and substitutions below look
    # truncated -- template names inside double braces appear to have been
    # stripped when this code passed through wiki rendering.  Confirm each
    # flagged spot against an unmangled copy before relying on this sub.
    my ($tagFrom, $tagFromUpper, $tagTo, $tagToUpper, $articleName,
        $editTime, $startTime, $text, $token, $comment, $junk);

    $tagFrom = $_[0];     # "cleanup"
    $tagTo = $_[1];       # "cleanup-date|August 2005"
    $articleName = $_[2]; # Article name

    print "changeTag (${tagFrom}, ${tagTo}, ${articleName})\n";
    myLog("changeTag (${tagFrom}, ${tagTo}, ${articleName})\n");

    # Templates may appear capitalized; check both spellings throughout.
    $tagFromUpper = ucfirst($tagFrom);
    $tagToUpper = ucfirst($tagTo);

    if ($articleName =~ m/^\s*$/) {
        myLog("changeTag: Null target.");
        die("changeTag: Null target.");
    }
    ($text, $editTime, $startTime, $token) = getPage($articleName);
    # Accept the common template invocation forms: {{tag}}, {{msg:tag}},
    # {{Template:tag}}, and {{tag|args}}, in either capitalization.
    unless (($text =~ m/\{\{\s*\Q$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFrom\E\|.*?\s*\}\}/)
            or ($text =~ m/\{\{\s*\Q$tagFromUpper\E\|.*?\s*\}\}/)
            ) {
        # NOTE(review): these messages presumably named the missing template
        # ("{{$tagFrom}} is not in ...") before rendering stripped it.
        myLog("changeTag: is not in $articleName.\n");
        print "changeTag: is not in $articleName.\n";

        # die("changeTag: is not in $articleName.\n"); ### TEMPORARY ###
        # Just skip articles with, , , etc.

        sleep(1); # READ THROTTLE!
        return(0);
    }
    if (($text =~ m/\{\{\s*\Q$tagTo\E\s*\}\}/)
        or ($text =~ m/\{\{\s*\Q$tagToUpper\E\s*\}\}/)) {
        # Destination tag already present -- abort rather than double-tag.
        myLog("changeTag: $articleName already contains .");
        die("changeTag: $articleName already contains .");
    }

    if ($text =~ m/^\s*\#REDIRECT/is) {
        myLog ("changeTag.a: $articleName is a redirect!\n");
        die ("changeTag.a: $articleName is a redirect!\n");
        # NOTE(review): unreachable after die -- presumably a leftover
        # soft-skip path.
        sleep(1); # READ THROTTLE!
        return(0);
    }
    # Escape special characters
    $tagFrom =~ s%\(%\\(%g;
    $tagFrom =~ s%\)%\\)%g;
    $tagFrom =~ s%\'%\\\'%g;
    # We're lazy and don't fully parse the document to properly check
    # for escaped tags, so there may be some unnecssary aborts from
    # the following, but they are rare and easily overridden by
    # manually editing the page in question.
    # NOTE(review): the opening <nowiki>/<pre> literals appear to have been
    # stripped from these two patterns (each begins with a bare space), and
    # $tagFrom was manually backslash-escaped above even though it is used
    # under \Q...\E here -- both look like rendering/legacy artifacts;
    # confirm against an unmangled copy.
    if (($text =~ m/ .*?\Q$tagFrom\E.*?<\/nowiki>/is) or
        ($text =~ m/ .*?\Q$tagFrom\E.*?<\/pre>/is))
    #
    {
        myLog ("changeTag.r: $articleName has a dangerous nowiki or pre tag!\n");
        die ("changeTag.r: $articleName has a dangerous nowiki or pre tag!\n");
    }
    # Make the swap!
    # NOTE(review): the replacement side of every substitution below is
    # empty in this copy; the intended replacement (presumably the dated
    # "{{$tagTo}}" text) appears lost to wiki rendering -- recover it
    # before running this, or the tag will simply be deleted.
    $text =~ s/\{\{\s*\Q$tagFrom\E\s*\}\}//g;
    $text =~ s/\{\{\s*\Q$tagFromUpper\E\s*\}\}//g;
    $text =~ s/\{\{\s*\Qmsg:$tagFrom\E\s*\}\}//g;
    $text =~ s/\{\{\s*\Qmsg:$tagFromUpper\E\s*\}\}//g;
    $text =~ s/\{\{\s*\QTemplate:$tagFrom\E\s*\}\}//g;
    $text =~ s/\{\{\s*\QTemplate:$tagFromUpper\E\s*\}\}//g;
    $text =~ s/\{\{\s*\Q$tagFrom\E\|(.*?)\s*\}\}//g;
    $text =~ s/\{\{\s*\Q$tagFromUpper\E\|(.*?)\s*\}\}//g;

    # Tidy up the article in general
    ($text, $junk) = fixCategoryInterwiki($text);

    # Post the changes
    $comment = "Changing \{\{${tagFrom}\}\} to \{\{${tagTo}\}\}";
    postPage ($articleName, $editTime, $startTime, $token, $text, $comment, "yes");
    return (1);
}

sub parseHistory {
    # Fetch the last 500 history entries for $pageName from the English
    # Wikipedia and return a list of strings of the form "Month Year link",
    # newest first, where link is the href of each revision's date anchor.
    # NOTE(review): several HTML-trimming patterns below look truncated --
    # literal HTML (e.g. '<ul id="pagehistory">' and span markup) appears to
    # have been stripped when this code passed through wiki rendering; the
    # patterns "s%^.*? %%s" and "s% .*?$%%" should be recovered from an
    # unmangled copy before this sub is trusted.
    my ($pageName, $html, @lines, $line, $date, $month, $year,
        $htmlCopy, $link, @result);

    $pageName = $_[0];

    $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500");

    # Kept for debugging; the working copy below is destructively trimmed.
    $htmlCopy = $html;

    # Trim everything outside the history list, then drop list-item tails.
    $html =~ s%^.*? %%s;
    $html =~ s%(.*?)</ul>.*$%$1%s;
    $html =~ s%</li>\s*%%s;

    @lines = split ("</li>", $html);
    foreach $line (@lines) {
        # Flatten each entry to a single line for the regexes below.
        $line =~ s/\n/ /g;

        if ($line =~ m/^\s*$/) {
            next;
        }
        # Strip navigation chrome so the first remaining anchor is the
        # revision's date link.
        $line =~ s% .*?$%%;
        $line =~ s%^.*?Select a newer version for comparison%%;
        $line =~ s%^.*?Select a older version for comparison%%;
        $line =~ s%^.*?name="diff" />%%;

        # First anchor: href is the revision URL, link text is the date.
        $line =~ m%<a href="(.*?)" title="(.*?)">(.*?)</a>%;
        $link = $1;
        $date = $3;

        # Map the abbreviated month in the date text to its full name;
        # an unrecognized month is a parse failure and aborts the run.
        if ($date =~ m/Jan/) { $month = "January"; }
        elsif ($date =~ m/Feb/) { $month = "February"; }
        elsif ($date =~ m/Mar/) { $month = "March"; }
        elsif ($date =~ m/Apr/) { $month = "April"; }
        elsif ($date =~ m/May/) { $month = "May"; }
        elsif ($date =~ m/Jun/) { $month = "June"; }
        elsif ($date =~ m/Jul/) { $month = "July"; }
        elsif ($date =~ m/Aug/) { $month = "August"; }
        elsif ($date =~ m/Sep/) { $month = "September"; }
        elsif ($date =~ m/Oct/) { $month = "October"; }
        elsif ($date =~ m/Nov/) { $month = "November"; }
        elsif ($date =~ m/Dec/) { $month = "December"; }
        else {
            $month = "Unknown month";
            myLog ("Unknown month - parse failure! $line\nHTML:\n$html\n");
            die ("Unknown month - parse failure! (see log) LINE: $line\n");
        }
        # Year is the first four-digit run in the date text.
        $date =~ m/(\d\d\d\d)/;
        $year = $1;

        @result = (@result, "$month $year $link");
    }
    return (@result);
}

sub checkForTag #($tag, $targetURLWithOldIDAttached)
{
    # Report whether the page at $target (a revision URL, possibly relative)
    # contains the bare template {{$tag}} in either its original or its
    # ucfirst capitalization.  Returns the string "yes" or "no".  The most
    # recently fetched page is cached in $::cachedTarget / $::cachedText so
    # repeated checks against the same revision cost one network fetch.
    my ($tag, $target) = @_;

    # Relative links must be made absolute; assuming English Wikipedia here.
    $target =~ s%^/w/index.php%http://en.wikipedia.org/w/index.php%;

    # Decode HTML entities in links.
    $target =~ s/\&amp;/\&/g;

    my $text;
    if ($target eq $::cachedTarget) {
        $text = $::cachedText;
    }
    else {
        $text = getURL ($target."&action=edit");
        $::cachedTarget = $target;
        $::cachedText = $text;
    }

    foreach my $candidate ($tag, ucfirst($tag)) {
        return "yes" if $text =~ m/\{\{\s*\Q$candidate\E\s*\}\}/;
    }

    return "no";
}

sub getURL #($target)
{
    # Fetch a URL with the shared LWP user agent and return the response
    # body.  Throttles every read, reports fetch timing through retry(),
    # and retries on 50X errors; any other HTTP failure is fatal.
    sleep (1); # Read throttle!

    my ($url) = @_;

    # Monitor wiki server responsiveness.
    my $started = Time::HiRes::time;

    print "GET ${url}\n";
    myLog("GET ${url}\n");

    my $response = $::ua->request(HTTP::Request->new(GET => "${url}"));

    unless ($response->is_success) {
        myLog ("getURL: HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n");
        print ("getURL: HTTP ERR (".$response->status_line.") ${url}\n".$response->content."\n");
        # 50X HTTP errors mean there is a problem connecting to the wiki server
        if (($response->status_line =~ m/^500/)
            or ($response->status_line =~ m/^502/)
            or ($response->status_line =~ m/^503/)) {
            return(retry("getURL", @_));
        }
        # Unhandled HTTP response
        die ("getURL: HTTP ERR (".$response->status_line.") ${url}\n");
    }

    # Monitor wiki server responsiveness.
    my $finished = Time::HiRes::time;
    retry ("success", "getURL", sprintf("%.3f", $finished-$started));

    # This may or may not actually work
    $::ua->cookie_jar->save;

    return ($response->content);
}

sub opentaskUpdate {
    # Rotate the featured-task lists (NPOV, copyedit, wikify, merge) on the
    # opentask workspace page, persisting per-article feature counts in a
    # history file between runs.
    my ($target, $historyFile, $opentaskText, $editTime, $startTime,
        $token, $key, $historyDump);

    $target = "User:Beland/workspace";
    $historyFile = "/home/beland/wikipedia/pearle-wisebot/opentask-history.pl";

    ($opentaskText, $editTime, $startTime, $token) = getPage($target);

    # Load %::history from the previous run.
    # SECURITY/PORTABILITY NOTE(review): this shells out to the DOS "type"
    # command and string-evals whatever the file contains.  "do $historyFile"
    # would be portable and avoid the shell, but the runtime platform is
    # unclear (the path above is Unix-style), so this is flagged rather than
    # silently replaced.
    eval(`type $historyFile`);

    $opentaskText = doOpentaskUpdate("NPOV",
                                     "Category:NPOV disputes",
                                     $opentaskText);

    $opentaskText = doOpentaskUpdate("COPYEDIT",
                                     "Category:Wikipedia articles needing copy edit",
                                     $opentaskText);

    $opentaskText = doOpentaskUpdate("WIKIFY",
                                     "Category:Articles that need to be wikified",
                                     $opentaskText);

    $opentaskText = doOpentaskUpdate("MERGE",
                                     "Category:Articles to be merged",
                                     $opentaskText);

    # Dump history: serialize %::history as a Perl snippet for the next run.
    $historyDump = "\%::history = (\n";
    foreach $key (sort(keys(%::history))) {
        $historyDump .= "\"${key}\" => \"".$::history{$key}."\",\n";
    }
    $historyDump =~ s/,\n$//s;
    $historyDump .= "\n)\n";

    # BUG FIX: this was a bareword, two-argument, unchecked open; a failed
    # open would silently discard the history.  Use a lexical handle, the
    # three-argument form, and fail loudly on open/close errors.
    open (my $historyFH, '>', $historyFile)
        or die ("opentaskUpdate: cannot write ${historyFile}: $!\n");
    print $historyFH $historyDump;
    close ($historyFH)
        or die ("opentaskUpdate: cannot close ${historyFile}: $!\n");

    postPage ($target, $editTime, $startTime, $token, $opentaskText,
              "Automatic rotation of NPOV, copyedit, wikify, and merge", "yes");
}

sub doOpentaskUpdate {
    # Pick a random selection of articles from $sourceCategory, up to a
    # character budget, and splice the comma-separated list into the
    # opentask page text for the section identified by $categoryID.
    # Returns the updated page text.  Also increments the global
    # %::history feature counters as a side effect.
    my ($categoryID, $sourceCategory, $opentaskText, @articles,
        $article, %rank, $featuredString, $characterLimit,
        $featuredStringTmp);

    $categoryID = $_[0];
    $sourceCategory = $_[1];
    $opentaskText = $_[2];

    # Maximum combined length of the featured-article list.
    $characterLimit = 100;

    @articles = getCategoryArticles ($sourceCategory);
    # Shuffle and clean up article names; and exclude unwanted entries
    foreach $article (@articles) {
        # Project, template, user, and talk pages are never featured.
        if (($article =~ m/^Wikipedia:/)
            or ($article =~ m/^Template:/)
            or ($article =~ m/^User:/)
            or ($article =~ m/talk:/i)
            ) {
            next;
        }

        $article = urlDecode($article);
        $article =~ s/_/ /g;

        # A random rank implements the shuffle; only kept articles get one.
        $rank{$article} = rand;
    }
    # Pick as many articles as will fit in the space allowed
    # NOTE(review): $featuredString is first used here without being
    # initialized to "" -- works, but warns under "use warnings".
    foreach $article (sort {$rank{$a} <=> $rank {$b}} (keys(%rank))) {
        if (length($article)+1 < $characterLimit - length($featuredString)) {
            $featuredString .= "${article},\n";

            # Record how many times each article is featured.
            $::history{"${article}-${categoryID}"}++;
        }
    }

    # Log the selection on one line.
    $featuredStringTmp = $featuredString;
    $featuredStringTmp =~ s/\n/ /g;
    print "Featuring: $featuredStringTmp\n";
    myLog("Featuring: $featuredStringTmp\n");

    # Insert into actual page text and finish
    # NOTE(review): this substitution references $1 and $2 but the visible
    # pattern (".*?") contains no capture groups -- the original pattern
    # (most likely HTML comment markers delimiting the $categoryID section,
    # stripped by wiki rendering) must be recovered from an unmangled copy
    # before this sub is run.
    $opentaskText =~ s/.*?/${1}\n$featuredString${2}/gs;
    return ($opentaskText);
}