User:OrphanBot/libPearle2.pl


 * 1) IMPORTANT ###


 * 1) This code is released into the public domain.


 * 1) RECENT CHANGES ###


 * 1) 30 Nov 2005: Created, based off of the  12 Nov 2005 version of Pearle Wisebot
 * 2) 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
 * 3)              Simplified "limit" code, modified to take an optional parameter
 * 4)              Added "config" function as a clean interface to change internal parameters
 * 5)              Modified Wiki-access functions for use with the new "retry" function
 * 6)              Cleanup of boolean config vars to use standard Perl boolean conventions
 * 7) 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
 * 8)              Added support for proxy servers
 * 9)  8 Mar 2006: Added support for getting a user's contributions
 * 10)              Added support for retrieving logs
 * 11)              Separated out some common regex parts into variables
 * 12) 29 Mar 2006: Added protection against Unicode in URLs
 * 13)              Made thrown exceptions consistent
 * 14)              Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
 * 15) 17 May 2005: Improved log retrieval
 * 16) 12 Jul 2007: Added timestamp to information retrieved from logs


 * 1) Errors thrown by this package always begin with a three-digit number
 * 2)     4xx: HTTP client errors
 * 3)     505: Server error: HTTP version not supported
 * 4)     509: Server error: Bandwidth exceeded
 * 5)     900: Unspecified internal error.
 * 6)     901: Library not initialized.  You didn't call Pearle::init before calling this function.
 * 7)     902: Parameter error.  You made a function call, but forgot a mandatory parameter, or provided an invalid one.
 * 8)     920: Unexpected response.  The MediaWiki site returned something unexpected.
 * 9)     921: Unexpected logout.  The MediaWiki site logged us out unexpectedly.
 * 10)     922: Edit conflict.  Someone edited the article while we were.
 * 11)     923: Deleted article conflict.  Someone deleted the article while we were editing.
 * 1)     922: Edit conflict.  Someone edited the article while we were.
 * 2)     923: Deleted article conflict.  Someone deleted the article while we were editing.

package Pearle;

use strict; use warnings;

use Time::HiRes;

use utf8;

use LWP::UserAgent; use HTTP::Cookies; use HTTP::Request::Common qw(POST); use HTML::Entities;

$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)';				# Match and capture a Wikipedia timestamp $Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d';						# Match a Wikipedia timestamp
 * 1) Standard regex parts

$Pearle::regex_pagelink = '';	# Match and capture any page $Pearle::regex_redpagelink = '';	# Match and capture nonexistant pages only $Pearle::regex_bluepagelink = '';				# Match and capture existing pages only $Pearle::regex_pagelink_nc = '';	# Match any page $Pearle::regex_redpagelink_nc = '';	# Match nonexistant pages only $Pearle::regex_bluepagelink_nc = '';				# Match existing pages only
 * 1) 
 * 2) 

@Pearle::namespaces = ("", "Talk", "User", "User talk", "Wikipedia", "Wikipedia talk", "Image", "Image talk", "MediaWiki", "MediaWiki talk", "Template", "Template talk", "Help", "Help talk", "Category", "Category talk");
 * 1) Standard MediaWiki namespaces

$Pearle::logfile = ""; $Pearle::_inited = 0; $Pearle::username = ""; $Pearle::password = ""; $Pearle::speedLimit = 10;	# Seconds to wait by default when limit is called $Pearle::_speedMult = 1;	# Multiplier for default wait time if the wiki is being slow $Pearle::roughMode = 0;		# Ignore most errors $Pearle::nullOK = 0;		# Permit editing non-existent pages $Pearle::sanityCheck = 0;	# Sanity checking on edits $Pearle::silent = 0;		# Silent mode $Pearle::quiet = 0;			# Quiet mode $Pearle::logoutOK = 0;		# Permit editing while logged out $Pearle::proxy = undef;		# Proxy to use

sub init {	$Pearle::username = $_[0] or die("902 No username provided!\n"); $Pearle::password = $_[1] or die("902 No password provided!\n"); $Pearle::logfile = $_[2] or die("902 No logfile name provided!\n"); $Pearle::cookies = $_[3] or die("902 No cookie file provided!\n"); $Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2";
 * 1) This must be the first function from the library called

$Pearle::ua = LWP::UserAgent->new(timeout => 300); $Pearle::ua->agent($Pearle::useragent); $Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1)); $Pearle::ua->cookie_jar->load;

$Pearle::roughMode = "no";

# Hot pipes $| = 1;	$Pearle::_inited = 1; }

sub config {	my %params = @_; $Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit})); $Pearle::roughMode = $params{roughMode} if(defined($params{roughMode})); $Pearle::nullOK = $params{nullOK} if(defined($params{nullOK})); $Pearle::silent = $params{silent} if(defined($params{silent})); $Pearle::quiet = $params{quiet} if(defined($params{quiet})); $Pearle::logfile = $params{logfile} if(defined($params{logfile})); $Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK})); $Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck})); if(exists($params{proxy})) {		if(defined($params{proxy})) {			myPrint("Proxying: $params{proxy}\n"); myLog("Proxying: $params{proxy}\n"); $Pearle::ua->proxy('http', $params{proxy}); $Pearle::proxy = $params{proxy}; }		else {			myPrint("Not proxying\n"); myLog("Not proxying\n"); $Pearle::ua->no_proxy; $Pearle::proxy = undef; }	} }

sub myLog {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

open (LOG, ">>", $Pearle::logfile) || die "900 Could not append to log!"; print LOG $_[0]; close (LOG); }

sub myPrint {	return if($Pearle::silent); return if($Pearle::quiet); print @_; }

sub myErrPrint {	return if($Pearle::silent); return if($Pearle::quiet); print STDERR @_; }

sub limit {	my ($i); $i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult)); $i = 10 if($i < 10);
 * 1) Rate-limiting.  Can be sensibly run even if libPearle isn't initialized

# Rate-limiting to avoid hosing the wiki server # Min 30 sec unmarked # Min 10 sec marked # May be raised by retry if load is heavy

### ATTENTION ### # Increasing the speed of the bot to faster than 1 edit every 10 # seconds violates English Wikipedia rules as of April, 2005, and # will cause your bot to be banned. So don't change $normalDelay # unless you know what you are doing. Other sites may have # similar policies, and you are advised to check before using your # bot at the default speed. #################

while ($i >= 0) {		sleep (1); myErrPrint("Sleeping $i seconds...\r"); $i--; }	myErrPrint("                                  \r"); }

sub login {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my $res = $Pearle::ua->post(		"http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submitlogin",		Content => [			wpName        => $Pearle::username,			wpPassword     => $Pearle::password,			wpRemember     => 1,			wpLoginAttempt => 1		]	);

if( 302 == $res->code ) {		myPrint("Logged in as $Pearle::username\n"); myLog("Logged in as $Pearle::username\n"); # This may or may not actually work $Pearle::ua->cookie_jar->save; return 1; }	else {		myPrint("Login failed\n"); myPrint("Code: ".$res->code."\n"); myLog("Login failed\n"); return 0; } }

sub logout { my $res = $Pearle::ua->post(               "http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogout",        );

return 1; }

sub checkLogin {	my ($reply_text); $reply_text = $_[0]; if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK)) {		# We've lost our identity. myLog ("Wiki server is not recognizing me.\n"); die ("921 Wiki server is not recognizing me.\n"); } }

sub httpRequest {	my ($request, $response, $attemptStartTime, $attemptEndTime); $request = $_[0]; # Since not every server handles UTF-8 in URLs, and LWP doesn't escape them properly, escape every character > 255 $request->uri(unicodeToUrl($request->uri)); $response = $Pearle::ua->request($request);
 * 1) Make an HTTP request, performing basic error checking and handling.  Suitable for use with the "retry" function

# Monitor wiki server responsiveness $attemptStartTime = Time::HiRes::time;

if ($response->is_success or $response->is_redirect) {		return $response } 	else {		myLog ("HTTP ERR (".$response->status_line.")\n".$response->content."\n"); myPrint("HTTP ERR (".$response->status_line.")\n".$response->content."\n"); # 50X HTTP errors mean there is a problem connecting to the wiki server. Can be remedied by waiting and trying again if (500 <= $response->code and 504 >= $response->code) {			die("retry:".$response->status_line); }		else {			# Unhandled HTTP response. Waiting probably won't fix it			die ($response->status_line."\n"); }	}	# Monitor wiki server responsiveness $attemptEndTime = Time::HiRes::time;

if($request->method eq "POST") {		if (($attemptEndTime - $attemptStartTime) > 20) {			$Pearle::_speedMult = 60;

myPrint("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); myLog("Wikipedia is very slow. Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); }

# If the response time is between 10 and 20 seconds... elsif (($attemptEndTime - $attemptStartTime) > 10) {			$Pearle::_speedMult = 6;

myPrint("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); myLog("Wikipedia is somewhat slow. Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n"); }

# If the response time is less than 10 seconds... else {			if ($Pearle::_speedMult != 1) {				$Pearle::_speedMult = 1;

myPrint( "Returning to normal minimum wait time.\n"); myLog("Returning to normal minimum wait time.\n"); }		}	} }

sub getPage {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
 * 1) Check out a page for editing.

my ($target, $request, $response, $reply, $text, $text2,	$editTime, $startTime, $attemptStartTime, $attemptFinishTime,	$token, $targetSafe);

$target = $_[0];

if ($target =~ m/^\s*$/) {		myLog("getPage: Null target."); die("902 getPage: Null target."); }

$targetSafe = $target; $targetSafe =~ s/\&/%26/g; $targetSafe =~ s/\+/%2B/g;

# Create a request-object myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${targetSafe}&action=edit"); $response = startRetry(\&httpRequest, $request);

$reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply);

# Check for blocking if($reply =~ /User is blocked<\/h1>/) {		myLog("Blocked\n"); die("900 Blocked"); }	$reply =~ m%(.*?) %s; $text = $1;

$reply =~ m/value="(\d+)" name="wpEdittime"/; $editTime = $1;

# Added 22 Aug 2005 to correctly handle articles that have # been undeleted $reply =~ m/value="(\d+)" name="wpStarttime"/; $startTime = $1;

# Added 9 Mar 2005 after recent software change. $reply =~ m/value="([^"]+)" name="wpEditToken"/;	$token = $1;	###

if (($text =~ m/^\s*$/)	and !$Pearle::nullOK) {		myLog ("getPage($target): Null text!\n"); myLog ("\n---\n$reply\n---\n"); if ($Pearle::roughMode) {			return; }		else {			die ("920 getPage($target): Null text!\n"); }	}

if (($editTime =~ m/^\s*$/) and !$Pearle::nullOK) {		myLog ("getPage($target): Null time!\n"); myLog("\n---\n$reply\n---\n"); die ("920 getPage($target): Null time!\n"); }

if (($text =~ m/>/) or ($text =~ m/ " ) etc	# This function is from HTML::Entities.	decode_entities($text);

# This may or may not actually work $Pearle::ua->cookie_jar->save;

return ($text, $editTime, $startTime, $token); }

sub postPage {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($request, $response, $pageName, $textToPost, $summaryEntry,	$editTime, $startTime, $actual, $expected, $date, $editToken,	$minor, $pageNameSafe);

$pageName = $_[0]; $editTime = $_[1]; $startTime = $_[2]; $editToken = $_[3]; $textToPost = $_[4]; $summaryEntry = $_[5]; # Max 200 chars! $minor = $_[6];

$summaryEntry = substr($summaryEntry, 0, 200);

if ($pageName eq "") {		myLog ("postPage: Empty pageName.\n"); die ("902 postPage: Empty pageName.\n"); }	if(!defined($minor)) {		die "902 postPage: Not enough parameters!\n"; }

if ($summaryEntry eq "") {		$summaryEntry = "Automated editing."; }

$pageNameSafe = $pageName; $pageNameSafe =~ s/\&/%26/g; $pageNameSafe =~ s/\+/%2B/g;

if ($minor eq "yes") {		$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpMinoredit => "on", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis }	else {		$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageNameSafe}&action=submit", [wpTextbox1 => $textToPost, wpSummary => $summaryEntry, wpSave => "Save page", wpEditToken => $editToken, wpStarttime => $startTime, wpEdittime => $editTime]; # Optional: wpWatchthis, wpMinoredit }

# ---	## If posts are failing, you can uncomment the below to see what ## HTTP request is being made. # myLog($request->as_string); # print $request->as_string;	$::speedLimit = 60 * 10; # print $::ua->request($request)->as_string; # ---

myLog("POSTing..."); myPrint("POSTing..."); # Pass request to the user agent and get a response back $response = startRetry(\&httpRequest, $request); myLog("POSTed.\n"); myPrint("POSTed.\n");

if ($response->content =~ m/Please confirm that really want to recreate this article./) {		myLog ($response->content."\n"); die ("923 Deleted article conflict! See log!"); }

# Check the outcome of the response $response->code; if ($response->code != 302 and $response->code != 200) {		myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n"); myLog ($request->as_string); myLog ("EXPECTED: 302'\n"); myLog (" ACTUAL: '" . $response->status_line . "'\n");

if ($Pearle::roughMode eq "yes") {			return; }		else {			die ("920 postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n"); }	}

$expected = "http://en.wikipedia.org/wiki/${pageName}"; $expected = Pearle::urlEncode($expected);

$actual = $response->headers->header("Location");

if (($expected ne $actual) and ($Pearle::roughMode ne "yes") 	   and !(($actual eq "") and ($response->code == 200))) {		myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n"); myLog ("EXPECTED: '${expected}'\n"); myLog (" ACTUAL: '${actual}'\n"); die ("920 postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n"); }

if ($response->content =~ m/Edit conflict/) {		myLog ("Edit conflict on '$pageName' at '$editTime'!\n"); die ("922 Edit conflict on '$pageName' at '$editTime'!\n"); }	if($Pearle::sanityCheck and $pageName =~ /talk[ _]*:/i)	# Check for accidental creation of a talkpage without a mainpage. Only works with bots using the "monobook" skin. {		# Monobook:Article # Classic: <a href="/w/index.php?title=Kjsahfjrf&amp;action=edit" class="new" title="Kjsahfjrf">View article</a> if($response->content =~ /<li id="ca-nstab-[^"]" class="new">/)		{			myLog ("postPage(${pageName}) - Talkpage without article!\n");			die ("920 postPage(${pageName}) - Talkpage without article!\n");		}	}	if($Pearle::sanityCheck and $pageName =~ /^user[ _]*talk[ _]*:/)	# Check for user talkpage for non-existant user	{		if($response->content !~ /User contributions/)		{			myLog ("postPge(${pageName}) - User talkpage for non-existant user!\n");			die ("920 postPge(${pageName}) - User talkpage for non-existant user!\n");		}	}

$Pearle::ua->cookie_jar->save; return ($response->content); }

sub getCategoryArticles {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
 * 1) Get a list of the names of articles in a given category.

my ($target, $request, $response, $reply, $articles, $article,	@articles, 	$targetSpace, $offset, $numberOfArticles, $url,	@moreArticles);

$target = $_[0]; $offset = $_[1];

# Need both _ and spaces for precise matching later $target =~ s/ /_/g; $targetSpace = $target; $targetSpace =~ s/_/ /g;

unless ($target =~ m/^Category:/) {		myLog ("getCategoryArticles: Are you sure '$target' is a category?\n"); die ("902 getCategoryArticles: Are you sure '$target' is a category?\n"); }

if ($offset eq "") {		$url = "http://en.wikipedia.org/wiki/${target}"; }	else {		$url = "http://en.wikipedia.org/w/index.php?title=${target}&from=${offset}"; }

# Create a request-object if ($offset eq "") {		myPrint("GET ${url}\n"); }	myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = startRetry(\&httpRequest, $request);

$reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply);

$articles = $reply; $articles =~ s%^.*? Articles in category.*? %%s; $articles =~ s% .*?$%%s; @articles = $articles =~ m%<li><a href="/wiki/(.*?)" title=%sg;

if ($reply =~ m/<a\s+href=\"\/w\/index.php\?title=${target}\&amp;from=(.*?)\"\s+title=\"${targetSpace}\">next 200<\/a>/s) {		sleep (1); # Throttle GETs @moreArticles = getCategoryArticles($target, $1); @articles = (@articles, @moreArticles); }

$Pearle::ua->cookie_jar->save;

$numberOfArticles = @articles;

if ($offset eq "") {		myPrint("Got $numberOfArticles articles.\n"); myLog ("Got $numberOfArticles articles.\n"); }	return decodeArray(@articles); }

sub getCategoryImages {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $from, $request, $response, $reply, $images, @images,	$image, %imagesHash);

$target = $_[0]; $from = $_[1];

unless ($target =~ m/^Category:/) {		myLog ("getCategoryImages: Are you sure '$target' is a category?\n"); die ("902 getCategoryImages: Are you sure '$target' is a category?\n"); }

# Create a request-object if(!defined($from))	# Default: Start at the beginning of a category {		myPrint("GET http://en.wikipedia.org/wiki/${target}\n"); myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); }	else	# Start somewhere middle-ish {		myPrint("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n"); myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&from=$from\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}\&from=$from"); }	$response = startRetry(\&httpRequest, $request); $reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply);
 * 1) 	unless ($reply =~ m%<a href="/wiki/User_talk:$Pearle::username">My talk</a>%)
 * 2) 		# We've lost our identity.
 * 3) 		myLog ("Wiki server is not recognizing me (2).\n---\n${reply}\n---\n");
 * 4) 		die ("Wiki server is not recognizing me (2).\n");
 * }
 * }

$images = $reply; $images =~ s/^.*?<table class="gallery"//s; $images =~ s/ .*?$//s; @images = $images =~ m/<a\s+href="\/wiki\/(.*?)"\s+title=\"Image:/g;

@images = grep {$_ =~ /^Image:/} @images;

if($images =~ /&amp;from=([^"]+)" title="Category:[^"]*">next 200/)	{		print "More: $1\n";		@images = (@images, getCategoryImages($target, $1));	}

# Uniqify to prevent duplicates @images = uniquify(@images);

$Pearle::ua->cookie_jar->save; return decodeArray(@images); }

sub getSubcategories {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

my ($target, $request, $response, $reply, $subcats, $subcat,	@subcats, $attemptStartTime, $attemptFinishTime);

$target = $_[0];

unless ($target =~ m/^Category:/) {		myLog ("getSubcategories: Are you sure '$target' is a category?\n"); die ("902 getSubcategories: Are you sure '$target' is a category?\n"); }

# Create a request-object myPrint("GET http://en.wikipedia.org/wiki/${target}\n"); myLog("GET http://en.wikipedia.org/wiki/${target}\n"); $request = HTTP::Request->new(GET => "http://en.wikipedia.org/wiki/${target}"); $response = startRetry(\&httpRequest, $request); $reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply);

$subcats = $reply;

if ($subcats =~ m%^.*? Subcategories (.*?) Pages in category.*? .*?$%s) {		$subcats =~ s%^.*? Subcategories (.*?) Pages in category.*? .*?$%$1%s; }	else {		return ; }

@subcats = $subcats =~ m%<li><a href="/wiki/(.*?)" title=%sg;

$Pearle::ua->cookie_jar->save; return decodeArray(@subcats); }

sub getUserArticles {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
 * 1) Get up to $max most recent articles edited by a user

my ($url, $request, $response, $reply, @contribs,	   $target, $namespace, $max, $offset); $target = $_[0]; $max = $_[1]; $offset = $_[2]; $namespace = namespaceToNumber($_[3]);

# Create a request-object if(defined($namespace)) {		$url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace"; }	else {		$url = "http://en.wikipedia.org/w/index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}"; }

myPrint("GET $url\n"); myLog("GET $url\n"); $request = HTTP::Request->new(GET => "$url"); $response = startRetry(\&httpRequest, $request); $reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply); # Extract the contributions # <li>23:18, 6 March 2006 (<a href="/w/index.php?title=User_talk:OrphanBot&amp;action=history" title="User talk:OrphanBot">	while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g) {		push @contribs, $1; }	# Remove duplicates return @contribs; }
 * 1) 	@contribs = uniquify(@contribs);

sub getLogArticles {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
 * 1) Gets a list of (articles, actor, summary) tuples from the specified log (upload, delete, move, protect)

my ($url, $request, $response, $reply, @articles,	   $log, $max, $offset, $user); $log = $_[0]; $max = $_[1] || 50; $offset = $_[2] || 0; $user = $_[3] || '';

# Create a request-object # http://en.wikipedia.org/w/index.php?title=Special:Log&type=upload&user=&page=&limit=2000&offset=0 $url = "http://en.wikipedia.org/w/index.php?title=Special%3ALog&limit=${max}&offset=${offset}&user=${user}&type=${log}";

myPrint("GET $url\n"); myLog("GET $url\n"); $request = HTTP::Request->new(GET => "$url"); $response = startRetry(\&httpRequest, $request); $reply = $response->content;

# This detects whether or not we're logged in. checkLogin($reply); # Extract the articles #<li>19:55, 7 March 2006 <a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">Jimbo Wales</a> deleted "<a href="/w/index.php?title=Image:Justinsfriends.jpg&amp;action=edit" class="new" title="Image:Justinsfriends.jpg">Image:Justinsfriends.jpg</a>" (blatant copyvio) </li> #<li>19:54, 7 March 2006 <a href="/wiki/User:MrD9" title="User:MrD9">MrD9</a> moved <a href="/w/index.php?title=Statsoft&amp;redirect=no" title="Statsoft">Statsoft</a> to <a href="/wiki/StatSoft" title="StatSoft">StatSoft</a> (<a href="/w/index.php?title=Special:Movepage&amp;wpOldTitle=StatSoft&amp;wpNewTitle=Statsoft&amp;wpReason=revert&amp;wpMovetalk=0" title="Special:Movepage">revert</a>)</li> #<li>19:53, 7 March 2006 <a href="/w/index.php?title=User:Biederman&amp;action=edit" class="new" title="User:Biederman">Biederman</a> uploaded "<a href="/wiki/Image:Rockingham_Raymond_NH.PNG" title="Image:Rockingham Raymond NH.PNG">Image:Rockingham Raymond NH.PNG</a>" (Changed Image:Rockingham_Portsmouth_NH.PNG to highlight Raymond ) </li> #<li>19:31, 7 March 2006 <a href="/wiki/User:Francs2000" title="User:Francs2000">Francs2000</a> protected <a href="/wiki/Manoeuvre.org" title="Manoeuvre.org">Manoeuvre.org</a> ( [edit=sysop:move=sysop]) </li> #<li>19:30, 7 March 2006 <a href="/wiki/User:Tony_Sidaway" title="User:Tony Sidaway">Tony Sidaway</a> unprotected <a href="/wiki/Will_McWhinney" title="Will McWhinney">Will McWhinney</a> (This looks like the protection that time forgot.) </li>

while($reply =~ /<li>($Pearle::regex_timestamp_nc) ${Pearle::regex_pagelink}.*?<\/a> \(${Pearle::regex_pagelink_nc}Talk<\/a> \| ${Pearle::regex_pagelink_nc}contribs<\/a>\) (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s* (.*)<\/span>|)/g) {		my $summary = $3 || ''; push @articles, [$3, $2, $summary, $1]; }	@articles = uniquify_ref1(@articles); return @articles; }
 * 1) 	while($reply =~ /<li>$Pearle::regex_timestamp_nc ${Pearle::regex_pagelink}.*?<\/a> (?:deleted|moved|uploaded|protected|unprotected) "?${Pearle::regex_pagelink}.*?<\/a>"(?:\s* (.*)<\/span>|)/g)

sub Export {	my ($request, $response, $reply, $articles); $articles = join "\n", @_; $request = POST "http://en.wikipedia.org/w/index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1]; $response = startRetry(\&httpRequest, $request); $reply = $response->content;
 * 1) Use the Special:Export interface to get the wikitext of one or more articles

return $reply; }

sub nullEdit {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
 * 1) Do a null edit to an article

my ($text, $articleName, $comment, $editTime, $startTime, $token);

$articleName = $_[0];

myPrint("nullEdit($articleName)\n"); myLog ("nullEdit($articleName)\n");

($text, $editTime, $startTime, $token) = getPage($articleName); unless ($text eq "") {		postPage ($articleName, $editTime, $startTime, $token, $text, "null edit"); } }

sub parseHistory {   my ($pageName, $html, @lines, $line, $date, $hour, $minute, $day, $month, $year,	$htmlCopy, $link, $user, @result);
 * 1) Get the history of an article and parse the first 500 entries into a list of [link day month year] lists

$pageName = $_[0]; $pageName = escapeUrl($pageName); $html = getURL("http://en.wikipedia.org/w/index.php?title=${pageName}&action=history&limit=500");

$htmlCopy = $html;

$html =~ s%^.*? %%s; $html =~ s%(.*?)</ul>.*$%$1%s;

@lines = split ("</li>", $html); foreach $line (@lines) {		$line =~ s/\n/ /g;

if ($line =~ m/^\s*$/) {	   	next; }		($user) = $line =~ / <a href=[^>]*>([^<]*)/; $line =~ s/ .*?$//; $line =~ s/^.*?Select a newer version for comparison//; $line =~ s/^.*?Select a older version for comparison//; $line =~ s/^.*?name="diff" \/>//; $line =~ m%<a href="(.*?)" title="(.*?)">$Pearle::regex_timestamp</a>%; $link = $1; $hour = $3; $minute = $4; $day = $5; $month = $6; $year = $7;

push @result, [$link, $day, $month, $year, $user]; }   return (@result); }

sub getURL #($target) {	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited); # Read throttle! sleep (1);

my ($request, $response, $reply, $url); $url = $_[0];

# Create a request-object myPrint("GET ${url}\n"); myLog("GET ${url}\n"); $request = HTTP::Request->new(GET => "${url}"); $response = startRetry(\&httpRequest, $request);

$reply = $response->content; # This may or may not actually work $Pearle::ua->cookie_jar->save;

return ($reply); }

sub startRetry {	my ($call_fn, @args) = @_; return retry($Pearle::speedLimit, $call_fn, @args); }
 * 1) Retries a given function repeatedly, with an exponential backoff rate
 * 2) The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried

sub retry {	my ($call_fn, @args, $delay, @result, $result); ($delay, $call_fn, @args) = @_; if(wantarray) {		@result = eval{ $call_fn->(@args) }; if($@ =~ /^retry:/i) {			limit($delay); @result = retry($delay * 2, $call_fn, @args); }		elsif($@) {			die; }		return @result; }	else {		$result = eval{ &{$call_fn}(@args) }; if($@ =~ /^retry:/i) {			limit($delay); $result = retry($delay * 2, $call_fn, @args); }		elsif($@) {			die; }		return $result; } }

sub namespaceToNumber {	my $namespace = $_[0]; my $i = 0; my $name; if(defined($namespace)) {		foreach $name (@Pearle::namespaces) {			return $i if(lc($name) eq lc($namespace)); $i++; }	}	else {		return undef; } }

sub numberToNamespace {	my $i = shift; if(defined($i)) {		return $Pearle::namespaces[$i]; }	else {		return undef; } }

sub urlDecode {	my ($input);
 * 1) Translate from HTTP URL encoding to the native character set.

$input = $_[0];

$input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

return ($input); }

sub escapeUrl {	my $input = shift; $input =~ s/%/%25/g; $input =~ s/&/%26/g; $input = unicodeToUrl($input); return $input; }
 * 1) Basic escaping of special characters in a URL

sub unicodeToUrl {	my ($char, $input, $output);; $input = $_[0];
 * 1) URL-escape any high-unicode chars in a string

foreach $char (split("",$input)) {		if(ord($char) > 255) {			$output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF)); # %HH%LL where HHLL is the hex code of $char }		else {			$output .= $char; }	}	return $output; }

sub urlEncode {	my ($char, $input, $output);
 * 1) Translate from the native character set to the Wikipedia HTTP URL encoding.

$input = $_[0];

foreach $char (split("",$input)) {		#	if ($char =~ m/[a-z|A-Z|0-9|\-_\.\!\~\*\'\(\)]/)

# The below exclusions should conform to Wikipedia practice # (possibly non-standard) if ($char =~ m/[a-z|A-Z|0-9|\-_\.\/:]/) {			$output .= $char; }		elsif ($char eq " ") {			$output .= "_"; }		else {			if(ord($char) > 255) {				$output .= uc(sprintf("%%%x%%%x", int(ord($char)/256), ord($char) & 0xFF)); # %HH%LL where HHLL is the hex code of $char }			else {				$output .= uc(sprintf("%%%x", ord($char))); # %HH where HH is the hex code of $char }		}	}

return ($output); }

sub decodeArray {	return map {urlDecode($_)} @_; }

sub uniquify {	my @list = @_; @list = sort @list; my $last = undef; my @new_list; my $item; foreach $item (@list) {		push @new_list, $item if(!defined($last) or ($item ne $last)); $last = $item; }	return @new_list; }
 * 1) Remove duplicates from a list

sub uniquify_ref1 {	my @list = @_; @list = sort {$a->[0] cmp $b->[0]} @list; my $last = undef; my @new_list; my $item;
 * 1) Remove duplicates from a list of array references, grouping on the first subelement

foreach $item (@list) {		push @new_list, $item if(!defined($last) or ($item->[0] ne $last)); $last = $item->[0]; }	return @new_list; }

1;