User:Joe's Olympic Bot/source

''Not finished — a work in progress on the traversal code. Note the unused subroutine at the bottom, which has been lightly tested for grabbing a URL from the london2012.com site's search function.''

use MediaWiki::API; use Encode; use LWP::UserAgent; use utf8;

# Gently pruned from the standard exclusion-compliance code to hardcode $user
# (and drop the usual $opt parameter).  Returns 1 if this bot may edit the
# page whose wikitext is $text, 0 if it may not.  Honors {{nobots}}, {{bots}},
# {{bots|allow=...}} and {{bots|deny=...}}.
# NOTE(review): the wiki rendering of this source had eaten the {{...}}
# template syntax inside the regexes (leaving empty //, which match
# anything); the patterns below are restored from the standard exclusion
# code that the comment says this was pruned from.
sub allowBots {
    my ($text) = @_;

    my $user = "Joe's Olympic Bot";

    # Blanket directives: {{nobots}} forbids all bots, {{bots}} allows all.
    return 0 if $text =~ /\{\{[nN]obots\}\}/;
    return 1 if $text =~ /\{\{[bB]ots\}\}/;

    # {{bots|allow=...}}: permit only the bots named in the list.
    if ($text =~ /\{\{[bB]ots\s*\|\s*allow\s*=\s*(.*?)\s*\}\}/s) {
        return 1 if $1 eq 'all';
        return 0 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots) ? 1 : 0;
    }

    # {{bots|deny=...}}: forbid only the bots named in the list.
    if ($text =~ /\{\{[bB]ots\s*\|\s*deny\s*=\s*(.*?)\s*\}\}/s) {
        return 0 if $1 eq 'all';
        return 1 if $1 eq 'none';
        my @bots = split(/\s*,\s*/, $1);
        return (grep $_ eq $user, @bots) ? 0 : 1;
    }

    # No exclusion template at all: editing is allowed.
    return 1;
}

# Within a single MediaWiki call, we ask the API to make up to 5 attempts,
# 10 s apart, until the worst-case server lag is better than 5 s.
# (The wiki paste had swallowed the newlines here, leaving all of this code
# inside the comment lines; restored to separate statements.)
my $mw = MediaWiki::API->new;
$mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

# Delay/retry parameters
$mw->{config}->{max_lag}         = 5;    # Tell MediaWiki to put us off if there's a 5s+ db lag out there
$mw->{config}->{max_lag_delay}   = 10;   # ..and to wait 10s between retries
$mw->{config}->{max_lag_retries} = 4;    # ..and to only make 4 retries before dropping back to our code

# Our own delay parameters (note: $standardelay is not referenced anywhere
# in the visible source -- presumably intended for future use).
$standardelay = 15;
$longdelay    = 900;   # ...if the API puts us off several times in a row, take a 15-minute break

# Holds the most recent categorymembers result from getlista().
# BUG FIX: the original said "= null" -- a bareword, i.e. the *true* string
# "null" under no strict, which made later "if ($articles)" checks pass
# spuriously.  undef is what was intended.
my $articles = undef;

# login -- retry forever on replication-lag errors, die on anything else.
# (The "while (1) {" had been swallowed into this comment line by the wiki
# paste; restored.)
while (1) {
    if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) {
        last;
    }

    if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
        sleep $longdelay;
    }
    else {
        die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
    }
}

# Global: number of london2012.com profile lookups made so far (used by the
# "die if ($profilesfound > N)" safety valves in dsa()).
$profilesfound = 0;

# Kick off the category traversal.
# BUG FIX: parentheses are required -- a bare "getsubd;" before the sub has
# been declared is parsed as a useless bareword, not a call, so the bot
# would never actually run.
getsubd();

# Fetch the subcategories of Category:Competitors at the 2012 Summer
# Olympics and hand each one, by title, to getlista().
sub getsubd {

    # Keep asking until the API returns a result; nap on lag errors.
    while (1) {
        $sdirs = $mw->list( {
            action  => 'query',
            list    => 'categorymembers',
            cmtitle => 'Category:Competitors at the 2012 Summer Olympics',
            cmlimit => "max",
        } );

        # BUG FIX: the original tested "if ($articles)" -- a different
        # global, not the variable this loop just assigned -- so a failed
        # call could slip through.  Test $sdirs, the actual result.
        if ($sdirs) { last; }

        if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
            sleep $longdelay;
        }
        else {
            die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
        }
    }

    foreach (@{$sdirs}) {
        $sdirname = $_->{title};
        print "########### $sdirname\n";
        getlista($sdirname);
    }
}

# Walk one subcategory ($cata): fetch its member articles and process each
# batch through the dsa() hook.  Categories already cleaned by hand are
# skipped up front.
sub getlista {

    my ($cata) = $_[0];

    # skip directories cleaned by hand already, why bother?
    return if ($cata =~ m/Beach volleyball /);
    return if ($cata =~ m/Volleyball /);
    return if ($cata =~ m/Badminton /);
    return if ($cata =~ m/Divers /);
    return if ($cata =~ m/Wrestlers /);
    return if ($cata =~ m/Triathletes /);
    return if ($cata =~ m/Archers /);
    return if ($cata =~ m/Tennis /);
    return if ($cata =~ m/Taekwondo /);
    return if ($cata =~ m/Footballers /);
    return if ($cata =~ m/Handball /);

    # Get list of articles.  (BUG FIX: the wiki paste had swallowed the
    # "while (1) { $articles = $mw->list(..." into the comment line above,
    # commenting the whole retrieval out; restored.)
    while (1) {
        $articles = $mw->list(
            {
                action  => 'query',
                list    => 'categorymembers',
                cmtitle => $cata,
                cmlimit => "max",
            },
            { hook => \&dsa },   # dsa() is called once per result batch
        );

        if ($articles) { last; }

        if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
            sleep $longdelay;
        }
        else {
            die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
        }
    }
}

# Hook invoked by MediaWiki::API for each batch of category members ($xyz is
# an arrayref of page records).  For each mainspace article: honor bot
# exclusion, restrict to Category:Living people, skip pages that already
# carry a reliable athletics-database template, then classify the remaining
# pages' london2012.com references as missing (UNREF) or bare/broken
# (BADREF) and attempt to repair the broken ones via $mw->edit.
sub dsa {

    my ($xyz) = $_[0];

    # scan through the articles...  (BUG FIX: the wiki paste had swallowed
    # the foreach into the comment line; restored.)
    foreach (@{$xyz}) {

        my $thistitle = $_->{title};
        $listcount++;

        next if ($thistitle =~ m/^User:/);
        next if ($thistitle =~ m/^Category:/);

        # BUG FIX: $pagehash must be declared OUTSIDE the retry loop; in the
        # original it was a loop-scoped lexical, so the allowBots() call
        # below saw an unrelated (undef) global instead of the page.
        my $pagehash;
        while (1) {
            $pagehash = $mw->get_page( { title => $thistitle } );
            if ($pagehash) { last; }

            if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) {
                sleep $longdelay;
            }
            else {
                die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
            }
        }

        if (allowBots($pagehash->{'*'})) {
            my $ref = $mw->get_page( { title => $thistitle } );
            $atext = decode_utf8( $ref->{'*'} );
            my $timestamp = $ref->{'timestamp'};

            # There are a couple articles which are not individual athletes
            # but in these categories.  Restrict to living people.
            if (!($atext =~ m/Category:Living people/)) {
                print "NOTLIVING: " . encode("iso-8859-1", $thistitle) . "\n";
                next;
            }

            # Blow out if we see any of the usual reliable athletic site templates
            next if ($atext =~ /\{iaaf name\|/i);
            next if ($atext =~ /\{Cycling archives/i);
            next if ($atext =~ /\{[Ff]ig\|/);
            next if ($atext =~ /\{Sports.reference\|/i);
            next if ($atext =~ /\{FISA\|/i);
            next if ($atext =~ /\{Swimming Australia name\|/i);
            next if ($atext =~ /\{cyclingwebsite\|/i);
            next if ($atext =~ /\{ATP\|/i);
            next if ($atext =~ /\{WTA\|/i);
            next if ($atext =~ /\{ITF female profile\|/i);
            next if ($atext =~ /\{FIFA player\|/i);

            $refcount     = 0;
            $goodrefcount = 0;

            # Scan every external link; anything that is not a bare link to
            # the london2012.com front page counts as a "good" reference.
            while ($atext =~ m/http(s?):([^ \<\|\'\"]*)/g) {
                $xurl = $&;
                $refcount++;
                # print "URL: (" . $xurl . ")\n";

                if (!($xurl =~ m/london2012/ig)) {
                    $goodrefcount++;
                }
                elsif (!($xurl =~ m#2012\.com(\/)?$#)) {
                    $goodrefcount++;
                }
            }

            if ($goodrefcount == 0) {
                if ($refcount == 0) {
                    print "UNREF: " . encode("iso-8859-1", $thistitle) . "\n";

                    $ret = findolympian($thistitle);
                    print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n";
                    sleep 5;
                    die if ($profilesfound > 5);   # safety valve while testing
                }
                else {
                    print "BADREF: " . encode("iso-8859-1", $thistitle) . "\n";

                    $ret = findolympian($thistitle);
                    print " ->RESULT: " . encode("iso-8859-1", $ret) . "\n";

                    # Compare the article title (sans " (disambiguator)",
                    # and with a few diacritics folded to ASCII) against the
                    # $londontitle global that findolympian() just set.
                    $striptitle = $thistitle;
                    if ($striptitle =~ m/([^(]+) \(/) { $striptitle = $1; }
                    $uastriptitle = $striptitle;
                    $uastriptitle =~ tr/ăäóéí/aaoei/;

                    if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) {
                        print "GOFIX " . $striptitle . " as it doesn't match " . $londontitle . "\n";
                    }
                    # NOTE(review): the two patterns below and the empty
                    # decode_utf8("") replacement text were mangled when this
                    # source was pasted onto the wiki (the <ref>...</ref>
                    # markup was eaten).  They are kept byte-identical here
                    # and MUST be reconstructed before this branch is run.
                    elsif ($atext =~ m|\\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) {
                        $revtext = decode_utf8($`) . decode_utf8("") . decode_utf8($');
                        $mw->edit( {
                            action        => 'edit',
                            summary       => "Joe's Olympic Bot: Correcting reference.",
                            basetimestamp => $timestamp,   # to avoid edit conflicts (duplicate key removed)
                            title         => $thistitle,
                            text          => $revtext,
                        } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
                    }
                    elsif ($atext =~ m|\http:\/\/www.london2012.com(\/)?\<\/ref\>|) {
                        $revtext = decode_utf8($`) . decode_utf8("") . decode_utf8($');
                        $mw->edit( {
                            action        => 'edit',
                            summary       => "Joe's Olympic Bot: Correcting reference.",
                            basetimestamp => $timestamp,   # to avoid edit conflicts (duplicate key removed)
                            title         => $thistitle,
                            text          => $revtext,
                        } ) || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};
                    }
                    else {
                        print "ACPHT\n";
                    }

                    sleep 5;
                    die if ($profilesfound > 3);   # safety valve while testing
                }
            }
            # BUG FIX: a stray "break;" stood here.  "break" is not a Perl
            # keyword outside given/when and would die at runtime when
            # reached; removed -- control simply falls through to the next
            # article, which is what the surrounding flow already does.
        }
        else {
            print "….DENIED\n";
        }
    }
}

# Look up an athlete on london2012.com's search page by Wikipedia article
# title.  Sets the globals $finishedurl (the athlete's profile URL, or
# undef) and $londontitle (the athlete's name as rendered on london2012.com,
# or undef); returns the profile URL (UTF-8 decoded).
sub findolympian {

    my ($olympian) = $_[0];

    # Drop any " (disambiguator)" suffix from the title and crudely
    # URL-encode spaces as '+'.
    my @o = split('\(', $olympian);
    $olympian = $o[0];
    $olympian =~ tr/ /+/;

    $u = 'http://www.london2012.com/search/index.htmx?q=' . $olympian;

    # (BUG FIX: wiki-markup residue "* 1)" preceded this print in the pasted
    # source, which is not valid Perl; removed.)
    print "URL: " . $u . "\n";

    $profilesfound++;

    # Masquerade as an ordinary desktop browser when fetching the page.
    my $ua = LWP::UserAgent->new;
    $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0');
    $ua->default_header('Accept-Language' => "en-us,en;q=0.5");
    $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    $ua->default_header('Connection' => "keep-alive");

    my $response = $ua->get($u);

    $html = $response->content;

    undef $londontitle;
    undef $finishedurl;
    if (defined($html)) {
        # First athlete link in the search results, if any.
        if ($html =~ m#href=.(\/athlete\/[^\/]+\/)#) {
            $finishedurl = "http://www.london2012.com" . $1;

            # BUG FIX: the original pattern contained an unmatched ')' (a
            # literal "\(" where a capture group open was clearly intended,
            # since $2 is consumed below) -- a fatal regex compile error.
            # NOTE(review): minimal reconstruction; verify the markup shape
            # against an actual london2012.com search result page.
            if ($html =~ m#"name"\>([^<]+)\<\/span\>([^<]+)\<\/span#) {
                $londontitle = decode_utf8($1) . decode_utf8($2);
                $londontitle =~ s/\s+$//;
                print "LONDONTITLE (" . $londontitle . ")\n";
            }
        }
        else {
            print "CANTFINDLINK\n";
            return $finishedurl;   # undef: no athlete link on the results page
        }
    }
    else {
        print "NORETURN\n";
    }

    return Encode::decode_utf8($finishedurl);
}