User:Joe's Olympic Bot/source2

use MediaWiki::API; use Encode; use LWP::UserAgent; use utf8;

# Gently pruned from the standard exclusion code to hardcode $user and $opt sub allowBots { my($text) = @_;

my $user = "Joe's Olympic Bot";

return 0 if $text =~ //; return 1 if $text =~ //; if($text =~ //s){ return 1 if $1 eq 'all'; return 0 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?1:0; }   if($text =~ //s){ return 0 if $1 eq 'all'; return 1 if $1 eq 'none'; my @bots = split(/\s*,\s*/, $1); return (grep $_ eq $user, @bots)?0:1; }   return 1; }

# Within a single MediaWiki call, we ask the API to make up to 5 attempts, 10 s apart, until # the worst-case server lag is better than 5s. my $mw = MediaWiki::API->new; $mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

# Delay/retry parameters $mw->{config}->{max_lag}        = 5;        # Tell MediaWiki to put us off it there's a 5s+ db lag out there $mw->{config}->{max_lag_delay}  = 10;  # ..and to wait 10s between retries $mw->{config}->{max_lag_retries} = 4;   # ..and to only make 4 retries before dropping back to our code

# Our own delay parameters $standardelay     = 15; $longdelay        = 900;  # ...if the API puts us off several times in a row, take a 15-minute break

my $articles = null;

# login while (1) { if ($mw->login( { lgname => "Joe's Olympic Bot", lgpassword => '[REDACTED]' } )) { last; }

if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code}. ': ' . $mw->{error}->{details}; }  }

$profilesfound = 0;

getsubd;

sub getsubd {

while (1) { $sdirs = $mw->list ( {      action => 'query',       list => 'categorymembers',       cmtitle => 'Category:Competitors at the 2012 Summer Olympics',

cmlimit => "max" },

);

if ($articles) { last; }

if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code}. ': ' . $mw->{error}->{details}; }  }

foreach (@{$sdirs}) { $sdirname = $_->{title}; print "########### $sdirname\n"; getlista($sdirname); } }

sub getlista {

my ($cata) = $_[0];

# skip directories cleaned by hand already, why bother?

# Get list of articles while (1) { $articles = $mw->list ( {      action => 'query',       list => 'categorymembers',       cmtitle => $cata,        cmlimit => "max" },

{ hook=> \&dsa	} , );

if ($articles) { last; }

if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code}. ': ' . $mw->{error}->{details}; }  } }

sub dsa {

my ($xyz) = $_[0];

# scan through the articles... foreach (@{$xyz}) {

my $thistitle = $_->{title}; $listcount++;


 * 1) $thistitle = "User:Joe's Olympic Bot/Test";


 * 1)      print  "T: " . encode("iso-8859-1", $thistitle) . "\n";

next if ($thistitle =~ m/^User:/); next if ($thistitle =~ m/^Category:/);

while (1) { my $pagehash = $mw->get_page( { title => $thistitle } ); if ($pagehash) { last; }

if ($mw->{error}->{details} =~ /Server has reported lag above the configure/) { sleep $longdelay; } else { die $mw->{error}->{code}. ': ' . $mw->{error}->{details}; }      }      if (allowBots($pagehash->{'*'})) { my $ref = $mw->get_page( { title => $thistitle } ); $atext = decode_utf8 ( $ref->{'*'} ); my $timestamp = $ref->{'timestamp'};

# There are a couple articles which are not individual athletes but in these categories. Restrict to living people if (!($atext =~ m/Category:Living people/)) { print "NOTLIVING: ". encode("iso-8859-1", $thistitle). "\n"; next; }

if (($atext =~ m|\\[http:\/\/www.london2012.com(\/)?[ '"][^<]+\<\/ref\>|) ||                                    ($atext =~ m|\http:\/\/www.london2012.com(\/)?\<\/ref\>|)) {

print "BADREF: ". encode("iso-8859-1", $thistitle). "\n";

$ret = findolympian($thistitle); print " ->RESULT: ". encode("iso-8859-1", $ret). "\n";

$striptitle = $thistitle;

if ($striptitle =~ m/([^(]+) \(/) { $striptitle = $1; }

$uastriptitle = $striptitle; $uastriptitle =~ tr/ãâăáăäóěéíçćčÁúůřšșĽňńțśžŠ/aaaaaaoeeicccAuurssLnntszS/;

if (($londontitle ne $striptitle) && ($londontitle ne $uastriptitle)) { print "GOFIX ". $striptitle. " as it doesn't match ". $londontitle. "\n"; } else { print "AUTOFIX ". $striptitle. " seems to match ". $londontitle. "\n";

}              }   # there's a bad URL

} else { print "….DENIED\n"; } }

die if (profilesfound > 50); }

sub findolympian {

my ($olympian) = $_[0];

my @o = split('\(', $olympian); $olympian = $o[0];  $olympian =~ tr/ /+/;

$u = 'http://www.london2012.com/search/index.htmx?q='. $olympian;


 * 1) print "URL: " . $u . "\n";

$profilesfound++;

my $ua = LWP::UserAgent->new; $ua->agent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:15.0) Gecko/20100101 Firefox/15.0'); $ua->default_header('Accept-Language' => "en-us,en;q=0.5"); $ua->default_header('Accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); $ua->default_header('Connection' => "keep-alive");

my $response = $ua->get($u);

$html = $response->content; undef $londontitle;

undef $finishedurl; if (defined($html)) { if ($html =~ m#href=.(\/athlete\/[^\/]+\/)#      ) { $finishedurl = "http://www.london2012.com". $1;

if ($html =~ m#"name"\>([^<]+)\<\/span\>\([^<]+)\<\/span#) { $londontitle = decode_utf8($1). decode_utf8($2);
 * 1)  Felismina Cavela

$londontitle =~ s/\s+$//;

print "LONDONTITLE (" . $londontitle . ")\n"; }   } else { print "CANTFINDLINK\n" ; return $finishedurl; } } else { print "NORETURN\n" ; }

return Encode::decode_utf8($finishedurl); }

#                     . decode_utf8($'); #                                     #           $mw->edit( {            #            action => 'edit',            #            summary => "Joe's Olympic Bot: Correcting reference.",            #            basetimestamp => $timestamp, # to avoid edit conflicts            #            bot => 'true',            #            title => $thistitle,            #              basetimestamp => $timestamp, # to avoid edit conflicts            #            text => $revtext   } ) #          || die $mw->{error}->{code}. ': ' . $mw->{error}->{details};
 * 1) if (0) {
 * 2)                     $revtext = decode_utf8( $`)
 * 3)                           . decode_utf8("")
 * 1)                     $revtext = decode_utf8( $`)
 * 2)                           . decode_utf8("")
 * 1)                           . decode_utf8("")
 * }