User:HBC Searchbot/Source

See also templates.pl use strict; use LWP::UserAgent; use HTTP::Request::Common; use XML::Simple; use URI::Escape;

# Main script: fetch every revision of the talk page, look for each warning
# template's text in each revision, and dump a report of the warning lines
# found together with the revisions that contain them.

use Data::Dumper;    # Dumper() is used for the final report

my %revisions = get_complete_history('User talk:HighInBC');

# templates.pl is Perl source evaluated in this lexical scope. It must return
# a true value; presumably it fills %templates (template name => text to look
# for), since nothing else in this script assigns to that hash.
# NOTE(review): string eval runs whatever templates.pl contains -- keep that
# file trusted.
my %templates;
{
    open(my $pl, '<', 'templates.pl')
        or die "Cannot open templates.pl: $!\n";
    my $templates = do { local $/; <$pl> };
    close($pl);
    $templates = '' unless defined $templates;

    my $loaded = eval($templates);
    die "Cannot load templates.pl: "
        . ($@ || "it did not return a true value") . "\n"
        unless $loaded;
}

my $searches = 0;
my %warnings;
warn("Checking " . scalar(keys(%revisions)) . " revisions.\n");

foreach my $revision (sort { $a <=> $b } keys(%revisions)) {
    my $text = $revisions{$revision}{'text'}{'content'};
    $text = '' unless defined $text;

    # Lower-case the revision once instead of once per template.
    my $haystack = lc($text);

    foreach my $template (sort keys(%templates)) {
        $searches++;

        my $needle = lc($templates{$template});
        next unless length($needle);    # an empty needle would match everywhere

        # index() returns -1 when the needle is absent; 0 is a genuine hit at
        # the very start of the text and must not be skipped.
        my $start = index($haystack, $needle);
        next if $start < 0;

        # Take the text from the hit up to the end of that line; when there is
        # no later newline, take everything that remains.
        # NOTE(review): the "- 1" also drops the last character before the
        # newline; kept as found -- confirm whether that is intended.
        my $length = index($text, "\n", $start) - $start - 1;
        $length = length($text) - $start if $length < 0;
        my $string = substr($text, $start, $length);

        $warnings{$string}{'template'} = $template;
        $warnings{$string}{'regex'}    = $templates{$template};
        push(@{ $warnings{$string}{'revisions'} }, $revision);
    }
}

warn "$searches searches performed.\n";
warn Dumper(\%warnings);

# Fetch the complete revision history of a wiki page through Special:Export,
# keeping an on-disk cache so later runs only download what is new.
#
# Arguments:
#   $page - page title, e.g. 'User talk:HighInBC'.
#
# Returns:
#   A hash (as a flat list) mapping revision id => revision record, where each
#   record is the structure XML::Simple produced for one <revision> element.
#
# Side effects:
#   Creates the directory 'cache' if needed, reads 'cache/<escaped title>' when
#   present, and rewrites it (Data::Dumper format) when new revisions arrived.
#   Sleeps between requests.
#
# Dies:
#   When the cache file cannot be opened or written, or when the export keeps
#   returning something other than a <mediawiki> document.
sub get_complete_history {
    use Data::Dumper;    # Dumper() serialises the cache file

    my $page = shift;

    my $batch_size  = 100;    # revisions requested per hit
    my $max_retries = 5;      # consecutive bad responses tolerated
    my $retry_delay = 5;      # seconds to wait before retrying a bad response
    my $export_url  = 'http://en.wikipedia.org/w/index.php?title=Special:Export';

    mkdir('cache') unless (-d('cache'));
    my $fname = 'cache/' . uri_escape($page);

    my %revisions;
    my $offset = '0';

    if (-f($fname)) {
        warn "found $fname in cache, loading\n";
        open(my $in, '<', $fname) or die "Cannot read cache $fname: $!\n";
        my $code = do { local $/; <$in> };
        close($in);

        # The cache is Data::Dumper output ("$VAR1 = {...};"), so it is loaded
        # with a string eval that assigns to the lexical $VAR1 below.
        # NOTE(review): this executes the cache file as Perl -- only safe while
        # the cache directory is trusted.
        my $VAR1;
        eval(defined $code ? $code : '');
        if (ref($VAR1) eq 'HASH') {
            %revisions = %{$VAR1};
            # Resume after the newest revision already held.
            my $cached_ts = _newest_timestamp(\%revisions);
            $offset = $cached_ts if defined $cached_ts;
            warn(scalar(keys(%revisions)) . " loaded from cache.\n");
        }
        else {
            warn "Cache $fname is unreadable, starting fresh.\n";
        }
    }
    else {
        warn "No cache, starting fresh.\n";
    }

    my $ua       = LWP::UserAgent->new('agent' => 'HighInBC warning checker .01b');
    my $total    = 0;    # new revisions gathered during this call
    my $failures = 0;    # consecutive bad responses

    while (1) {
        warn "Downloading $batch_size revisions.\n";
        my $res = $ua->request(
            POST(
                $export_url,
                Content_Type => 'application/x-www-form-urlencoded',
                Content      => [
                    'pages'  => $page,
                    'action' => 'submit',
                    'submit' => 'Export',
                    'limit'  => $batch_size,
                    'offset' => $offset,
                ],
            )
        );

        my $current = $res->content;
        unless (defined $current && $current =~ m|^<mediawiki|) {
            # Back off and give up eventually rather than re-requesting in a
            # tight, endless loop.
            $failures++;
            die "Giving up on Special:Export for '$page' after $failures bad responses\n"
                if $failures > $max_retries;
            warn "Failed somehow, trying again\n";
            sleep($retry_delay);
            next;
        }
        $failures = 0;

        my $xml_data = XMLin($current);
        my $parsed   = (ref($xml_data) eq 'HASH' && ref($xml_data->{'page'}) eq 'HASH')
                     ? $xml_data->{'page'}{'revision'}
                     : undef;

        # Normalise to id => record. A lone revision arrives as the record
        # itself (it still carries its own 'id' field); several revisions
        # arrive as a hash already keyed by revision id.
        my %incoming;
        if (ref($parsed) eq 'HASH') {
            %incoming = exists $parsed->{'id'}
                      ? ($parsed->{'id'} => $parsed)
                      : %{$parsed};
        }

        my $count = 0;    # revisions in this batch not seen before
        foreach my $id (sort { $a <=> $b } keys(%incoming)) {
            unless ($revisions{$id}) {
                $count++;
                $total++;
            }
            $revisions{$id} = $incoming{$id};
        }

        # Move the offset to the newest revision just received, read from the
        # parsed records rather than scraped out of the raw XML text.
        my $batch_ts = _newest_timestamp(\%incoming);
        $offset = $batch_ts if defined $batch_ts;

        warn "Got $count revisions\n";
        last unless $count == $batch_size;

        warn "Still more past $offset to get, waiting 5 seconds between hits\n";
        sleep(5);
    }

    if ($total > 0) {
        warn "Saving cache...\n";
        open(my $out, '>', $fname) or die "Cannot write cache $fname: $!\n";
        print {$out} Dumper(\%revisions);
        close($out) or die "Cannot finish writing cache $fname: $!\n";
        warn "done.\n";
    }

    return %revisions;
}

# Return the 'timestamp' field of the record with the highest revision id in
# the given id => record hash, or nothing when there is no such record.
sub _newest_timestamp {
    my ($revs) = @_;
    my ($newest_id) = sort { $b <=> $a } keys(%{$revs});
    return unless defined $newest_id && ref($revs->{$newest_id}) eq 'HASH';
    return $revs->{$newest_id}{'timestamp'};
}