User:HBC Archive Indexerbot/source

Main code

 * 1) This script is released under the GFDL license, see
 * 2) http://en.wikipedia.org/w/index.php?title=User:HBC Archive Indexerbot/source&action=history
 * 3) for a full list of contributors

# Throttle: seconds to sleep between page writes.
my $write_delay = 5;
# Batch limit for downloads.  NOTE(review): not referenced in the visible
# code (batching below hard-codes 50) - confirm against the full source.
my $download_max = 25;
 * 1) Configuration ###
 * 2) Time to sleep between writes (in seconds)
 * 1) Max pages to download at once

# Template page used when a request does not name a template of its own.
my $default_template = 'User:HBC Archive Indexerbot/default template';
 * 1) Default template

# On-disk cache layout (must exist before the bot runs; checked at startup).
my $cache        = 'cache';
my $wiki_cache   = "$cache/wiki";      # downloaded page revisions
my $obj_cache    = "$cache/obj";       # serialized objects
my $report_cache = "$cache/reports";   # last report written per target
 * 1) Cache paths
 * 1) End Configuration ###

use strict; use warnings; use Data::Dumper; use Date::Parse; use Digest::SHA1 qw(sha1_hex); use Encode qw(decode_utf8 encode_utf8); use HTML::Entities qw(encode_entities); use IO::Socket; use MediaWiki; use POSIX qw(strftime); use Storable; use Time::Duration; use Time::Local; use URI::Escape; use XML::Simple;

# Globals: in-memory log, page counters, and approximate byte totals
# downloaded ($dl) / uploaded ($ul).
# NOTE(review): the wiki dump flattened this section onto one line, leaving
# runtime code after '#' comments; line structure restored here.
my ($log_file, $pages_watched, $pages_downloaded, $pages_attempted, $dl, $ul) = ('', 0, 0, 0, 0, 0);
my $nowiki     = 'nowiki';
my $start_time = undef;

die "Cache directories must be created in advance\n"
    unless (-d $cache && -d $wiki_cache && -d $obj_cache);

# Read the bot password from a file so no password appears in the source code.
open(PASS, 'password');                 # a file with only the password, no carriage return
sysread(PASS, my $password, -s(PASS));
close(PASS);

writelog('Connecting to Wikipedia');
my $c = MediaWiki->new;
$c->setup({
    'bot'  => { 'user' => 'HBC Archive Indexerbot', 'pass' => $password },
    'wiki' => { 'host' => 'en.wikipedia.org', 'path' => 'w' },
}) || die 'Failed to log in';
my $whoami = $c->user;
writelog($whoami . ' connected');
 * 1) Log into Wikipedia                    #
 * 1) Log into Wikipedia                    #

writelog('Gathering jobs');
# Parentheses added: a bare "gather_jobs" is a strict-subs bareword error
# because the sub's definition is compiled later in the file.
my @master_job_list = gather_jobs();
my @post_jobs       = @master_job_list;   # re-used below when writing reports
 * 1) Update cache to modern state          #
 * 1) Update cache to modern state          #

# Bring the on-disk cache up to date: re-fetch changed pages, fetch templates,
# then probe for new archive pages for every job.
writelog(scalar(@master_job_list) . ' jobs found');
writelog('Parsing cache');
my $rh_cache_data = parse_cache();       # parens: sub is defined later in the file
writelog('done');
writelog('Parsing watchlist');
my $rh_modified_data = parse_watchlist();
writelog('done');
download_pages(find_updated());
download_pages(find_templates(@master_job_list));
fetch_pages(@master_job_list);
writelog("$pages_watched pages added to watchlist.");
writelog("$pages_downloaded out of $pages_attempted downloaded.");
 * 1) push                    (@needed,&find_holes);
 * 1) Parse cached data and create reports  #
 * 1) Parse cached data and create reports  #

# For every job: verify the target page grants write permission, render the
# report, skip the upload when nothing changed since last run, otherwise save.
writelog('Creating reports');
foreach my $ra_job (@post_jobs) {
    my $page         = decode_utf8(encode_utf8($ra_job->{'page'}));
    my $dest         = decode_utf8(encode_utf8($ra_job->{'target'}));
    my $dest_escaped = _escape($dest);
    my $mask         = decode_utf8(encode_utf8(join(', ', @{ $ra_job->{'mask'} })));
    my $index_here   = $ra_job->{'indexhere'};

    # The first 1000 bytes of the target must contain the permission marker.
    unless (check_text(1000, $dest_escaped)) {
        writelog('Not writing to ' . $dest . ' as I cannot find permission (sourced from: ' . $page . ')');
        next;
    }

    my $report = create_report($ra_job);
    # Three-arg open with an error check (was an unchecked two-arg open).
    open(REPORT, '>', "$report_cache/$dest_escaped.$$")
        or die "Cannot write report cache file: $!";
    use bytes;
    print REPORT $report;
    close(REPORT);

    # Skip the upload when the new report is identical to the last one saved.
    if (-e "$report_cache/$dest_escaped") {
        my $result = `diff --brief "$report_cache/$dest_escaped" "$report_cache/$dest_escaped.$$"`;
        unless ($result) {
            writelog('No change, skipping ' . encode_entities($dest) . '');
            unlink "$report_cache/$dest_escaped.$$";
            next;
        }
    }

    $c->login;    # refresh the session before each write
    writelog('Writing report at ' . encode_entities($dest) . '');
    my $edit_summary = "Writing index of archives in " . encode_entities($mask)
                     . " due to request from " . encode_entities($page) . " - Bot edit";
    my $result = send_report($dest, $report, $edit_summary);
    if ($result) {
        rename "$report_cache/$dest_escaped.$$", "$report_cache/$dest_escaped";
    } else {
        unlink("$report_cache/$dest_escaped.$$");
    }
}

# Account for the log page itself, then report approximate traffic totals.
# NOTE(review): the wiki dump rendered the byte-count expressions as "NaN";
# restored here as kilobyte figures computed from $dl/$ul - verify wording
# against page history.
$ul += 120 + length($log_file);
writelog(sprintf('Complete, downloaded %d kilobyte(s) and uploaded %d kilobyte(s) (figures approximate)',
                 $dl / 1024, $ul / 1024));
post_log();    # direct call instead of "&post_log" (which would pass current @_)
exit;


 * 1) Subroutines                           #
 * 1) Subroutines                           #

# check_text($bytes, $page) - fetch up to $bytes of the raw wikitext of $page
# over plain HTTP and return true if it contains the blanking-permission
# marker ("HBC Archive Indexerbot can blank this").  Returns 0 when the
# socket cannot be opened.
sub check_text {
    my $bytes = shift;
    my $page  = shift;

    my $host = 'en.wikipedia.org';
    my $path = "/w/index.php?title=$page&action=raw";
    my $sock = IO::Socket::INET->new(    # direct syntax instead of "new Class"
        PeerAddr => $host,
        PeerPort => 80,
        Proto    => 'tcp',
    );
    return 0 unless ($sock);

    # NOTE(review): absolute-URI request line with no Host header is
    # proxy-style HTTP/1.1; kept as the original behaved - verify it works.
    my $header = ('GET http://' . $host . $path . ' HTTP/1.1' . "\r\n"
                . 'User-Agent: HBC Archive Indexerbot 0.9a' . "\r\n\r\n");
    syswrite($sock, $header);

    my ($buf, $done);
    my $content = '';    # initialized: avoids undef-concatenation warnings
    while (!$done) {
        ($done = 1) unless sysread($sock, $buf, $bytes);
        $content .= $buf;
        if ((length($content) >= $bytes)
            || ($content =~ m|!-- HBC Archive Indexerbot can blank this --|)) {
            $done = 1;
        }
    }
    close($sock);
    $dl += length($content);
    return ($content =~ m|!-- HBC Archive Indexerbot can blank this --|);
}

# create_report($ra_job) - render a job's heading index through its template
# into the final report wikitext: lead, header, one (alternating) row per
# heading sorted case-insensitively then by archive path, footer, tail.
# NOTE(review): this line was flattened by the wiki dump and is damaged:
#   - s:(|[|!]{2}):<$nowiki>$1:g has an empty alternation (the literal "{|"
#     was probably stripped) and the "<$nowiki>" insertion lost its closing
#     "</$nowiki>" tag, so as written it matches everywhere and injects
#     unbalanced markup;
#   - "Report generated at by" lost its timestamp/signature markup.
# Recover the original line from the page history before relying on this.
sub create_report { my ($ra_job) = @_; my ($rh_index, $numbered_links) = index_headings($ra_job); my $template = get_template($ra_job->{'template'}); my $report = sprintf("%sThis report has been generated because of a request at %s. It covers the archives that match %s\n Report generated at by \n\n\n",                       $template->{'lead'}, $ra_job->{'page'}, join(', ', @{$ra_job->{'mask'}})); $report .= $template->{'header'}; my $i = 0; foreach my $key (sort {lc($a) cmp lc($b) || $rh_index->{$a}->{'root_path'} cmp $rh_index->{$b}->{'root_path'}} (keys(%{$rh_index}))) { $rh_index->{$key}->{'topic'} =~ s:(|[|!]{2}):<$nowiki>$1:g; my $row = $template->{'row'}; if ($template->{'altrow'}) { unless ($i++ % 2 == 0) { $row = $template->{'altrow'} }   }    foreach ('topic','replies','link','first','last','duration',             'firstepoch','lastepoch','durationsecs') { $row =~ s:%%$_%%:${$rh_index}{$key}{$_}:gi; }   $report .= $row; } $report .= sprintf("%s\n%s", $template->{'footer'}, $template->{'tail'}); return $report; }

# download_pages(@pages) - fetch pages in batches of 50 via Special:Export,
# write each revision into the wiki cache (via handle_chunk), and return the
# list of page names actually received.
sub download_pages {
    my (@pages) = @_;
    return unless @pages;
    my $requests = scalar(@pages);

    my (@received_names);

    while (@pages) {
        my @batch;
        while ((scalar(@batch) < 50) && @pages) {
            my $item = shift(@pages);
            # was "shift(@pages) || last", which aborted on a page named "0"
            last unless defined $item;
            push(@batch, _underscore($item));
        }
        $pages_attempted += scalar(@batch);
        my $xml_code = $c->special_export(@batch);
        $dl += length($xml_code);
        my $xml_result = XMLin($xml_code);
        next unless ($xml_result->{'page'});
        if ($xml_result->{'page'}{'title'}) {
            # A single page: XML::Simple collapses it to one hash.
            push(@received_names, handle_chunk($xml_result->{'page'}));
        } else {
            # Multiple pages: keyed by title.
            foreach my $key (keys %{ $xml_result->{'page'} }) {
                push(@received_names, handle_chunk($xml_result->{'page'}->{$key}));
            }
        }
    }
    writelog('Downloaded ' . scalar(@received_names) . " pages from $requests requests");
    return (@received_names);
}

# fetch_pages(@jobs) - make sure every page a job needs is cached.  For
# numbered masks ("Foo/Archive <#>") it probes ten page numbers past the
# highest cached one, and keeps probing (redo) as long as every probe exists.
# NOTE(review): the wiki dump had corrupted this sub: two "@fetch = ;"
# assignments lost their "()" and a '#' comment swallowed "} else {";
# both restored here.
sub fetch_pages {
    my (@jobs) = @_;

    my (@cache_names) = keys(%$rh_cache_data);
    foreach my $ra_job (@jobs) {
        my @fetch;

        if ($ra_job->{'indexhere'}) {
            my $page = _underscore($ra_job->{'page'});
            push(@fetch, $ra_job->{'page'}) unless (defined($rh_cache_data->{$page}));
        }

        my $fetch_size = 0;
        foreach my $mask (@{ $ra_job->{'mask'} }) {
            if ($mask =~ m|<#>|) {
                $fetch_size += 10;    # probe ten numbers past the highest cached
                my $pattern = _underscore($mask);
                my ($part1, $part2) = split(m|<#>|, $pattern, 2);
                $pattern = qr/\Q$part1\E(\d+)/;
                $pattern .= qr/\Q$part2\E/ if $part2;
                my $leading_zeros = $ra_job->{'leading_zeros'} + 1;
                my $marker = '%d';
                $marker = '%0' . $leading_zeros . 'd' if ($leading_zeros > 1);
                my $printf_pattern = $mask;
                $printf_pattern =~ s|<#>|$marker|;
                my (@mask_pages) = grep(/^$pattern/, @cache_names);
                my $largest = 0;
                foreach my $key (@mask_pages) {
                    ($key =~ m|$pattern|) || next;
                    $largest = $1 if ($1 > $largest);
                }
                my $count = $largest;
                until ($count >= ($largest + $fetch_size)) {
                    $count++;
                    push(@fetch, sprintf($printf_pattern, $count));
                }
                # MONTHLY: elsif here for date-based masks, or whatever is used
            } else {
                my $check = _underscore($mask);
                push(@fetch, $mask) unless (defined($rh_cache_data->{$check}));
            }
        }
        continue {
            if (scalar(@fetch)) {
                my (@received) = download_pages(@fetch);
                $rh_cache_data = parse_cache();
                (@cache_names) = keys(%$rh_cache_data);
                if (scalar(@fetch) == scalar(@received)) {
                    @fetch = ();
                    redo;    # every probed page existed - probe further
                } else {
                    @fetch = ();
                }
            }
            $fetch_size = 0;
        }
    }
}

# find_holes() - placeholder.  This sub will find gaps in the archive
# numbering (maybe a page was deleted then restored) and add them to the
# list of potentially needed pages.  Currently returns nothing.
# NOTE(review): in the wiki dump the '#' comment swallowed the sub body on
# the same physical line; structure restored here.
sub find_holes {
    return;
}

# find_templates(@jobs) - collect the distinct templates used by the jobs
# (mapping 'default' to the default template page) and return those that are
# not yet present in the cache.
sub find_templates {
    my (@jobs) = @_;
    my %seen;
    $seen{ $_->{'template'} }++ for @jobs;
    my @templates_needed;
    foreach my $name (keys %seen) {
        $name = $default_template if $name eq 'default';
        push(@templates_needed, $name)
            unless defined $rh_cache_data->{ _underscore($name) };
    }
    writelog(scalar(@templates_needed) . ' templates needed');
    return @templates_needed;
}

# find_updated() - find cached pages whose watchlist timestamp is newer than
# the cached copy; each stale cache file is unlinked so it gets re-fetched.
# NOTE(review): in the wiki dump a '#' comment swallowed the code following
# it on the same physical line; line structure restored here.
sub find_updated {
    my (@need_update);
    foreach my $page (keys(%{$rh_cache_data})) {
        # If it's not on the watchlist, it hasn't been modified in the past
        # month - ignore it.
        if ($rh_modified_data->{$page}) {
            if ($rh_cache_data->{$page} < ${$rh_modified_data}{$page}) {
                push(@need_update, $page);
                my $fname = ("$wiki_cache/" . uri_escape_utf8($page) . ' ' . $rh_cache_data->{$page});
                unlink($fname);    # remove the outdated cache file
            }
        }
    }
    writelog(scalar(@need_update) . ' pages need updating');
    return @need_update;
}

# gather_jobs() - scrape Special:Whatlinkshere for transclusions of the OptIn
# page, export those pages, parse each request's parameters, and return a
# list of job hashrefs (page/target/mask/leading_zeros/indexhere/template).
sub gather_jobs {
    my (@jobs);
    my $html_list = $c->{ua}->get($c->{index}
        . "?title=Special:Whatlinkshere/User:HBC Archive Indexerbot/OptIn&limit=5000")->content;
    $dl += length($html_list);
    my @targets;
    # NOTE(review): the wiki dump stripped the closing anchor tag from this
    # scraping regex; reconstructed - verify against page history.
    while ($html_list =~ s|>([^<]*?)</a> \(transclusion\)||) {
        push(@targets, $1);
    }
    # Fixed ordering: the original dump took length() of the XMLin() hashref
    # instead of the raw export XML.
    my $xml_source = $c->special_export(@targets);
    $dl += length($xml_source);
    my $xml = XMLin($xml_source);
    my $rh_pages = ${$xml}{'page'};
    my %targets;
    foreach my $key (keys(%{$rh_pages})) {
        my $content = ${$rh_pages}{$key}{'revision'}{'text'}{'content'};
        # NOTE(review): the wiki dump stripped the template invocation from
        # this match; reconstructed as the OptIn transclusion - verify.
        if ($content =~ m"\Q{{User:HBC Archive Indexerbot/OptIn\E(.*?)\Q}}\E"s) {
            my @params = split(/\s*\|\s*/, $1);
            my %job = (page => $rh_pages->{$key}{'title'}, leading_zeros => 0);
            foreach my $param (@params) {
                my ($key, $value) = split(/\s*=\s*/, $param);
                next unless ($key && defined($value));

                # Expand a leading "./" or "/" into a subpage of the request page.
                $value =~ s:^\.?/:$job{'page'}/:;

                if ($key eq 'target') {
                    $job{'target'} = $value;
                } elsif ($key eq 'mask') {
                    next unless $value;
                    push(@{ $job{'mask'} }, $value);
                } elsif ($key =~ /leading_zeroe?s/) {
                    if ($value =~ m/^(\d+)$/) { $job{'leading_zeros'} = $1; }
                } elsif ($key eq 'indexhere') {
                    $job{'indexhere'} = (($value =~ m|ye?s?|i) ? ('1') : ('0'));
                } elsif ($key eq 'template') {
                    $job{'template'} = $value;
                }
            }
            $job{'template'} = 'default' unless $job{'template'};
            $job{'template'} = 'default' if $job{'template'} eq 'template location';

            next unless ($job{'target'} && $job{'mask'});

            # One job per target: the first request wins.
            if ($targets{ $job{'target'} }) {
                writelog("Request on $job{'page'} duplicates target $job{'target'}; skipping");
                next;
            } else {
                $targets{ $job{'target'} }++;
            }

            push(@jobs, \%job);
        }
    }
    return @jobs;
}

# get_template($name) - load a report template from the wiki cache and split
# it into its named sections (lead/header/row/altrow/footer/tail).  Falls
# back to the default template when the named one is missing or lacks a
# 'row' section; dies if the default itself is unusable.
sub get_template {
    my ($template) = (@_);

    if ($template eq 'default') { $template = $default_template; }

    my $tmpl_fn = _escape($template);
    my ($file) = glob("$wiki_cache/$tmpl_fn*");
    unless ($file) {
        if ($template eq $default_template) {
            die "$template missing from cache\n";
        } else {
            return get_template('default');
        }
    }
    open(TMPL, $file);
    my @content = <TMPL>;    # restored: the filehandle read was lost in the wiki dump
    close(TMPL);

    # Restored: the empty-string initializers were stripped by the wiki dump.
    my %template = (lead => '', header => '', row => '', altrow => '',
                    footer => '', tail => '');
    my $section = '';
    foreach my $line (@content) {
        chomp $line;
        # NOTE(review): the section-marker regex was stripped by the wiki
        # dump; reconstructed as an HTML-comment marker line - verify against
        # the live default template.
        if ($line =~ m:^<!--\s*(.+?)\s*-->$:) {
            $section = lc($1);
            $section =~ s/\s+//g;
            last if $section eq 'end';
        } else {
            if ($section) {
                next unless $line;
                $template{$section} .= "$line\n";
            }
        }
    }
    $template{'lead'} .= "\n" if $template{'lead'};

    unless ($template{'row'}) {
        die "Default template missing 'row' parameter!\n" if $template eq $default_template;
        writelog("Invalid template: '$template', using default instead");
        return get_template('default');
    }

    return \%template;
}

# handle_chunk($chunk) - write one exported revision to the wiki cache as
# "<escaped name> <epoch>", watch the page if it is new to the cache, bump
# the download counter, and return the underscored page name.
sub handle_chunk {
    my $chunk = shift;
    my $name  = _underscore(${$chunk}{'title'});
    my $fname = "$wiki_cache/" . uri_escape_utf8($name);
    ${$chunk}{'revision'}{'timestamp'} =~ m|(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})Z|;
    my $time = timegm($6, $5, $4, $3, $2 - 1, $1);
    watch($name) unless (${$rh_cache_data}{$name});
    # Three-arg open with an error check (was an unchecked two-arg open).
    open(OUT, '>', "$fname $time") or die "Cannot write cache file '$fname $time': $!";
    binmode(OUT);
    use bytes;
    print OUT (${$chunk}{'revision'}{'text'}{'content'});
    no bytes;
    close(OUT);
    $pages_downloaded++;
    return $name;
}

# index_headings($ra_job) - scan every cached archive page matching the job's
# masks (plus the request page itself when indexhere is set), and build a
# hash of unique heading keys -> {topic, link, replies, first/last timestamps,
# duration, ...}.  Returns (\%index, $numbered_links); the original dump
# returned only \%index although the caller expects both values.
sub index_headings {
    my ($ra_job) = @_;

    # Build a regex matching cache filenames for any of the job's masks.
    my $mask_re = '';
    foreach my $mask (@{ $ra_job->{'mask'} }) {
        my $mask2 = _escape($mask);
        if ($mask2 =~ m|%3C%23%3E|) {    # URI-escaped "<#>" numeric placeholder
            my ($part1, $part2) = split(m|%3C%23%3E|, $mask2, 2);
            $mask_re .= '(?:';
            $mask_re .= qr/\Q$part1\E\d+/;
            $mask_re .= qr/\Q$part2\E/ if $part2;
            $mask_re .= ')|';
            # MONTHLY: elsif here for date-based masks
        } else {
            $mask_re .= qr/\Q$mask2\E/ . '|';
        }
    }
    chop($mask_re);    # drop the trailing '|'

    opendir(CACHE, $wiki_cache);
    my (@cache) = readdir(CACHE);
    closedir(CACHE);
    my @files = grep(m|^(?:$mask_re)|, @cache);
    if ($ra_job->{'indexhere'}) {
        my $page = _escape($ra_job->{'page'});
        push(@files, grep(m|^\Q$page\E \d+$|, @cache));
    }

    my (%index, %used_headings);
    my $numbered_links = 0;
    foreach my $file (@files) {
        my (%used_names);
        next unless ($file =~ m|^(.*) (\d+)$|);
        my $root_path    = decode_utf8(uri_unescape($1));
        my $display_name = $root_path;
        $display_name =~ s/_/ /g;
        open(WIKI, "$wiki_cache/$file");
        my @content = <WIKI>;    # restored: the filehandle read was lost in the wiki dump
        close(WIKI);
        my $prev_heading = '';
        my ($comment_count, $first, $last) = (0, 0, 0);
        foreach my $line (@content) {
            if ($line =~ m|^==\s*([^=].+?)\s*==|) {
                # Close out the previous heading (shared helper replaces the
                # duplicated stats code the original carried).
                if ($prev_heading && $comment_count > 0) {
                    _record_heading_stats(\%index, $prev_heading, $comment_count, $first, $last);
                    ($comment_count, $first, $last) = (0, 0, 0);
                }
                my $heading = $1;
                my $head_link;
                ($head_link, $numbered_links) = path_fix($heading, $numbered_links);
                $used_names{ lc($head_link) }++;
                my $suffix = (($used_names{ lc($head_link) } > 1)
                              ? ('_' . $used_names{ lc($head_link) })
                              : (''));
                $used_headings{ lc($head_link . $suffix) }++;
                $prev_heading = $head_link . $suffix . '_' . $used_headings{ lc($head_link . $suffix) };
                $index{$prev_heading} = {
                    topic => encode_entities(decode_utf8($heading)),
                    # NOTE(review): the wiki dump stripped the wikilink markup
                    # from this value; reconstructed as an anchor link into
                    # the archive page - verify against page history.
                    link      => ("[[$root_path#$head_link$suffix|$display_name]]"),
                    root_path => $root_path,
                    head_link => $head_link,
                    replies   => 'Unknown',
                    first     => 'Unknown',
                    'last'    => 'Unknown',
                    duration  => 'Unknown',
                    firstepoch => 0, lastepoch => 0, durationsecs => 0,
                };
            } elsif ($line =~ m/\[\[User.*[\]>)}].*?\s+(.*\(UTC\))/) {
                # A signed comment: count it and track earliest/latest times.
                $comment_count++;
                my $time = str2time($1);
                if ($time && (!$first || $time < $first)) {
                    $first = $time;
                }
                if ($time && ($time > $last)) {
                    $last = $time;
                }
            }
        }
        # Close out the final heading of the file.
        if ($prev_heading && $comment_count > 0) {
            _record_heading_stats(\%index, $prev_heading, $comment_count, $first, $last);
        }
    }
    return (\%index, $numbered_links);
}

# _record_heading_stats(\%index, $heading, $replies, $first, $last) - fill in
# the reply count and first/last/duration fields for a finished heading.
sub _record_heading_stats {
    my ($rh_index, $heading, $replies, $first, $last) = @_;
    $rh_index->{$heading}->{'replies'} = $replies;
    if ($first && $last) {
        $rh_index->{$heading}->{'firstepoch'}   = $first;
        $rh_index->{$heading}->{'first'}        = strftime('%F %T', gmtime($first));
        $rh_index->{$heading}->{'lastepoch'}    = $last;
        $rh_index->{$heading}->{'last'}         = strftime('%F %T', gmtime($last));
        $rh_index->{$heading}->{'durationsecs'} = $last - $first;
        if ($replies > 1) {
            $rh_index->{$heading}->{'duration'} = duration($last - $first);
        } else {
            $rh_index->{$heading}->{'duration'} = 'None';
        }
    }
}

# parse_cache() - scan the wiki cache directory and return a hashref mapping
# page name -> cached revision epoch, parsed from "<escaped name> <epoch>"
# filenames.  (Removed the unused @pages/$count declarations.)
sub parse_cache {
    opendir(CACHE, $wiki_cache) or die "Cannot open cache directory '$wiki_cache': $!";
    my (@files) = readdir(CACHE);
    closedir(CACHE);
    my (%cache);
    foreach my $file (@files) {
        next unless ($file =~ m|^(.*) (\d+)$|);
        my $page_name = decode_utf8(uri_unescape($1));
        my $time      = $2;
        $cache{$page_name} = $time;
    }
    return \%cache;
}

# parse_watchlist() - scrape Special:Watchlist and return a hashref mapping
# page name -> epoch of the page's most recent change.
# NOTE(review): in the wiki dump the '# "' comment swallowed the opening
# brace and body that followed on the same physical line; restored here.
sub parse_watchlist {
    my $watchlist = $c->{ua}->get($c->{index} . "?title=Special:Watchlist&days=0")->content;
    $dl += length($watchlist);
    my @lines = split("\n", $watchlist);
    my @date;
    my %watchlist;
    while (scalar(@lines)) {
        my $line = shift(@lines);
        # Date headings group the entries that follow them.
        if ($line =~ m| (\d{4})-(\d{2})-(\d{2}) |i) {
            @date = ($1, $2, $3);
        }
        if ($line =~ m|title="([^"]*?)">hist|i) { # "
            my $page_name = _underscore($1);
            $line =~ m|(\d{2}):(\d{2}):(\d{2})|;
            $watchlist{$page_name} = timegm($3, $2, $1, $date[2], $date[1] - 1, $date[0]);
        }
    }
    return \%watchlist;
}

# path_fix($path, $numbered_links) - convert a section heading into a valid
# URL fragment/anchor: strips bold/italic quotes and HTML tags, resolves
# piped and plain wikilinks to their display text, replaces external links
# with their title (numbering bare external links via $numbered_links),
# replaces whitespace with underscores, then dot-escapes every remaining
# byte outside [/a-z0-9.:_'-] as uppercase ".XX" hex, mirroring MediaWiki
# anchor encoding.  Returns the fixed path and the updated link counter.
# NOTE(review): the substitution s| |.C2.A0|g looks dead (whitespace was
# already replaced) - it probably targeted a non-breaking space that the
# wiki dump rendered as a plain space; verify before editing this line.
sub path_fix { my ($path,$numbered_links) = @_; ($path =~ s|'{2,4}||g); ($path =~ s|<.*?>||g); ($path =~ s/\[\[:?.*?\|(.*?)\]\]/$1/g); ($path =~ s|\[\[:?(.*?)\]\]|$1|g); while ($path =~ m|\[.*?\]|) { my $title; if ($path =~ m|\[[^ ]* (.*?)\]|) {     $title = $1;    } else {      $numbered_links++;      $title = ".5B$numbered_links.5D";    }    $path =~ s|\[.*?\]|$title|;  }  ($path =~ s|\s|_|g);  ($path =~ s| |.C2.A0|g);  while ($path =~ m|([^/a-z0-9\.:_'-])|i) {    my $bad = $1;    my $fix = uc('.'.sprintf("%x",ord($bad)));    ($path =~ s/\Q$bad/$fix/g);  }  return ($path,$numbered_links); }

# post_log() - save the accumulated run log to the bot's log page, stamped
# with the time this run started.
sub post_log {
    my $pg = $c->get('User:HBC Archive Indexerbot/logs', 'rw');
    $pg->{summary} = ('Writing log file for ' . $start_time) . ' - Bot edit';
    $pg->{content} = $log_file;
    $pg->save;
}

# send_report($dest, $report, $edit_summary) - write a generated index to its
# destination page (prepending the blanking-permission marker that
# check_text() looks for), log failures, throttle, and return save status.
sub send_report {
    my $dest         = shift;
    my $report       = shift;
    my $edit_summary = shift;
    my $pg = $c->get($dest, 'w');
    $pg->{summary} = $edit_summary;
    # NOTE(review): the wiki dump stripped this HTML comment down to '';
    # reconstructed to match the marker check_text() searches for - verify
    # against page history.
    $pg->{content} = '<!-- HBC Archive Indexerbot can blank this -->' . "\n" . $report;
    $ul += length($report);
    my $result = $pg->save;
    unless ($result) {
        my $dest_entities = encode_entities($dest);
        writelog("Failed to save report to $dest_entities");
    }
    sleep($write_delay);
    return $result;
}

# watch($page_name) - add a page to the bot's watchlist via action=watch;
# bumps the watched-pages counter and returns the HTTP success flag.
sub watch {
    my $page_name = shift;
    my $url       = $c->{index} . "?title=$page_name&action=watch";
    my $success   = $c->{ua}->get($url)->is_success;
    $pages_watched++ if ($success);
    return $success;
}

# writelog($entry) - timestamp a log entry (UTC, English month names so the
# output is locale-independent), append it to the in-memory log buffer, and
# echo it to STDERR.  The first entry's timestamp becomes the run start time.
sub writelog {
    my $entry = shift;
    my @month_table = qw(January February March April May June
                         July August September October November December);
    my ($sec, $min, $hour, $mday, $mon, $year) = gmtime(time);
    my $time = sprintf("%02d:%02d:%02d %02d %s %04d",
                       $hour, $min, $sec, $mday, $month_table[$mon], $year + 1900);
    $start_time ||= $time;
    $log_file .= ('* ' . $time . ': ' . $entry . ' ' . "\n");
    warn "$entry\n";
}

# _escape($val) - underscore-normalize a page name, then URI-escape it for
# use in cache filenames and URLs.
sub _escape {
    my ($name) = @_;
    return uri_escape_utf8(_underscore($name));
}

# _hash($val) - SHA-1 hex digest of the escaped page name.
sub _hash {
    my ($name) = @_;
    return sha1_hex(_escape($name));
}

# _underscore($val) - replace every whitespace character with an underscore
# (MediaWiki canonical page-name form) and return the result.
sub _underscore {
    my ($name) = @_;
    $name =~ s/\s/_/g;
    return $name;
}

MediaWiki Change
Fixing an error in MediaWiki/page.pm (a missing `&` in the watch URL's query string):

return $obj->{ua}->get($obj->_wiki_url . "&action=" . ($unwatch ? "un" : "") . "watch")->is_success;