User:HBC archive builderbot/source

use strict;
use warnings;
use Storable;
use LWP::UserAgent;
use HTTP::Request::Common;
use XML::Simple;
use URI::Escape;
use Data::Dumper;
use Algorithm::Diff qw(diff);

my $ua = LWP::UserAgent->new(
  'agent' => 'HBC archive builderbot v0.1 - developing (Operated by User:HighInBC)',
);
my $nowiki = 'nowiki'; # So it doesn't screw up the display of the source code on wiki

my $page = 'Wikipedia:Requests for comment/User names';
my $shortcut = 'WP:RFCN';
$shortcut ||= $page; # Fall back to the full page title when no shortcut is set
my %revisions = get_complete_history($page);

my @old_content;
my $old_key;
my $day = ''; # Initialised so the first comparison below is defined

KEY: foreach my $key (sort {$a <=> $b} keys(%revisions)) {
  my @content   = split("\n", ${$revisions{$key}}{'text'}{'content'});
  my $timestamp = ${$revisions{$key}}{'timestamp'};
  my $summary   = ${$revisions{$key}}{'comment'} || ''; # A revision may have no edit summary
  $summary =~ s|/\*.*\*/\s*||; # Strip the /* section */ prefix from the summary
  # Anonymous edits carry an IP in the export XML rather than a username
  my $user = ${$revisions{$key}}{'contributor'}{'username'}
          || ${$revisions{$key}}{'contributor'}{'ip'} || '';
  my @headings;

  # Diff this revision against the previous one; any heading that appears
  # on a removed line is an entry that was archived by this edit.
  if (scalar(@content) && scalar(@old_content)) {
    my @diffs = diff(\@old_content, \@content);
    foreach my $ra_hunk (@diffs) {
      foreach my $ra_diff (@{$ra_hunk}) {
        my($action, $content) = @{$ra_diff}[0,2];
        if (($content =~ m|==\s?([^=]*)\s?==|) && ($action eq '-')) {
          my $heading = $1;
          # Wrap templates in <nowiki>...</nowiki> so they don't expand in the report
          $heading =~ s|(\{\{.*:.*\}\})|<$nowiki>$1</$nowiki>|;
          push(@headings, $heading);
        }
      }
    }
  }

  if (scalar(@headings)) {
    $timestamp =~ m|(\d{4}-\d{2}-\d{2})T(\d{2}:\d{2}):\d{2}Z|;
    my($date, $time) = ($1, $2);
    if ($date ne $day) {
      $day = $date;
      warn "$day\n";
    }
    my $archive_link = "[ Archive link]"; # External-link markup; the target URL is left blank here
    if (scalar(@headings) > 1) {
      warn "* $time: $archive_link - ($summary ($user)) - (".scalar(@headings)." entries)\n";
      foreach my $heading (@headings) {
        warn "** $heading\n";
      }
    } elsif (scalar(@headings) == 1) {
      warn "* $time: $archive_link - $headings[0] - ($summary ($user))\n";
    }
  }
  @old_content = @content;
  $old_key = $key;
}

sub get_complete_history {
  # TODO: Add Gzip, 100 times smaller, gee where did that ratio come from??
  mkdir('cache') unless (-d('cache'));
  my $page = shift;
  my %revisions;
  my $count;
  my $offset;
  my $total = 0;
  my $fname = 'cache/'.uri_escape($page);

  if (-f($fname)) {
    warn "Found '$page' in cache, loading...\n";
    %revisions = %{retrieve($fname)};
    my @keys = sort {$a <=> $b} keys(%revisions);
    $offset = $revisions{$keys[-1]}{'timestamp'}; # Timestamp of the most recent cached revision
    warn scalar(keys(%revisions))." loaded from cache.\n";
  } else {
    warn "No cache, starting fresh.\n";
    $offset = '0';
  }

  GETMORE:
  warn "\nDownloading as many as 100 revisions starting at ".($offset || 'the start')."\n";
  my $index = 'http://en.wikipedia.org/w/index.php';
  my $res = $ua->request(
    POST $index.'?title=Special:Export',
    Content_Type => 'application/x-www-form-urlencoded',
    Content      => [
      'pages'  => $page,
      'action' => 'submit',
      'submit' => 'Export',
      'limit'  => 100,
      'offset' => $offset,
    ],
  );
  my $current = $res->content;
  unless ($current =~ m|^<mediawiki|) {
    warn "Failed somehow, trying again.\n";
    goto GETMORE;
  }

  # Fish the timestamp of the last revision out of the tail of the XML;
  # it becomes the offset for the next batch.
  my $pos    = rindex($current, ' ');
  my $string = substr($current, $pos, 43);
  $offset = $1 if ($string =~ m| (.+?) |);

  my $xml_data = XMLin($current);
  $count = 0;
  if (!scalar(keys(%{${$xml_data}{'page'}{'revision'}}))) {
    # No revisions in this batch; do nothing
  } elsif (${$xml_data}{'page'}{'revision'}{'id'}) {
    # A single revision comes back as a plain hash rather than a hash keyed by revision id
    unless ($revisions{${$xml_data}{'page'}{'revision'}{'id'}}) { $count++; $total++; }
    $revisions{${$xml_data}{'page'}{'revision'}{'id'}} = ${$xml_data}{'page'}{'revision'};
  } else {
    foreach my $revision (sort {$a <=> $b} keys(%{${$xml_data}{'page'}{'revision'}})) {
      unless ($revisions{$revision}) { $count++; $total++; }
      $revisions{$revision} = ${$xml_data}{'page'}{'revision'}{$revision};
    }
    warn Dumper($xml_data) unless ($total);
  }
  warn "Got $count revisions\n";
  if ($count == 100) {
    warn "Still more.\n";
    goto GETMORE;
  }
  if ($total > 0) {
    warn "Saving cache...\n";
    store(\%revisions, $fname);
    warn "done.\n";
  }
  return %revisions;
}