User:Interiot/Tool/code

Notes: Green bars are for edits with summaries; red bars are for edits with no summaries. The statistics are real-time (it scrapes data off of the Special:Contributions page while you wait). It's somewhat slow for edit counts over 5000. It's unable to count deleted edits. It should work with most wikis out there that use MediaWiki, since it doesn't need privileged access to the databases.

<ul> <li>Source code is in the public domain and available here. <li>Warning: <a href="http://www.bbc.co.uk/dna/h2g2/A1091350">metrics are evil</a> </ul>

For bug reports/comments, see <a href="http://en.wikipedia.org/wiki/User_talk:Interiot">User talk:Interiot</a> or <a href="http://en.wikipedia.org/wiki/Special:Emailuser/Interiot">email him</a>. EOF } else { $this_namespace = $valid_namespaces{lc $site}; #cgi_dumper(\$this_namespace); exit;

$username =~ s/^_+|_$//g;

#print "$site $username\n"; $namespace_totals{earliest} = get_5000($site, $username, 0); #cgi_dumper(\@urls, \%namespace_totals); exit; #cgi_dumper(\%unique_articles); $namespace_totals{"number of unique articles"} = scalar(keys %unique_articles); $namespace_totals{"avg edits per article"} = sprintf("%5.2f", $namespace_totals{total} / $namespace_totals{"number of unique articles"});

print $xml_lang, <<'EOF'; <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> td {padding: .1em 1em .1em}

table.months {padding-top: 2em} table.months td.date {font-weight: bold} table.months td {font-size: 75%}

div.red, div.green { height:1em; float:left; }       div.red {background-color: #f00} div.green {background-color: #0f0}

div.topN { float: left; min-height: 30em;          /* otherwise, they get ALL jumbled up */ }       table.topN { float: left; border: 1px solid black; }       table.topN th { background-color: #000; color: #fff; }       table.topN td { /* override the above */ padding: .1em .3em; } EOF

print "<ul style='padding-left:10em'><a href='$ENV{SCRIPT_NAME}'>Go back</a> to see caveats or to check another user.</ul>\n"; print " User:$username \n"; print " \n";

#### output the months stats #cgi_dumper(\%month_editsummary_totals); my @months = list_months; my $max_width = 0; $max_width = ($_ > $max_width ? $_ : $max_width) foreach (values %month_totals); if ($max_width > 0) { print " \n"; }

#### output the top-15 namespace stats my $num_to_present = 15; if ($this_namespace) {         # only do it if we're sure about the namespaces print " \n"; #print "<ul>NOTE: This section has a tendency to hilight a user's \"youthful indiscretions\". Please take the dates of the edits into account.</ul>\n"; foreach my $nmspc ("Mainspace", @{$this_namespace->{"\x00order"}}) { next unless %{$namespace_unique_articles{$nmspc}}; my @articles = sort {$namespace_unique_articles{$nmspc}{$b} <=> $namespace_unique_articles{$nmspc}{$a}} grep { $namespace_unique_articles{$nmspc}{$_} > 1}         # filter out items with only 1 edit keys(%{$namespace_unique_articles{$nmspc}}); next unless @articles;

#print " \n"; print " \n"; #print " \n"; }   }

#### output the bottom summary print "<p style='clear:left'> If there were any problems, please <a href='http://en.wikipedia.org/wiki/Special:Emailuser/Interiot'>email Interiot</a> or post at <a href='http://en.wikipedia.org/wiki/User_talk:Interiot'>User talk:Interiot</a>.\n"; #print " Based on these URLs:\n<ul>\n", join("\n", map {"<li><a href='$_>$_</a>"} @urls), "</ul>\n"; print " Based directly on these URLs:\n"; foreach my $ctr (0..$#urls) { print "<a href='$urls[$ctr]'>[", ($ctr+1), "]</a>"; print ", " unless ($ctr >= @urls - 1); print "\n"; }   print " \n";

#### log the bandwidth used open FOUT, ">>". LOGFILE or die; printf FOUT "%s %-20s %-30s  %5dK  %7d\n", scalar(localtime), $username, $site, int($bandwidth_down / 1024), $namespace_totals{total}; close FOUT; }

# get_5000($site, $username, $offset)
#
# Fetch one page of Special:Contributions (up to 5000 entries) for
# $username on $site, parse each contribution line out of the HTML, and
# accumulate statistics into the file-level globals (%namespace_totals,
# %month_totals, %month_editsummary_totals, %unique_articles,
# %namespace_unique_articles, %article_titles, @urls, $bandwidth_down,
# $xml_lang, $earliest_perldate, $latest_perldate). Recurses to the next
# results page when the user has more than 5000 contributions.
#
# Returns the raw date text of the earliest contribution seen ("" if none).
sub get_5000 { my $site = shift; my $username = shift; my $offset = shift;

# Earliest contribution date seen on this (and deeper) pages, as raw text.
my $earliest = "";

# Build the contributions URL (limit=5000 is the most MediaWiki serves per
# page), set a wget-like User-Agent because some wikis reject the default
# LWP one, record the URL for the "Based directly on these URLs" footer,
# and bail out after 10 pages to cap bandwidth. The if(1)/else switch
# toggles between a live HTTP fetch and replaying a previously saved page
# from /var/tmp/kate/tmp.out for offline debugging.
my $url = "http://$site/w/index.php?title=Special:Contributions&target=$username&offset=${offset}&limit=5000"; if (! $LWP::Simple::ua) { LWP::Simple::_init_ua; #$LWP::Simple::ua->agent("Mozilla/4.0 WebTV/2.6 (compatible; MSIE 4.0)");    # apparently they're picky about useragent strings $LWP::Simple::ua->agent("Wget/1.9.1");    # apparently they're picky about useragent strings. Use the same as wget. }   push(@urls, $url); if (@urls >= 10) { print "Too many pages fetched. Terminating. \n"; #cgi_dumper(\@urls); exit; }   my $page; if (1) { my $request = HTTP::Request->new(GET => $url); my $response = $LWP::Simple::ua->request($request); if (!$response->is_success) { print "While trying to fetch <a href='$url'>$url</a>, $site responded: \n", $response->status_line, "  ", $response->content; exit; }       $page = $response->content; $bandwidth_down += length($page); if (0) { local *FOUTOUT; open FOUTOUT, ">/var/tmp/kate/tmp.out" or die; print FOUTOUT $page; close FOUTOUT; }   } else { open FININ, "</var/tmp/kate/tmp.out" or die; local $/ = undef; $page = <FININ>; close FININ; }

# Capture the <html ...> tag so the output page can reuse the wiki's own
# language/encoding attributes.
if ($page =~ /(<html [^>]+>)/i) { $xml_lang = $1; }

## parse each individual contribution #while ($page =~ /^<li>(\d\d:\d\d,.*)/igm) { while ($page =~ /^<li>([^(]+\(<a href="[^"]+action=history.*)/igm) { my $this_time;

# Work on this one contribution's HTML in $_ for the regexes below.
local $_ = $1;

# Detect whether this edit carried an edit summary.
# NOTE(review): the live pattern `/ /si` looks like it lost its real
# content in this wiki dump (the commented-out pattern above it matches a
# parenthesized summary link) -- confirm against the original source.
my $edit_summary; #$edit_summary++ if (m#<a href="/wiki/[^"]*"\s+title="[^"]*">[^<]*</a>\s*\(#is);       $edit_summary++ if (/ /si);

# Pull out the article's URL path and display title from the first
# /wiki/... link, and count this edit toward that article.
my $article_url; if (m#<a href="/wiki/([^"]+)" title="[^"]+">([^<]+)#si) { $article_url = $1; $article_titles{$1} = $2; }       $unique_articles{$article_url}++;

# Strip all HTML tags, then take everything before the first " (" as the
# edit's date text.
## strip out all the HTML tags s/<[^>]*>//gs; if (/^(.*?) \(/) {           my $date = $1;            $earliest = $date;

# Translate non-English month abbreviations to English so str2time can
# parse them (languages believed to work: EN, DE, IT).
# NOTE(review): "mÃ¤r" looks like UTF-8 mojibake of German "mär" -- confirm
# the byte sequence matches what the fetched pages actually contain.
# translate months into english, so Date::Parse chn handle them # languages believed to work here:     EN, DE, IT            $date =~ s/\b(?:gen                 )\b/jan/gix; $date =~ s/\b(?:mÃ¤r                )\b/mar/gix; $date =~ s/\b(?:mai|mag            )\b/may/gix; $date =~ s/\b(?:giu                )\b/jun/gix; $date =~ s/\b(?:lug                )\b/jul/gix; $date =~ s/\b(?:ago                )\b/aug/gix; $date =~ s/\b(?:set                )\b/sep/gix; $date =~ s/\b(?:okt|ott            )\b/oct/gix; $date =~ s/\b(?:dez|dic            )\b/dec/gix;

# Parse the date. Contributions are listed newest-first, so each parsed
# time overwrites $earliest_perldate, while $latest_perldate keeps the
# first (newest) time seen thanks to ||=.
$this_time = str2time($date); if ($this_time == 0) { #print "XXXXXXXXXXXXXXXXXXXXXXXXX \n"; } else { #print scalar(gmtime($this_time)), " \n"; $earliest_perldate = $this_time;       # record the earliest and latest month we see $latest_perldate ||= $this_time;

# Tally this edit into its month bucket (and the with-summary bucket when
# applicable), then strip the leading date/(diff)/(hist) prefix so only
# the page title remains in $_.
# NOTE(review): the empty-looking `[^]` character classes below appear
# garbled by the wiki dump (characters lost) -- confirm against the
# original source before relying on this substitution.
my $monthkey = monthkey(localtime($this_time)); $month_totals{$monthkey}++; $edit_summary && $month_editsummary_totals{$monthkey}++; }       }        s/^[^]*\([^]*\) \([^]*\) (?:\S )? //;

# Classify the edit by namespace: a leading "Word:" (or "Word Word:")
# prefix is taken as a namespace only if it's in the site's known
# namespace table (when we have one); everything else is Mainspace.
my $subspace = "Mainspace"; if (/^([^\s\d\/:]+(?:\s[^\s\d\/:]+)?:)/) { if (!$this_namespace || exists $this_namespace->{$1}) { $subspace = $1; }       }        $namespace_totals{$subspace}++; $namespace_totals{total}++; $namespace_unique_articles{$subspace}{$article_url}++;

#print "$_ \n"; }

# If the page links to a further Contributions page with a smaller (older)
# offset, recurse into it; the recursive call's return value becomes ours,
# so the deepest page supplies the earliest date.
## if they have more than 5000 contributions, go to the next page while ($page =~ /href="[^"]+:Contributions[^"]+offset=(\d+)/ig) {       #print "Trying again at offset $1 \n";        next unless $1 > 0 && ($offset == 0 || $1 < $offset);        return get_5000($site, $username, $1);      # tail recursion until there are no more    }

return $earliest; }

# Return the chronological list of "year/month" keys covering the span
# from the earliest to the latest contribution seen (globals set by
# get_5000). Walks the range in 10-day steps so no month can be skipped,
# emitting each month key the first time it appears.
# Yes, this is a fairly odd algorithm. Oh well.
sub list_months {
    my @keys;
    my $prev_key  = '';
    my $ten_days  = 10 * 24 * 60 * 60;    # step size in seconds
    for (my $t = $earliest_perldate; $t <= $latest_perldate; $t += $ten_days) {
        my $key = monthkey(localtime($t));
        next if $key eq $prev_key;
        push @keys, $key;
        $prev_key = $key;
    }
    return @keys;
}
# returns something like [
#       "2003/10",
#       "2003/11",
#       "2003/12"
# ]

# Build a "year/month" key (e.g. "2003/10") from a localtime()-style list:
# element 4 is the 0-based month, element 5 is the year minus 1900.
# Note the month is not zero-padded (January is "2004/1").
sub monthkey {
    my ($mon, $year) = @_[4, 5];
    return ($year + 1900) . '/' . ($mon + 1);
}

# Debug helper: dump the given structures with Data::Dumper, HTML-escape
# the result so it displays literally in the CGI output, and print it
# surrounded by single spaces.
sub cgi_dumper {
    my $dumped = Dumper(@_);
    print " ", HTML::Entities::encode($dumped), " ";
}