User:OvrLoad/listredir.pl


 * 1) !/usr/bin/perl


 * 1) Wikipedia bot to fix redirects to lists.
 * 2) e.g., if Random Thing 3 is a redirect to List of random things,
 * 3) which has a section #Random Thing 3, change all instances of
 * 4) Random Thing 3 to List of random things


 * 1) Usage: listredir.pl [--nostart] Name of list
 * 2) --nostart causes it to not edit the user page to mark start; use when restarting after crash


 * 1) Copyright (C) 2005 Nick Tarleton (Wikipedia username: Nickptar)
 * 2) This code may be redistributed, modified or unmodified, without restriction,
 * 3) provided that this copyright notice and license remain in place.


 * 1) We apologize for the ad-hoc-ness.

use warnings; use strict;

use LWP; use HTTP::Cookies; use POSIX qw(strftime);


 * 1) modify as necessary

my $wikihost = 'en.wikipedia.org'; my $wikipath = 'w/index.php'; my $wikiuser = 'OvrLoad'; my $wikipass = ''; my $sleeptime = 30;

my $jar = HTTP::Cookies->new;

my %sectionsbyredir; # contains redirect->section header mappings my %redirsbypage; # contains pages needing fixing and the redirects on them my %pagesbysection; # contains section->page mappings, for talk page message

$| = 1; # why not

my $date = strftime('%d %B %Y', gmtime); $date =~ s/\[\[0/[[/; # get rid of stupid leading zero on day
 * 1) format: 4 August 2005

my $nostart = 0; if($ARGV[0] eq "--nostart") { $nostart = 1; shift @ARGV; }

my $listname = join(' ', @ARGV); die "Usage: $0 listname" unless $listname =~ /\S/;

print "listredir.pl - by Nick Tarleton (User:Nickptar)\n"; print "processing $listname\n\n";

login; getMap; getPages; setWorking; fixPages; writeNote; setDone;

sub get_ua { my $ua = LWP::UserAgent->new; $ua->agent("listredir.pl operated by User:$wikiuser"); $ua->cookie_jar($jar); push @{ $ua->requests_redirectable }, 'POST'; return $ua; }
 * 1) make and initialize a UserAgent

my $starttime; my $edittime; my $edittoken; my $editingpage; my $editingpage_orig;

sub checkout { $editingpage_orig = $editingpage = shift; $editingpage =~ s/ /_/g; $editingpage =~ s/&amp;/%26/g; print "Retrieving $editingpage_orig... ";
 * 1) get the text of a page, and be prepared to write it back

my $ua = get_ua; my $addr = "http://$wikihost/$wikipath?title=$editingpage&amp;action=edit"; my $response = $ua->get($addr); die $response->status_line unless $response->is_success;

$response->content =~ /]*>([^<]*)<\/textarea>/s or die "syntax error: no textarea"; my $result = $1; $result =~ s/&amp;lt;//g; $result =~ s/&amp;quot;/"/g;	$result =~ s/&amp;amp;/&amp;/g;

$response->content =~ //		or die "syntax error: no starttime";	$starttime = $1;	$response->content =~ // or die "syntax error: no edittime"; $edittime = $1; $response->content =~ //		or die "syntax error: no edittoken";	$edittoken = $1;

print "Success.\n"; return $result; }

sub checkin { my ($text, $summary, $minor) = @_; print "Saving... ";
 * 1) save the page being edited

my $ua = get_ua; my $addr = "http://$wikihost/$wikipath?title=$editingpage&amp;action=submit"; my $response = $ua->post($addr,		Content_Type => 'form-data',		Content => {			'wpTextbox1' => $text,			'wpSummary' => $summary,			'wpSave' => 'Save page',			'wpMinoredit' => ($minor ? '1' : ),			'wpSection' => ,			'wpStarttime' => $starttime,			'wpEdittime' => $edittime,			'wpEditToken' => $edittoken		}	); die $response->status_line unless $response->is_success; # crappy crappy success check $editingpage_orig =~ s/\(/\\(/g; $editingpage_orig =~ s/\)/\\)/g; $editingpage_orig =~ s/&amp;/&amp;amp;/g; unless($response->content =~ / $editingpage_orig/) { open OUT,">results" or die "could not open results"; print OUT $response->content; close OUT; die "error. Dumped to results." }	print "Success. zzz...\n"; sleep $sleeptime; }

sub addsection { my ($page, $text, $title) = @_; my $page_orig = $page; $page =~ s/ /_/g; $page =~ s/&amp;/%26/g; print "Adding to $page_orig... ";
 * 1) add a section to a talk page

my $ua = get_ua; my $addr = "http://$wikihost/$wikipath?title=$page&amp;action=edit"; my $response = $ua->get($addr); die $response->status_line unless $response->is_success;

$response->content =~ //		or die "syntax error: no edittime";	my $section_edittime = $1;	$response->content =~ // or die "syntax error: no edittoken"; my $section_edittoken = $1;

$addr = "http://$wikihost/$wikipath?title=$page&amp;action=submit"; $response = $ua->post($addr,		Content_Type => 'form-data',		Content => {			'wpSummary' => $title,			'wpTextbox1' => $text,			'wpSave' => 'Save page',			'wpSection' => 'new',			'wpEdittime' => $section_edittime,			'wpEditToken' => $section_edittoken		}	); die $response->status_line unless $response->is_success; $page_orig =~ s/\(/\\(/g; $page_orig =~ s/\)/\\)/g; $page_orig =~ s/&amp;/&amp;amp;/g; unless($response->content =~ / $page_orig/) { open OUT,">results" or die "could not open results"; print OUT $response->content; close OUT; die "error. Dumped to results." }	print "Success. zzz...\n"; sleep $sleeptime; }

sub login { print "Logging in $wikiuser... ";
 * 1) log in

my $ua = get_ua; my $addr = "http://$wikihost/$wikipath?title=Special:Userlogin&amp;action=submitlogin"; my $response = $ua->post($addr,		{			'wpName' => $wikiuser,			'wpPassword' => $wikipass		}	); die $response->status_line unless $response->is_success; die "Login failed." unless $response->content =~ /You are now logged in/; print "Success.\n"; }

my $whatlinkshere;

sub getMap { my $listtext = checkout($listname); my @headings; push @headings, $1 while $listtext =~ /^=+ *([^=]+) *=+/gm;
 * 1) populate %sectionsbyredir

# get whatlinkshere {		print "Retreiving whatlinkshere... "; my $ua = get_ua; my $addr = "http://$wikihost/$wikipath/Special:Whatlinkshere/$listname?limit=5000"; $addr =~ s/ /_/g; my $response = $ua->get($addr); die $response->status_line unless $response->is_success; print "Success.\n\n"; $whatlinkshere = $response->content; $whatlinkshere =~ s/&amp;lt;//g; $whatlinkshere =~ s/&amp;quot;/"/g;		$whatlinkshere =~ s/&amp;amp;/&amp;/g;	}

while($whatlinkshere =~ />([^<]+)<\/a> \(redirect page\)$/mg) { my $redir = $1; # straightforward if(grep(/^$redir$/i, @headings) > 0) { print "$redir -> $redir\n"; # fix regexp special characters; more may be necessary $redir =~ s/\(/\\(/g; $redir =~ s/\)/\\)/g; $sectionsbyredir{$redir} = $redir; } else { # handle plurals if($redir =~ /^(.*)s$/) { my $nonplural = $1; if(grep(/^$nonplural$/i, @headings) > 0) { print "$redir -> $nonplural\n"; # fix regexp special characters; more may be necessary $redir =~ s/\(/\\(/g; $redir =~ s/\)/\\)/g; $sectionsbyredir{$redir} = $nonplural; next; }			}

# handle disambig-names like Foo (Fictional Universe) if($redir =~ /^(.*) \(.*\)$/) { my $realname = $1; if(grep(/^$realname$/i, @headings) > 0) { print "$redir -> $realname\n"; # fix regexp special characters; more may be necessary $redir =~ s/\(/\\(/g; $redir =~ s/\)/\\)/g; $sectionsbyredir{$redir} = $realname; next; }			}

# catch-all for forms like: # First Last -> Last, First # Mr. Foo -> Mr. and Mrs. Foo # (and more?)

# get all subnames of the redirect my @names = split(/ /, $redir); my @filter = @headings; # find headings containing all of the subnames foreach my $name (@names) { # fix regexp special characters; more may be necessary $name =~ s/\(/\\(/g; $name =~ s/\)/\\)/g; @filter = grep(/$name/i, @filter); last if @filter == 1; # don't let anything eliminate it too early @filter = @headings if @filter == 0; }			if(@filter == 1) { print "$redir -> ". $filter[0]. "\n"; # fix regexp special characters; more may be necessary $redir =~ s/\(/\\(/g; $redir =~ s/\)/\\)/g; $sectionsbyredir{$redir} = $filter[0]; } elsif(@filter == 0 || @filter == @headings) { # keep everything from being listed print "$redir -> nothing\n"; } else { print "$redir ->\n"; print "\t$_\n" foreach @filter; }		}	}

return if $nostart;

print "Is this OK (y/n)? "; $_ = ; exit unless /^y/; print "\n"; }

sub getPages { while(my ($redir, $section) = each %sectionsbyredir) { # handle redirects with no links thereto next if $whatlinkshere =~ />$redir<\/a> \(redirect page\)\n<\/li>/; $whatlinkshere =~ />$redir<\/a> \(redirect page\)\n(.+?)<\/ul>\n<\/li>/s or die "$redir not found properly in whatlinkshere"; my $curpages = $1; my @pagesthissection; while($curpages =~ />([^<]+?)<\/a><\/li>$/mg) { my $page = $1; # filter out non-main namespace next if $page =~ /^\w*:/; next if $page =~ /^\w* talk:/; push @{$redirsbypage{$page}}, $redir; push @{$pagesbysection{$section}}, $page; }	} }
 * 1) populate %redirsbypage and %pagesbysection

sub setWorking { return if $nostart; my $text = checkout("User:$wikiuser"); my $message = "$listname, started "; $text =~ s/^(Currently working on:).*$/$1 $message/m; checkin $text, "Mark as working on: $listname"; }
 * 1) mark user page as working

sub fixPages { foreach my $page (keys %redirsbypage) { my $text = checkout($page); # handle editing the list itself my $presection = ($page eq $listname) ? '' : $listname; foreach my $redir (@{$redirsbypage{$page}}) { my $section = $sectionsbyredir{$redir}; $text =~ s/\[\[ *($redir) *\]\]/$1/gi; $text =~ s/\[\[ *$redir *\| *([^\]]*?) *\]\]/$1/gi; }		checkin $text, "Changing links-to-redirects-to-lists to links-to-list-sections. See userpage for more info.", 1; } }
 * 1) do the work

sub writeNote { my ($exampleRedir, $exampleSection) = each %sectionsbyredir; my $note = <$exampleRedir is changed to $exampleRedir.
 * 1) leave a note on the talk page

As a result, anyone who intends to split a section out of this page should be aware that, as of $date, the following sections were linked to from the following pages:

EOM

foreach my $section (keys %pagesbysection) { $note .= "* $section: "; foreach my $page (@{$pagesbysection{$section}}) { $note .= "$page, "; }		chop $note; # take off last ", "; chop $note; $note .= "\n"; }

$note .= "\n~"; addsection "Talk:$listname", $note, 'Note to anyone intending on splitting off a section'; }

sub setDone { my $text = checkout("User:$wikiuser"); $text =~ s/^(Currently working on:).*$/$1 nothing/m; {		# chomp off all trailing newlines local $/ = ''; chomp $text; }	$text .= "\n# $listname (finished $date)"; checkin $text, "Mark as done." }
 * 1) mark user page as done