User:Polbot/source/Reffix.pl


 * 1) Use like:
 * 2)   perl reffix.pl Catname

use strict; use Perlwikipedia; use URI::Escape; use LWP::UserAgent; use Encode; use XML::Simple;

my $Polbot_password = '(bot password)'; my $az_AccessKey = '(Amazon.com access code)'; my $crossref_creds = '(username:password)';

my $soonest_next_op = time; my $wait_time = 10; my $ignorenamespaces = 'User|User talk|Talk|Template|Template talk|Portal|Portal talk|Category|Category talk|Portal talk|Wikipedia talk|Image|Image talk|MediaWiki|MediaWiki talk|Template talk|Help|Help talk'; my $editsummary = 'Automated fixes to external links and references. (See the FAQ for details.)'; my $blacklist = '(^Cannot find server|(File|Resource|Article|Page) (was )?not found|(^|\s)Log ?In($|\s)|(^|\s)Sign ?in($|\s))';

my ($Second, $Minute, $Hour, $Day, $Month, $Year, $WeekDay, $DayOfYear, $IsDST) = localtime(time); $Year += 1900; $Month++; $Month =~ s/^(\d)$/0$1/; $Day =~ s/^(\d)$/0$1/; my $Todays_date = "$Year-$Month-$Day";
 * 1) date

my $category = shift;

print "Running Polbot's reffix function, category = $category\n";

print "\nLogging in to Wikipedia.\n" ; my $pw=Perlwikipedia->new; $pw->{mech}->agent('Bot/polbot'); my $login_status=$pw->login('Polbot', $Polbot_password); die "I can't log in." unless ($login_status eq 0);

my $ua = LWP::UserAgent->new; $ua->agent("Firefox/3.0.1"); $ua->cookie_jar({});

print "Opening category '$category'\n"; my @allpages = $pw->get_pages_in_category("Category:$category");

print "There are ". scalar(@allpages). " total pages to go through.\n";

foreach my $articlename (@allpages) { print "Examining $articlename\n"; if ($articlename =~ /^$ignorenamespaces:/i) { print " Not an article. Skipping.\n"; next; }	# -	# First, look at the article and set variables. my $bNeedsChanging = 0; my $newart = ''; my $bHasReferencesTag = 0; my $bHasReflist = 0; my $bHasRefTag = 0;

my $art = $pw->get_text($articlename); # Exclusion compliance if ($art =~ m/\{\{\s*(nobots\s*\}\}|bots\s*\|\s*allow\s*=|bots\s*|\s*deny\s*=\s*all)/si) { print ", skipping.\n"; next; }	# variables if ($art =~ /<\s*references\s*\/\s*>/is) { $bHasReferencesTag = 1; }	if ($art =~ /\{\{\s*(template\s*:\s*)?reflist\s*[\|\}]/is) { $bHasReflist = 1; }	if ($art =~ /<\s*ref(\s+name\s*=\s*(?:"[^"]*"|\w+)|)\s*>/si) {		$bHasRefTag = 1;	}

# -	# Change to 	if ($bHasReferencesTag == 1) { #$bNeedsChanging = 1; $art =~ m/(<\s*references\s*\/>)/si; my $refsect = $1; $art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>)/si; my $temp2 = $1; $art =~ m/(<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<(span|div)( class=\"(references-small|small|references-2column))?\">\s*<\s*references\s*\/>\s*<\/\s*(span|div)>\s*<\/\s*(span|div)>)/si; my $temp3 = $1;

if ($temp3) { $refsect = $temp3; } elsif ($temp2) { $refsect = $temp2; }		if ($refsect) { my $newrefsect = $refsect; if ($refsect =~ m/references-2column/) { $newrefsect = ""; } elsif ($refsect =~ m/[^-]column-count:[\s]*?(\d*)/) { $newrefsect = ""; } elsif ($refsect =~ m/-moz-column-count:[\s]*?(\d*)/) { $newrefsect = ""; } else { $newrefsect = ""; }			$art =~ s/$refsect/$newrefsect/si; $bHasReflist = 1; }	}

# -	# Fix http://...

while ($art =~ m/\[\[(https?:\/\/[^\]]*)\]\]/si) { my $badlink = $1; $bNeedsChanging = 1; print " Fixing $badlink\n"; $art =~ s/\[\[\Q$badlink\E\]\]/[$badlink]/si; }

# -	# Fix ext links to Wikimedia # en.wikipedia while ($art =~ m/http:\/\/(?:en\.)?wikipedia\.org\/wiki\/([^\] ]*)/g) { my $extwikilink = $1; $bNeedsChanging = 1; my $intwikilink = $extwikilink; $intwikilink =~ s/_/ /g; $intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg; $intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $intwikilink =~ s/^(Image|Category):/:$1:/; print " Fixing ext wikilink $extwikilink to $intwikilink\n"; # non-renamed $art =~ s/\[http:\/\/(en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E\]/$intwikilink/g; # renamed $art =~ s/\[http:\/\/(?:en\.)?wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g; }	# other.wikipedia while ($art =~ m/\[http:\/\/([^\.]*).wikipedia.org\/wiki\/([^\] ]*)/s) { my $extwikilang = $1; my $extwikilink = $2; $bNeedsChanging = 1; my $intwikilink = $extwikilink; $intwikilink =~ s/_/ /g; $intwikilink =~ s/%([0-9A-Fa-f]{2})%([0-9A-Fa-f]{2})/decode("utf8", chr(hex($1)) . chr(hex($2)))/eg; $intwikilink =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; $intwikilink =~ s/^(Image|Category):/:$1:/; print " Fixing ext link $extwikilang.$extwikilink to $extwikilang:$intwikilink\n"; # non-renamed $art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E\]/$extwikilang:$intwikilink/g; # renamed $art =~ s/\[http:\/\/$extwikilang\.wikipedia\.org\/wiki\/\Q$extwikilink\E ([^\]]*)\]/$1/g; }

# -	# ref the BELs # First, QQQ the PDFlink BELs $art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi; # Next, QQQ the html comments $newart = $art; while ($art =~ m/<\!--(.*?)-->/gs) { my $comment = $1; my $newcomment = $comment; if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) { $newart =~ s/\Q$comment\E/$newcomment/; }	}      	$art = $newart; # Next, QQQ the links already in refs $newart = $art; while ($art =~ m/(<\s*ref.*?<\s*\/\s*ref\s*>)/gs) { my $ref = $1; my $newref = $ref; if ($newref =~ s/\[(https?:\/\/)/\[QQQ$1/g) { $newart =~ s/\Q$ref\E/$newref/; }	}      	$art = $newart; # Now QQQ the " at [http]" or " in [http]" or " from [http]" $art =~ s/( at| in| from|At|In|From) \[(https?:\/\/[^\]]*\])/ $1 [QQQ$2/g; # And lastly, QQQ links that begin a line $art =~ s/^([\s\*\#\:]*\[)(https?\:\/\/[^\]]*\])/$1QQQ$2/gm; # Okay! Now ref all non-QQQed BELs above the template (or whatever) my $artbefore = $art; my $artafter = ''; if ($art =~ m/(.*?)(\{\{reflist\}\}|=+\s*Notes?\s*=+|=+\s*References?\s*=+|=+\s*External links?\s*=+|=+\s*Sources?\s*=+|=+\s*Further reading\s*=+|=+\s*See also\s*=+)(.*)/is) { $artbefore = $1; $artafter = "$2$3"; }	$newart = $artbefore; while ($artbefore =~ m/\[(https?:\/\/[^ \]]*)\]/g) { my $BEL = $1; $bNeedsChanging = 1; $bHasRefTag = 1; $newart =~ s/ *\[\Q$BEL\E\]/ QQQ$BEL<\/ref>/g; }	$art = "$newart$artafter"; # UnQQQ it all $art =~ s/QQQhttp/http/g; $art =~ s/ *\(( [^<]*<\/ref>)\)/$1/gs; # -	# Add  if missing if ($bHasRefTag - $bHasReflist == 1) { $bNeedsChanging = 1; print " but no \n"; if ($art =~ m/\n=+\s*(references?|notes)\s*=+\s*\n/mi) { my $putrefin = $1; $art =~ s/(\n=+\s*($putrefin)\s*=+\n)/$1\{\{reflist\}\}\n/si; print " Putting reflist after $putrefin section\n"; } else { $art =~ m/(=+\s*see also\s*=+|=+\s*external links?\s*=+|=+\s*sources?\s*=+|=+\s*further reading\s*=+|\[\[\s*category\s*\:)/si; my $putrefsbefore = $1; if ($putrefsbefore) { $art =~ s/\Q$putrefsbefore\E/==Notes==\n\n\n$putrefsbefore/si; print " Putting reflist before $putrefsbefore section\n"; } else { $art .= "\n"; print " Putting reflist at end\n"; }		}	}

# 	# Known links -> cites or templates # Unkown links -> titles or 	# First, QQQ the PDFlink BELs $art =~ s/(\{\{\s*PDF(?:link)?\s*\|\s*\[)(https?:\/\/[^\]]*\])/$1QQQ$2/gi; # Next, QQQ the html comments $newart = $art; while ($art =~ m/<\!--(.*?)-->/gs) { my $comment = $1; my $newcomment = $comment; if ($newcomment =~ s/\[(https?:\/\/)/\[QQQ$1/g) { $newart =~ s/\Q$comment\E/$newcomment/; }	}      	$art = $newart; # And QQQ the already-dead links $art =~ s/\b(https?\:\/\/[^\s\]\<\{]*\]? ?\{\{dead link\}\})/QQQ$1/g;

my @BURLs = ; # bare URLs, e.g. http://www.example.com/subdir/example.html my @BELs = ;  # bare external links, e.g. my @NELs = ;  # named external links, e.g. name

# Those starting a line push @BURLs, ($art =~ m/^[ \*\#\:]*https?\:\/\/[^\s\]\<]*/mg); push @BELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*\]/mg); push @NELs, ($art =~ m/^[ \*\#\:]*\[https?\:\/\/[^ \]\<]*(?: [^\]]+)\]/mg); # Those in \n";				$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;			}		}		while ($art =~ m/(]*\/>)(.)/gs) {			my $thisref = $1;			my $thischar = $2;			if ($thischar =~ m/[\.\,\?\!\;\:]/) {				print "Found $thischar after \n";				$newart =~ s/\Q$thisref$thischar\E/$thischar$thisref/gs;			}		}		$art = $newart;		# Miscaptalizations		$art =~ s/==(\s*)See also(\s*)==/==$1See also$2==/i;		$art =~ s/==(\s*)External links?(\s*)==/==$1External links$2==/i;

# units $art =~ s/(\d) (mph|km|mile|mi|kilometer|mbar|knot|feet|ft|meter|m|metre|kilometre|inch|million|billion|foot|days|kt|millibar|mm|cm|dollar|USD|inHg|hPa|people|hour|liter|degree|°|year|month|square|sq)\b/$1 $2/g; # HTML $art =~ s/\<\/?i\>/\'\'/gi; $art =~ s/\<\/?b\>/\'\'\'/gi; # Date stuff # Century $art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi; $art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi; $art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $art =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi; # piped decades and years $art =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi; $art =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi; $art =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $art =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $art =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi; # months $art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi; $art =~ s/\[\[January\|(Jan)\]\]/$1/gi; $art =~ s/\[\[February\|(Feb)\]\]/$1/gi; $art =~ s/\[\[March\|(Mar)\]\]/$1/gi; $art =~ s/\[\[April\|(Apr)\]\]/$1/gi; $art =~ s/\[\[May\|(May)\]\]/$1/gi; $art =~ s/\[\[June\|(Jun)\]\]/$1/gi; $art =~ s/\[\[July\|(Jul)\]\]/$1/gi; $art =~ s/\[\[August\|(Aug)\]\]/$1/gi; $art =~ s/\[\[September\|(Sep)\]\]/$1/gi; $art =~ s/\[\[October\|(Oct)\]\]/$1/gi; $art =~ s/\[\[November\|(Nov)\]\]/$1/gi; $art =~ s/\[\[December\|(Dec)\]\]/$1/gi; # month+year $art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi; # Month+day_number "March 7th" -> "March 7" $art =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi; $art =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi; $art =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi; # Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; $art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi; $art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi; $art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi; # solitary day_numbers $art =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $art =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $art =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi; # days of the week in full. Optional plurals $art =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi; # days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'. $art =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi; $art =~ s/\[\[(Sat)\]\]/$1/g; $art =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi; $art =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi; $art =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi; $art =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi; $art =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi; $art =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi; $art =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi; # 4 digit years piped into 2 $art =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi; # year: examine characters in link on left for date, examine characters in link on right for date $art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; # year pair: examine characters in link on left for date, examine characters in link on right for date $art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi; # year: examine characters in link on left for date, avoid links on right $art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; # year pair: examine characters in link on left for date, avoid links on right $art =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi; # year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists. $art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; $art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; # year pair: check for line-ends, text on left, avoid links on right $art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi; # year: avoid links on left, examine characters in link on right for date $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; # year pair: avoid links on left, examine characters in link on right for date $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi; # year:avoid links on left, text on right $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi; # year pair: avoid links on left, text on right $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3$4$5/gi; # year:text on left, text on right $art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.\'\*\|\&])/$1$2$3/gi; # year pair: avoid links on left, text on right $art =~ s/([\w\(\);=:\.\'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:\.\'\*\|\&])/$1$2$3$4$5/gi; # year:avoid links on both sides $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; # year pair: avoid links on both sides $art =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi; # 'present' $art =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi; # Eliminate 'surprise links' also known as 'easter egg links' $art =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi; }

# -	# -- DONE! ---	if ($bNeedsChanging) { wiki_write($articlename, $art, $editsummary); } }

sub wiki_write { my $article_name = shift; my $wiki_out = shift; my $edit_summary = shift; $|=1;	print " Waiting ". ($soonest_next_op - time). " secs... "; $|=1;	while (time < $soonest_next_op) {}; $soonest_next_op = time + $wait_time; print "Writing $article_name\n"; $pw->edit($article_name, $wiki_out, $edit_summary); }

sub citefromlink_GoogleBooks { my $gb_url = shift; my $citetemplate = ''; my $gb_lookup_url = $gb_url; $gb_lookup_url =~ s/(http\:\/\/books\.google\.com\/books\?).*(id\=[^\&]*).*$/$1$2/i; #http://books.google.com/books?hl=en&lr=&id=thmPzIltAV8C&oi=fnd&pg=PP11&sig=81UGsCDc1DxLV3JAWviltyHD_bY&dq=%22Mordecai+Cooke%22#PPA166,M1 #http://books.google.com/books?id=thmPzIltAV8C print " Google book link: $gb_lookup_url\n"; my $res = $ua->get($gb_lookup_url); if ($res->is_success) { print " success.\n"; my $html = $res->content; my $bibdiv = ''; my $gb_title = ''; my $gb_author = ''; my $gb_year = ''; my $gb_pub = ''; my $gb_isbn = ''; my $gb_pages = ''; if ($html =~ m/ ([^<]+)<\/h2>([^\n]*)\n/s) { $gb_title = $1; $bibdiv = $2; }		if ($html =~ m/(.*?)\n/s) { $bibdiv = $1; }		$bibdiv =~ s/ / /g; if ($bibdiv =~ m/By ([^<]*)/) { $gb_author = $1; } elsif ($bibdiv =~ m/ By ([^<]*)/) { $gb_author = $1; }

if ($bibdiv =~ m/\Published by ([^\n\<]*?)\, (\d\d\d\d)\<\/div>/) { $gb_pub = $1; $gb_year = $2; } else { if ($bibdiv =~ m/ Published ([^<]*)/) { $gb_year = $1; }			if ($bibdiv =~ m/q=inpublisher[^>]+>([^<]*)/) { $gb_pub = $1; }		}		if ($bibdiv =~ m/\>ISBN\s*(?:\:\s*)?(\w+)/) { $gb_isbn = $1; }		$citetemplate = ""; } else { print " failed.\n"; } return $citetemplate; }

sub citefromlink_Amazon { my $az_url = shift; my $citetemplate = ''; print " Amazon.com link: "; # First, get the ASIN my $az_ASIN = ''; if ($az_url =~ m/\/(?:dp|product)\/([^\/]*)/) { $az_ASIN = $1; print "$az_ASIN\n"; # Next, plug it into the Amazon API. my $az_api_url = "http://webservices.amazon.com/onca/xml". "?Service=AWSECommerceService". "&AWSAccessKeyId=$az_AccessKey". "&Operation=ItemLookup". "&IdType=ASIN". "&ItemId=$az_ASIN". "&ResponseGroup=Medium"; my $res = $ua->get($az_api_url); my $xml = XMLin( $res->decoded_content ); my $az_binding = $xml->{Items}->{Item}->{ItemAttributes}->{Binding}; if ($az_binding =~ m/^(Hardcover|Paperback|Ring-bound|Kindle Edition|School & Library Binding|Unknown Binding)$/) { # Book my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title}; my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate}; my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher}; $az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" ); my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN}; #my $az_pages = $xml->{Items}->{Item}->{ItemAttributes}->{NumberOfPages}. " pages"; my $az_author = $xml->{Items}->{Item}->{ItemAttributes}->{Author}; $az_author = join(", ", @{ $az_author }) if (ref( $az_author ) eq "ARRAY" ); $citetemplate = "";

} elsif ($az_binding =~ m/^(Audio CD|Audio Cassette|Music Download|Video Game|DVD|Blu-ray|HD DVD|VHS Tape|UMD for PSP)$/) { # Media my $az_title = $xml->{Items}->{Item}->{ItemAttributes}->{Title}; my $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{ReleaseDate}; $az_date = $xml->{Items}->{Item}->{ItemAttributes}->{PublicationDate} unless ($az_date); my $az_pub = $xml->{Items}->{Item}->{ItemAttributes}->{Publisher}; $az_pub = join(", ", @{ $az_pub }) if (ref( $az_pub ) eq "ARRAY" ); my $az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Artist}; $az_artist = $xml->{Items}->{Item}->{ItemAttributes}->{Author} unless ($az_artist); $az_artist = join(", ", @{ $az_artist }) if (ref( $az_artist ) eq "ARRAY" ); my $az_isbn = $xml->{Items}->{Item}->{ItemAttributes}->{ISBN};

$citetemplate = ""; }	} else { print " couldn't find ASIN.\n"; } return $citetemplate; }

sub citefromlink_TimeMagazine { my $tm_url = shift; my $citetemplate = ''; print " Time Magazine link.\n"; my $res = $ua->get($tm_url); if ($res->is_success) { my $html = $res->content; my $tm_title = ''; my $tm_date = ''; my $tm_author = '';

if ($html =~ m/RightslinkPopUp\(\'(.*?)\', \'(.*?)\', \'(.*?)\', \'.*?\'\)\;/) { $tm_title = $1; $tm_date = $2; $tm_author = $3; $tm_title =~ s/\\\'/'/g; $tm_author =~ s/\\\'/'/g; if ($tm_title) { $citetemplate = ""; }		}	}	return $citetemplate; }

sub citefromlink_NewYorkTimes { my $nyt_url = shift; my $citetemplate = ''; print " New York Times link: '$nyt_url'\n"; my $res = $ua->get($nyt_url); if ($res->is_success) { my $html = $res->content; my $nyt_title = ''; my $nyt_date = ''; my $nyt_author = ''; # Title if ($html =~ m/<input\s+type=\"hidden\"\s+name=\"title\"\s+value=\"([^\"]*)\"/s) {			$nyt_title = $1;			$nyt_title =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;			$nyt_title =~ s/\n/ /gs;			$nyt_title =~ s/^ +//;			$nyt_title =~ s/ +$//;		} elsif ($html =~ m/function getShareHeadline\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) {			$nyt_title = $1;			$nyt_title =~ s/\n/ /gs;			$nyt_title =~ s/^\s+//;			$nyt_title =~ s/\s+$//;			$nyt_title =~ s/\\\'/'/g;		} elsif ($html =~ m/(.*?)<\/NYT_HEADLINE>/s) {			$nyt_title = $1;			$nyt_title =~ s/\n/ /gs;			$nyt_title =~ s/^\s+//;			$nyt_title =~ s/\s+$//;		} elsif ($html =~ m//s) {			$nyt_title = $1;			$nyt_title =~ s/\n/ /gs;			$nyt_title =~ s/^\s+//;			$nyt_title =~ s/\s+$//;		} elsif ($html =~ m/ (.*?)<\/h3>/s) {			$nyt_title = $1;			$nyt_title =~ s/\n/ /gs; $nyt_title =~ s/^\s+//; $nyt_title =~ s/\s+$//; }		$nyt_title =~ s/\<\/?..?\>//g; # Author if ($html =~ m//s) {			$nyt_author = $1;			$nyt_author =~ s/^\s+//;			$nyt_author =~ s/\s+$//;		}		# Date		if ($html =~ m//s) { $nyt_date = $1; } elsif ($html =~ m/Published\: (.*?)<\/div>/s) { $nyt_date = $1; } elsif ($html =~ m/function getSharePubdate\(\) \{\s*return encodeURIComponent\(\'(.*?)\'\)\;/s) { $nyt_date = $1; $nyt_date =~ s/^\s+//; $nyt_date =~ s/\s+$//; }							if ($nyt_title) { $citetemplate = ""; } else { print " not readable\n"; } }	else { print " not is success\n"; } return $citetemplate; }

sub templatefrom_IMDB { my $imdb_url = shift; my $citetemplate = ''; if ($imdb_url =~ m/imdb\.com\/(title|name|company|character)\/(tt|nm|co|ch)(\d+)/) { my $imdbtype = $1; my $imdbtypeabbr = $2; my $imdbnum = $3; print " IMDB link to $imdbtype $imdbnum\n"; my $res = $ua->get("http://www.imdb.com/$imdbtype/$imdbtypeabbr$imdbnum/"); if ($res->is_success) { my $html = $res->content; if ($html =~ m/<\s*title\s*>\s*([^\n<]*)<\s*\/\s*title\s*>/si) { my $title = $1; if ($title =~ m/Page\)? not found/i) {					print " not found on imdb:" . $res->status_line . ".\n";				} else {						$title =~ tr/\[\]//;					print "  changing to \n";					$citetemplate = "";				}			} else {				print "  no title.\n";			}		} else {			print "  not found on imdb. " . $res->status_line . "\n";		}	}	return $citetemplate; }

sub templatefrom_Myspace { my $ms_username = shift; my $citetemplate = ''; print " MySpace link: $ms_username\n"; my $res = $ua->get("http://www.myspace.com/$ms_username"); if ($res->is_success) { print " success.\n"; my $html = $res->content; if ($html =~ m/Invalid Friend ID/) { $citetemplate = ""; } elsif ($html =~ m/(.*?)<\/span>/) { my $ms_showname = $1; $citetemplate = ""; }	} else { print " fail: ". $res->status_line. "\n"; } return $citetemplate; }

sub templatefrom_PG { my $pg_id = shift; my $citetemplate = ''; print " Gutenberg: $pg_id\n"; my $res = $ua->get("http://www.gutenberg.org/etext/$pg_id"); if ($res->is_success) { print " success.\n"; my $html = $res->content; if ($html =~ m/Error<\/h2>/) { $citetemplate = "http://www.gutenberg.org/etext/$pg_id "; } elsif ($html =~ m/.*? ([^<]*)/s) { my $pg_title = $1; $citetemplate = ""; } else {print " no title\n";} } else { print " fail: ". $res->status_line. "\n"; } return $citetemplate; }

sub templatefrom_YouTube { my $yt_id = shift; my $citetemplate = ''; print " Youtube link: $yt_id\n"; my $res = $ua->get("http://www.youtube.com/watch?v=$yt_id"); if ($res->is_success) { print " success.\n"; my $html = $res->content; if ($html =~ m/The URL contained a malformed video ID/) { $citetemplate = "http://www.youtube.com/watch?v=$yt_id "; } elsif ($html =~ m//) {			my $yt_title = $1;			$citetemplate = "";		}	} else { print " fail: " . $res->status_line . "\n"; }	return $citetemplate; }

sub templatefrom_CongBio { my $cb_id = shift; my $citetemplate = ''; print " CongBio link: $cb_id\n"; my $res = $ua->get("http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id"); if ($res->is_success) { my $html = $res->content; if ($html =~ m/File\: $cb_id does not exist\./) { $citetemplate = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=$cb_id "; } elsif ($html =~ m/([^<]*)get($usn_url); if ($res->is_success) { my $html = $res->content; my $usn_title = ''; my $usn_date = ''; my $usn_author = '';

if ($html =~ m/ \s*(.*?)\s*<\/h1>\s* \s*(.*?).*?<\/h2>/s) { $usn_title = "$1: $2"; } elsif ($html =~ m/ \s*(.*?)\s*<\/h1>/s) { $usn_title = $1; }		if ($html =~ m/<div id=\"byline\">By\s*(?:<a href.*?>)?\s*(.*?)<\//s) { $usn_author = $1; }		if ($html =~ m/<div id=\"dateline\">Posted (.*?)<\/div>/s) { $usn_date = $1; }		if ($usn_title) { $citetemplate = ""; }	}	return $citetemplate; }

sub citefromlink_Forbes { my $fo_url = shift; my $citetemplate = ''; print " Forbes link\n"; my $res = $ua->get($fo_url); if ($res->is_success) { my $html = $res->content; my $fo_title = ''; my $fo_date = ''; my $fo_author = '';

if ($html =~ m/<span class=\"mainarttitle\">\s*(.*?)\s*<\/span>/s) { $fo_title = $1; $fo_title =~ s/<\/?b>//gi; }		if ($html =~ m/<span class=\"mainartauthor\">\s*(.*?)\s*<\/?span>/s) { $fo_author = $1; } elsif ($html =~ m/<span class=\"mainarttitle\">.*?<\/span> (.*?)\s*<span/s) { $fo_author = $1; }		if ($html =~ m/<span class=\"mainartdate\">\s*(\d\d)\.(\d\d)\.(\d\d)/) { my $temp_month = $1; my $temp_day = $2; my $temp_year = $3; $fo_date = "20$temp_year-$temp_month-$temp_day"; }		if ($fo_title) { $citetemplate = ""; }	}	return $citetemplate; } sub citefromlink_BBC { my $bbc_url = shift; my $citetemplate = ''; print " BBC news link: $bbc_url\n"; my $res = $ua->get($bbc_url); if ($res->is_success) { print " success.\n"; my $html = $res->content; my $bbc_title = ''; my $bbc_date = '';

if ($html =~ m/<meta name=\"Headline\" content=\"([^\"]*)\"\s*\/?>/si) {			$bbc_title = $1;			$bbc_title =~ s/^\s+//;			$bbc_title =~ s/\s+$//;						if ($html =~ m/<meta name=\"OriginalPublicationDate\" content=\"(\d\d\d\d)\/(\d\d)\/(\d\d)/si) { my $temp_year = $1; my $temp_month = $2; my $temp_day = $3; $bbc_date = "$temp_year-$temp_month-$temp_day"; }			$citetemplate = ""; }	}	print " done.\n"; return $citetemplate; }

sub process_link { my $full_link = shift; my $link_type = shift; $full_link =~ m/(https?\:\/\/[^\s\]\<]*)/s; my $urlonly = $1; my $citetemplate = ''; if ($urlonly =~ m/http\:\/\/books\.google\.com\/books/) { # Google Books $citetemplate = citefromlink_GoogleBooks($urlonly); }	elsif ($urlonly =~ m/http\:\/\/.*amazon\.com\//) { # Amazon.com $citetemplate = citefromlink_Amazon($urlonly); }	elsif ($urlonly =~ m/http\:\/\/www\.time\.com\//) { # Time Magazine $citetemplate = citefromlink_TimeMagazine($urlonly); }	elsif ($urlonly =~ m/https?\:\/\/.*?nytimes\.com\//) { # New York Times $citetemplate = citefromlink_NewYorkTimes($urlonly); } elsif ($urlonly =~ m/http:\/\/.*\.usnews\.com\//) { # US News and World Report $citetemplate = citefromlink_USNews($urlonly); } elsif ($urlonly =~ m/http:\/\/.*\.forbes\.com\//) { # Forbes $citetemplate = citefromlink_Forbes($urlonly); } elsif ($urlonly =~ m/http:\/\/news\.bbc\.co\.uk\//) { # BBC News $citetemplate = citefromlink_BBC($urlonly); } elsif ($urlonly =~ m/http:\/\/www\.imdb\.com\//) { # IMDB $citetemplate = templatefrom_IMDB($urlonly); } elsif ($urlonly =~ m/http:\/\/www\.myspace\.com\/([^\s\< \]]*)/) { # MySpace my $ms_title = $1; $citetemplate = templatefrom_Myspace($ms_title); } elsif ($urlonly =~ m/http:\/\/www\.youtube\.com\/watch\?v\=([^\s\< \]]*)/) { # YouTube my $yt_id = $1; $citetemplate = templatefrom_YouTube($yt_id); } elsif ($urlonly =~ m/http:\/\/bioguide\.congress\.gov\/scripts\/biodisplay.pl\?index\=([^\s\< \]]*)/) { # Congbio my $cb_id = $1; $citetemplate = templatefrom_CongBio($cb_id); } elsif ($urlonly =~ m/http:\/\/www\.gutenberg\.org\/(?:etext|ebooks|files)\/(\d+)/) { # Project Gutenberg my $pg_id = $1; $citetemplate = templatefrom_PG($pg_id); } else { # check for DOI, and add title if none already $citetemplate = check_DOI($urlonly, $link_type); }	if ($citetemplate) { if ($full_link =~ s/\[\Q$urlonly\E[^\]]*\]/$citetemplate/s) { # do nothing } else { $full_link =~ s/\Q$urlonly\E/$citetemplate/s; }	}	return $full_link; }

sub check_DOI { my $url = shift; my $linktype = shift; my $citetemplate = ''; return $citetemplate unless $linktype eq 'bare';

print " Looking up $url\n"; my $res = $ua->get("$url"); unless ($res->content_type eq 'text/html') { print " not html. Skipping.\n"; } else { # It's html. unless ($res->is_success) { print " no connection (probably 404). Skipping.\n"; } else { # It's connected. my $html = $res->content; # Here's where I should check for a DOI, and only check for a title if $linktype eq 'bare' if ($html =~ m/(10\.\d{4}(\/|\%2F)([^\s\"\?\&\>]|\&l?g?t\;|\<[^\s\"\?\&]*\>)*)(?=[\s\"\?]|\<\/)/) {			 # It's got a DOI! Eureka.			  my $DOI = $1;			  # strip trailing flotsam			  $DOI =~ s/(\<\/?\w+\/?\>|[\:\;\)\.\'\,\-\#])+$//; 			  $DOI =~ s/\<.*//;			  # Now run the DOI through crossref.org:			  my $crossref_url = "http://www.crossref.org/openurl/?pid=$crossref_creds&id=doi:$DOI&noredirect=true";				my $res2 = $ua->get($crossref_url);			  my $xml = XMLin( $res2->decoded_content );			  my $j_article_title = $xml->{query_result}->{body}->{query}->{article_title};			  if ($j_article_title) {				  my $j_journal_title = $xml->{query_result}->{body}->{query}->{journal_title};				  my $j_volume = $xml->{query_result}->{body}->{query}->{volume};				  my $j_issue = $xml->{query_result}->{body}->{query}->{issue};				  my $j_pages = $xml->{query_result}->{body}->{query}->{first_page};				  my $j_year = $xml->{query_result}->{body}->{query}->{year}; my $j_format = $xml->{query_result}->{body}->{query}->{publication_type}; $j_format =~ tr/_/ /; my $j_last_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{given_name}; my $j_first_name = $xml->{query_result}->{body}->{query}->{contributors}->{contributor}->{surname}; $citetemplate = ""; }		 }		  unless ($citetemplate) { # DOI checking. if ($linktype eq 'bare') { # Look for a title print " Looking for a title.\n"; if ($html =~ m/<\s*title\s*>\s*([^\n<]*)\s*<\s*\/\s*title\s*>/si) { my $title = $1; $title =~ tr/[]{}//; $title =~ s/\s/ /g; while ($title =~ s/ / /g) {}; $title =~ s/ $//; $title =~ s/^ //; $title =~ s/<script[^>]*>.*?<\/script>|<style[^>]*>.*?<\/style>||<!\[CDATA\[.*?\]\]>//gi; if (length($title) > 175) { $title =~ s/(.{175}).*/$1.../; }					 $title =~ s/(.*)/\u$1/; if ($title !~ m/$blacklist/i) { # Title not blacklisted print " Title: $title\n"; my $baseurl = $url; $baseurl =~ s/.*https?:\/\/([^\/\s\<]*).*/$1/; $baseurl =~ s/.*\.(blogspot\.com|livejournal\.com|blogger\.com)/$1/; $citetemplate = "[QQQ$url $title] at $baseurl"; } else { print " black-listed title. Skipping.\n"; } } else { print " no title. Skipping.\n"; } }		 }	  }  }  return $citetemplate; }