User:Polbot/source/Polbot.pm

package Polbot;

use strict; use LWP::UserAgent;


 * 1) Here is an example for this sub's usage:
 * 2) my $url1 = 'http://bioguide.congress.gov/scripts/biodisplay.pl?index=H000671';
 * 3) print Polbot::bio2wiki($url1);
 * 1) print Polbot::bio2wiki($url1);

sub bio2wiki { my $url = shift;

# Constants my $pronoun = 'He'; #Unfortunately, there is no way to tell if the person is male or female from the bioguide. I hate assuming male here, but what can you do? my $preps = 'in|near|to|at|of'; my $months = 'January|February|March|April|May|June|July|August|September|October|November|December'; my $states = 'Alaska|Alabama|Arkansas|Arizona|California|Colorado|Connecticut|Deleware|Florida|Georgia|Hawaii|Idaho|Illinois|Indiana|Iowa|Kansas|Kentucky|Louisiana|Maine|Maryland|Massachusetts|Michigan|Minnesota|Mississippi|Montana|Missouri|Nebraska|Nevada|New Hampshire|New Jersey|New Mexico|New York|North Carolina|North Dakota|Ohio|Oklahoma|Oregon|Pennsylvania|Rhode Island|South Carolina|South Dakota|Tennessee|Texas|Utah|Vermont|Virginia|Washington|West Virginia|Wisconsin|Wyoming|Ireland|France|England|Scotland|Wales|Holland|Spain|Germany'; my $He_list = 'attended|became|commenced|completed|continued|declined|did|died|engaged|entered|established|graduated|is|journeyed|left|lived|lives|moved|owned|owns|participated|pursued|received|remained|remains|represented|represents|resigned|resumed|retired|returned|served|settled|signed|studied|successfully|taught|unsuccessfully|was|went|worked|works'; my $Hewas_list = 'a|an|admitted|affiliated|appointed|assigned|author|discharged|editor|educated|employed|engaged|entombed|impeached|interred|interested|not|one|owner|promoted|publisher|reelected|re-elected|reinterred'; my $Servedas_list = 'Court|Democratic|Republican|adjutant|aide|assistant|associate|businessman|businesswoman|captain|chair|chairman|clerk|collector|colonel|commissioner|defense|delegate|director|district|general|governor|inspector|judge|justice|lieutenant|magistrate|master|mayor|[mM]ember|naval|overseer|president|presidential|proprietor|prosecuting|solicitor|special|staff|vice|war'; # Connect to the URL my $ua = new LWP::UserAgent; $ua->agent("Mozilla/6.0"); my $req = new HTTP::Request GET => $url; my $res = $ua->request($req); $res->is_success or die "Could not get content"; # Get the content my $content = $res->content; $content =~ s/^.*([^<]*), ?<\/FONT>([^<]*)<\/(TD|P)>.*$/$2/s; # Just the main text (minus name) my $reversedname = $1; $content =~ s/\n//sg; # as a single line # Parse name $reversedname =~ s/\s+/ /g; $reversedname =~ m/^([^,]*), ([^,]*)(, .*)?$/; my $firstname = $2; my $lastname = $1; my $suffix = $3; #die ">$foundname< =>  >$foundfirstname< >$foundlastname< >$foundsuffix<\n"; $lastname =~ s/(\w+)/\u\L$1/g; $reversedname = "$lastname, $firstname$suffix"; my $fullname = "$firstname $lastname$suffix"; # Do universal search & replaces $content =~ s/\s+/ /g; #take out dbl spaces; $content = unabbreviate_states($content); #expand all state names $content = link_cities_from_pattern($content); $content = link_dates_from_pattern($content); $content = link_colleges_from_pattern($content); $content = replace_recognized_tokens($content); # split into individual lines my @lines = split(/; /, $content); foreach my $line (@lines) { $line =~ s/^ // } #take out leading space (if there) # Set up initial variables my $familyinfo = ''; my $iswas = 'is'; my $initial_description = ''; my $birthdeath = 'unknown birth and death'; my $birth = ''; my $birthyear = ''; my $death = ''; my $deathyear = ''; my $body = ''; my %cats = ; # for categories like "Senator from Kentucky" # line 1. First off, does it start with " (son of . . .), " or something similar? # e.g. brother of John Fitzgerald Kennedy and Robert Francis Kennedy, grandson of John Francis Fitzgerald my $line = shift(@lines); if ($line =~ m/^\(([^)]*)\)/) {		$familyinfo = $1;		$line =~ s/^\([^)]*\), (.*)$/$1/; $familyinfo =~ s/of ([^,]*),/of $1,/g; $familyinfo =~ s/of ([^,]*)$/of $1/g; $familyinfo =~ s/([^],]) and /$1]] and [[/g; }	# Now, make line1 into the initial description, and pick categories. $initial_description = $line; while ($initial_description =~ m/(a Senator and a Representative|a Representative and a Senator) from ($states)/g) { #senator and rep $cats{""} = $2; $cats{""} = $2; }	while ($initial_description =~ m/Senator from ($states)/g) { $cats{""} = $1; }	while ($initial_description =~ m/Representative from ($states)/g) { $cats{""} = $1; }	$initial_description =~ s/(Territory of )?($states)/$1$2/g; $initial_description =~ s/Senator/U.S. Senator/g; $initial_description =~ s/Representative/U.S. Representative/g; # Next line: look for birth place and date. my $line = shift(@lines);

if ($line =~ m/(born|Born)/) { if ($line =~ m/^(.*), in (\d+)$/) { $birthyear = $2; $birth = $2; $line = $1; } elsif ($line =~ m/^(.*), about (\d+)$/) { $birth = "ca. $2"; $birthyear = $2; $line = $1 } elsif ($line =~ m/^(.*?)(?:,)? (?:on )?(\[\[\w* \d+\]\], \[\[(\d+)\]\])$/) { $birth = $2; $birthyear = $3; $line = $1; } elsif ($line =~ m/^(.*), birth date (unknown)/) { $birth = $2; $line = $1; } else { $birth = 'unknown'; }		if ($line =~ s/^(was |probably )?born/Born/) { $body .= "$line, $lastname"; } elsif ($line eq 'birth date unknown') { $body = $lastname; } else { die "I didn't expect: $line"; }	} else { $birth = 'unknown'; $body = prepend_line($lastname, $lastname, $line); }

# Next line. . .	my $line = shift(@lines); $line = prepend_line('', $lastname, $line); $body .= $line; # Subsequent lines. . .	while ($line = shift(@lines)) { if ($line eq 'birth date unknown') { $birth = 'unknown'; $birthyear = ''; next; } 		if ($line =~ m/^[dD]eath date unknown\.? ?$/) { $death = 'unknown'; $deathyear = ''; $iswas = 'was'; next; } 		$line = prepend_line($pronoun, $lastname, $line); # look for death if ($line =~ m/(died|death(?! of)).*(\d\d\d\d)/) { $deathyear = $2; $death = $deathyear; $iswas = 'was'; #TODO - change this to ignore "death of", check against http://bioguide.congress.gov/scripts/biodisplay.pl?index=A000022 if ($line =~ m/(died|death(?! of)).*(\[\[($months) \d+\]\], \[\[\d\d\d\d\]\])/) { $death = $2; }		}		$body .= $line; }	# Finalize Initial description. if ($birth) { if ($death) { $birthdeath = "$birth - $death"; if ($birthdeath eq 'unknown - unknown') { $birthdeath = 'birth and death dates unknown'; } } else { if ($birth eq 'unknown') { $birthdeath = 'unknown date of birth'; } else { $birthdeath = "born $birth"; }		}	}	my $boilerplate = "<!" . "-- This article was automatically created by User:polbot from $url. The prose may be stilted, and there may be grammatical and Wikification errors. Please improve in any way you see fit. --". ">";	$initial_description = "$boilerplate'''". $fullname. "''' ($birthdeath) $iswas ". $initial_description; if ($familyinfo) { $initial_description .= ", ". $familyinfo; }	# Add ending stuff $url =~ m/^.*=(.*)$/; my $ending_stuff = "==Source==\n\n\n\n"; if ($birthyear) { $cats{""} = 'a'		#$ending_stuff .= "\n"; } else { $cats{""} = 'a'		#$ending_stuff .= "\n"; }	if ($iswas eq 'is') { $cats{""} = 'a'		#$ending_stuff .= "\n"; } elsif ($death =~ m/\d\d\d\d/) { $cats{""} = 'a'		#$ending_stuff .= "\n"; } else { $cats{""} = 'a'		#$ending_stuff .= "\n"; }	$ending_stuff .= join("\n", sort keys %cats);

# Done! $body = "$initial_description.\n\n$body\n$ending_stuff"; return $body; # ===================================================================================================	# ====================  Inner subs   =============================================================== # ===================================================================================================	sub prepend_line {		my $starter = shift; my $lastname = shift; my $line = shift; my $analyzeline = $line; # If the line starts with these, skip them. $analyzeline =~ s/^after the war//; $analyzeline =~ s/^again//; $analyzeline =~ s/^also//; $analyzeline =~ s/^originally//; $analyzeline =~ s/^several times//; $analyzeline =~ s/^soon afterward//; $analyzeline =~ s/^subsequently//; #Get my ($initchar) = ($analyzeline =~ m/(.)/); my ($initword) = ($analyzeline =~ m/(\w+)/); if ($initchar eq '[') { $line = "$starter was in the $line.\n"; } elsif ($initword =~ /^(successful|lawyer|teacher)$/) { $line = "$starter was a $line.\n"; } elsif ($initword eq 'unsuccessful') { $line = "$starter was an $line.\n"; } elsif ($initword eq 'elected') { $line = "\n$lastname was $line.\n"; } elsif ($initword =~ m/^($He_list)$/) { $line = "$starter $line.\n"; } elsif ($initword =~ m/^($Hewas_list)$/) { $line = "$starter was $line.\n"; } elsif ($initword =~ m/^($Servedas_list)$/) { $line = "$starter served as $line.\n"; } elsif ($initword =~ /^(re)?interment$/) { $line =~ s/^(re)?interment/$starter was $1interred/; $line = "$line.\n"; $iswas = 'was'; } else { $line =~ s/^([a-z])/\U$1/; $line = "<!" . "-- A grammar fix may be needed here. --". ">$line.\n"; }		# clean up $line =~ s/(\.? \.|\. )$/./; return $line; }	sub replace_recognized_tokens {		my $content = shift; # links

$content =~ s/Amherst College/Amherst College/g; $content =~ s/Civil War/Civil War/g; $content =~ s/Confederate Army/Confederate States Army/g; $content =~ s/Confederate States of America/Confederate States of America/g; $content =~ s/Constitution of the United States/Constitution of the United States/g; $content =~ s/Democratic National Committee/Democratic National Committee/g; $content =~ s/Democratic Party/Democratic Party/g; $content =~ s/Democratic-Republican Party/Democratic-Republican Party/g; $content =~ s/Democratic Republican Party/Democratic Republican Party/g; $content =~ s/Department of Defense/Department of Defense/g; $content =~ s/Department of War/Department of War/g; $content =~ s/Eton College/Eton College/g; $content =~ s/Federalist Party/Federalist Party/g; $content =~ s/Free-Soil Party/Free-Soil Party/g; $content =~ s/Harvard College/Harvard College/g; $content =~ s/justice of the peace/Justice of the Peace/g; $content =~ s/Opposition Party/Opposition Party/g; $content =~ s/Republican National Committee/Republican National Committee/g; $content =~ s/Revolutionary War/Revolutionary War/g; $content =~ s/Union Army/Union Army/g; $content =~ s/Union College/Union College/g; $content =~ s/United States Air Force/United States Air Force/g; $content =~ s/United States Army Medical Corps/United States Army Medical Corps/g; $content =~ s/United States Army Reserve/United States Army Reserve/g; $content =~ s/United States House of Representatives/United States House of Representatives/g; $content =~ s/United States Marine Corps/United States Marine Corps/g; $content =~ s/United States Marines/United States Marine Corps/g; $content =~ s/United States Navy/United States Navy/g; $content =~ s/United States Representative/United States Representative/g; $content =~ s/United States Senate/United States Senate/g; $content =~ s/United States Senator/United States Senator/g; $content =~ s/United States Supreme Court/United States Supreme Court/g; $content =~ s/United States Treasury Department/United States Treasury Department/g; $content =~ s/(Vice )?President of the United States/$1President of the United States/g; $content =~ s/Washington, D.C./Washington, D.C./g; $content =~ s/William and Mary College/William and Mary College/g; $content =~ s/Yale College/Yale College/g;

$content =~ s/Republican Party/Republican Party/g; $content =~ s/United States Army/United States Army/g; $content =~ s/as a Democrat/as a Democrat/g; $content =~ s/as a Federalist/as a Federalist/g; $content =~ s/as a Republican/as a Republican/g; $content =~ s/as a Whig/as a Whig/g; $content =~ s/($states) (state )?senate/$1 Senate/g; $content =~ s/($states) (state )?house of representatives/$1 House of Representatives/g;

# grammar-related replacements $content =~ s/graduated, /graduated from /g; $content =~ s/lawyer, private/lawyer in private/g; $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g; $content =~ s/\(([^)]*)\;/($1, and/g;		$content =~ s/(member|chairman|chair), /$1 of the /g;		$content =~ s/\&\#146\;/'/g;		$content =~ s/\&\#14[78]\;/"/g;		return $content;	}	sub link_colleges_from_pattern	{		my $content = shift;		# "Something University"		$content =~ s/(([A-Z][a-z]+ (and )?)*[A-Z][a-z]+ (University|Academy))/\[\[$1\]\]/g;		# "University of Something"		$content =~ s/(University of [A-Z][a-z]+( (at )?[A-Z][a-z]+)*)/\[\[$1\]\]/g;		return $content;	}	sub link_dates_from_pattern	{		my $content = shift;		$content =~ s/($months) (\d+), *(\d\d\d\d)/$1 $2, $3/g;		return $content;	}	sub link_cities_from_pattern	{		my $content = shift;		#prep City, State (or prep County, State)		$content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/ $1 $2/g;		#prep City, Something County, State		$content =~ s/ ($preps) ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/ $1 $2, $5/g; #, City, Something County, State $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*),( [A-Z][a-z]*)* County, (($states))/, $1, $4/g; #, Something, State $content =~ s/, ([A-Z][a-z]*( [A-Z][a-z]*)*, ($states))/, $1/g; return $content; }	sub unabbreviate_states {		my $content = shift; $content =~ s/Ala\./Alabama/g; $content =~ s/Ariz\./Arizona/g; $content =~ s/Ark\./Arkansas/g; $content =~ s/Calif\./California/g; $content =~ s/Colo\./Colorado/g; $content =~ s/Conn\./Connecticut/g; $content =~ s/Del\./Delaware/g; $content =~ s/Fla\./Florida/g; $content =~ s/Ga\./Georgia/g; $content =~ s/Ill\./Illinois/g; $content =~ s/Ind\./Indiana/g; $content =~ s/Kans\./Kansas/g; $content =~ s/Ky\./Kentucky/g; $content =~ s/La\./Louisiana/g; $content =~ s/Md\./Maryland/g; $content =~ s/Mass\./Massachusetts/g; $content =~ s/Mich\./Michigan/g; $content =~ s/Minn\./Minnesota/g; $content =~ s/Miss\./Mississippi/g; $content =~ s/Mo\./Missouri/g; $content =~ s/Mont\./Montana/g; $content =~ s/Nebr\./Nebraska/g; $content =~ s/Nev\./Nevada/g; $content =~ s/N\.H\./New Hampshire/g; $content =~ s/N\.J\./New Jersey/g; $content =~ s/N\.M\./New Mexico/g; $content =~ s/N\.Y\./New York/g; $content =~ s/N\.C\./North Carolina/g; $content =~ s/N\.D\./North Dakota/g; $content =~ s/Okla\./Oklahoma/g; $content =~ s/Ore\./Oregon/g; $content =~ s/Pa\./Pennsylvania/g; $content =~ s/R\.I\./Rhode Island/g; $content =~ s/S\.C\./South Carolina/g; $content =~ s/S\.D\./South Dakota/g; $content =~ s/Tenn\./Tennessee/g; $content =~ s/Tex\./Texas/g; $content =~ s/Vt\./Vermont/g; $content =~ s/Va\./Virginia/g; $content =~ s/Wash\./Washington/g; $content =~ s/W\.Va\./West Virginia/g; $content =~ s/Wis\./Wisconsin/g; $content =~ s/Wyo\./Wyoming/g; return $content; } }


 * 1) Here is an example for this sub's usage:
 * 2) $URL = Polbot::Get_URL_from_name("Mitch McConnell");

sub Get_URL_from_name {	my $article_name = shift; my @URLs = ; my $ErrMsg; my $fname; my $lname;

$article_name =~ s/ \(.*\)//g; # Take out anything parenthesized. if ($article_name =~ m/^(.*) ([^ ]*)(, Jr.|, Sr.| II| III)$/) { $fname = $1. $3;		$lname = $2; } elsif ($article_name =~ m/^(.*) ([^ ]*)$/) { $fname = $1; $lname = $2; } else { return "Malformed article name '$article_name'"; }	@URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { return "Multiple hits for '$lname, $fname'."; }

$ErrMsg = "No hits for '$lname, $fname'.";

# Take off the suffix if ($fname =~ s/(, Jr\.|, Sr\.| II| III)$//) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; }		$ErrMsg .= " No hits for '$lname, $fname'."; }	# Try like "C. Everett Coop" if ($fname =~ s/^.\. //) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; }		$ErrMsg .= " No hits for '$lname, $fname'."; }	# Try like "John Q. Adams" if ($fname =~ s/\..*$//) { @URLs = Get_matching_URLs($fname, $lname); my $nummatches = scalar(@URLs); if ($nummatches eq 1) { return $URLs[0]; } elsif ($nummatches > 1) { $ErrMsg .= " Multiple hits for '$lname, $fname'."; return $ErrMsg; }		$ErrMsg .= " No hits for '$lname, $fname'."; }	return $ErrMsg; } sub Get_matching_URLs {	my $firstname = shift; my $lastname = shift; my $url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'; my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/6.0"); my @links = ; my $res = $ua->post($url, ['lastname' => $lastname, 'firstname' => $firstname]); if ($res->is_success) { my $content = $res->content; @links = ($content =~ m/ /g);	} else {				print "could not connect, lastname = $lastname, firstname=$firstname"	}

return @links; }

sub fix_dates { my $txt = shift; # century without AD,BC etc $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\]\]/$1 century/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\]\]/$1 centuries/gi; # century with AD,BC etc $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]century\s(AD|BC|CE|BCE)\]\]/$1 century $2/gi; $txt =~ s/\[\[\d{1,2}(?:st|nd|rd|th)[ \-]century\|(\d{1,2}(?:st|nd|rd|th))[ \-]centuries\s(AD|BC|CE|BCE)\]\]/$1 centuries $2/gi; $txt =~ s/(\d(?:st|nd|rd|th))[ \-]Century/$1 century/gi;

# piped decades and years $txt =~ s/\[\[(\d{1,4}\'?s)\]\]/$1/gi; $txt =~ s/\[\[(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4})\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s? (?:AD|BC|CE|BCE)\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s?\|(\d{1,4}s? (?:AD|BC|CE|BCE))\]\]/$1/gi; $txt =~ s/\[\[\d{1,4}s?\|(\d{1,2}s?)\]\]/$1/gi;

# months $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December)\]\]/$1/gi; $txt =~ s/\[\[January\|(Jan)\]\]/$1/gi; $txt =~ s/\[\[February\|(Feb)\]\]/$1/gi; $txt =~ s/\[\[March\|(Mar)\]\]/$1/gi; $txt =~ s/\[\[April\|(Apr)\]\]/$1/gi; $txt =~ s/\[\[May\|(May)\]\]/$1/gi; $txt =~ s/\[\[June\|(Jun)\]\]/$1/gi; $txt =~ s/\[\[July\|(Jul)\]\]/$1/gi; $txt =~ s/\[\[August\|(Aug)\]\]/$1/gi; $txt =~ s/\[\[September\|(Sep)\]\]/$1/gi; $txt =~ s/\[\[October\|(Oct)\]\]/$1/gi; $txt =~ s/\[\[November\|(Nov)\]\]/$1/gi; $txt =~ s/\[\[December\|(Dec)\]\]/$1/gi;

#month+year $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d{3,4})\]\]/$1/gi;

#Month+day_number "March 7th" -> "March 7" $txt =~ s/\[\[(January|February|March|April|May|June|July|August|September|October|November|December) (\d?\d)(?:th|st|nd|rd)\]\]/\[\[$1 $2\]\]/gi; $txt =~ s/\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](?:th|st|nd|rd)/\[\[$1\]\]/gi; $txt =~ s/\[\[(\d?\d)(?:th|st|nd|rd) (January|February|March|April|May|June|July|August|September|October|November|December)\]\]/\[\[$1 $2\]\]/gi;

#Month+day_number piped into number. Preferences do not work. They don't work in sequence because digits in the two dates must be adjacent $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash or mdash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d?\d)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1$2$3$4/gi;

$txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?\-?\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with ndash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\s?&[nm]dash;\s?)\[\[(\d{1,2})\]\]/$1$2$3$4/gi; #same again but with slash instead of hyphen $txt =~ s/([^\[]{4})\[\[(\d?\d) (?:January|February|March|April|May|June|July|August|September|October|November|December)\]\](\/)\[\[(\d{1,2})\]\]/$1$2$3$4/gi;

$txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2})\]\]/$1/gi; $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2})\]\]/$1/gi;

$txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2})\]\]/$1/gi;

# solitary day_numbers $txt =~ s/\[\[(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $txt =~ s/\[\[\d{1,2} (?:January|February|March|April|May|June|July|August|September|October|November|December)\|(\d{1,2}(?:th|st|nd|rd))\]\]/$1/gi; $txt =~ s/\[\[(\d{1,2}(?:st|nd|rd|th))\]\]/$1/gi;

# days of the week in full. Optional plurals $txt =~ s/\[\[(Mondays?|Tuesdays?|Wednesdays?|Thursdays?|Fridays?|Saturdays?|Sundays?)\]\]/$1/gi; # days of the week abbreviated. Leave out 'Sun' as potentially valid link to the Sun. Leave out 'SAT' in upper case as potential link to 'Scholastic achievement/aptitude test'. $txt =~ s/\[\[(Mon|Tue|Tues|Wed|Thu|Thur|Thurs|Fri)\]\]/$1/gi; $txt =~ s/\[\[(Sat)\]\]/$1/g; $txt =~ s/\[\[Mondays?\|(Mondays?)\]\]/$1/gi; $txt =~ s/\[\[Tuesdays?\|(Tuesdays?)\]\]/$1/gi; $txt =~ s/\[\[Wednesdays?\|(Wednesdays?)\]\]/$1/gi; $txt =~ s/\[\[Thursdays?\|(Thursdays?)\]\]/$1/gi; $txt =~ s/\[\[Fridays?\|(Fridays?)\]\]/$1/gi; $txt =~ s/\[\[Saturdays?\|(Saturdays?)\]\]/$1/gi; $txt =~ s/\[\[Sundays?\|(Sundays?)\]\]/$1/gi;

#4 digit years piped into 2 $txt =~ s/\[\[\d{1,4}\|(\d{1,2})\]\]/$1/gi;

#year: examine characters in link on left for date, examine characters in link on right for date $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; #year pair: examine characters in link on left for date, examine characters in link on right for date $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc\s\-]))/$1$2$3$4$5/gi;

#year: examine characters in link on left for date, avoid links on right $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; #year pair: examine characters in link on left for date, avoid links on right $txt =~ s/((?:[^yhletramub\s]..|[^rcianlse\d\s].|[^yhletr\d])\]\]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

#year: check for line-ends, text on left, avoid links on right. Run twice to deal better with lists. $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?|\n)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3/gi; #year pair: check for line-ends, text on left, avoid links on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4}|\n)/$1$2$3$4$5/gi;

#year: avoid links on left, examine characters in link on right for date $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3/gi; #year pair: avoid links on left, examine characters in link on right for date $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[(?:[^jfmasond\d]|.[^aepuco\d\s]|..[^jfmasondbrylgptvc \s\-]))/$1$2$3$4$5/gi;

#year:avoid links on left, text on right $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi; #year pair: avoid links on left, text on right $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

#year:text on left, text on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3/gi; #year pair: avoid links on left, text on right $txt =~ s/([\w\(\);=:.'\*\|\&]\s?,?\-?\s?)\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\](\s?,?\-?\s?[\w\(\);=:.'\*\|\&])/$1$2$3$4$5/gi;

#year:avoid links on left, hyphen but no digits (to avoid ISO date) in link on right. Currently suspended because it isn't fully tested. #$txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](\s?,?\-?\s?\[\[[^\d])/$1$2$3/gi;

#year:avoid links on both sides $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3/gi; #year pair: avoid links on both sides $txt =~ s/([^\]]{4})\[\[(\d{1,4})\]\](.?.?.?.?.?.?)\[\[(\d{1,4})\]\]([^\[]{4})/$1$2$3$4$5/gi;

#'present' $txt =~ s/\[\[Present \(time\)\|(Present)\]\]/$1/gi;

#Eliminate 'surprise links' also known as 'easter egg links' $txt =~ s/\[\[\d{1,4}s?\sin\s[^\|]{1,30}\|(\d{1,4}s?)\]\]/$1/gi;

return $txt; }

sub replace_unlinked_tokens {	my $content = shift; # links

$content =~ s/([^[|:])Amherst College/$1\[\[Amherst College\]\]/; $content =~ s/([^[|:])Confederate Army/$1\[\[Confederate States Army\]\]/; $content =~ s/([^[|:])Constitution of the United States/$1\[\[United States Constitution|Constitution of the United States\]\]/; $content =~ s/([^[|:])Democratic National Committee/$1\[\[Democratic National Committee\]\]/; $content =~ s/([^[|:])Democratic-Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic-Republican Party\]\]/; $content =~ s/([^[|:])Democratic Republican Party/$1\[\[Democratic-Republican Party (United States)|Democratic Republican Party\]\]/; $content =~ s/Department of Defense([^]|])/\[\[United States Department of Defense|Department of Defense\]\]$1/; $content =~ s/Department of War([^]|])/\[\[United States Department of War|Department of War\]\]$1/; $content =~ s/([^[|:])Eton College/$1\[\[Eton College\]\]/; $content =~ s/([^[|:])Free-Soil Party/$1\[\[Free Soil Party|Free-Soil Party\]\]/; $content =~ s/([^[|:])Harvard College/$1\[\[Harvard College\]\]/; $content =~ s/([^[|:])Republican National Committee/$1\[\[Republican National Committee\]\]/; $content =~ s/([^[|:])Union Army/$1\[\[Union Army\]\]/; $content =~ s/([^[|:])Union College/$1\[\[Union College\]\]/; $content =~ s/([^[|:])United States Army Medical Corps/$1\[\[Army Medical Department (United States)|United States Army Medical Corps\]\]/; $content =~ s/([^[|:])United States Army Reserve/$1\[\[United States Army Reserve\]\]/; $content =~ s/([^[|:])United States Treasury Department/$1\[\[United States Treasury Department\]\]/; $content =~ s/([^[|:])Washington, D\.C\./$1\[\[Washington, D.C.\]\]/; $content =~ s/([^[|:])William and Mary College/$1\[\[William and Mary College\]\]/; $content =~ s/([^[|:])Yale College/$1\[\[Yale College\]\]/;

$content =~ s/as a Democrat/as a \[\[Democratic Party (United States)|Democrat\]\]/; $content =~ s/as a Federalist/as a \[\[Federalist Party (United States)|Federalist\]\]/; $content =~ s/as a Republican/as a \[\[Republican Party (United States)|Republican\]\]/; $content =~ s/as a Whig/as a \[\[Whig Party (United States)|Whig\]\]/; # grammar-related replacements $content =~ s/graduated, /graduated from /g; $content =~ s/lawyer, private/lawyer in private/g; $content =~ s/, (\d\d\d\d) ?- ?(\d\d\d\d)/ from $1 to $2/g; $content =~ s/(member|chairman|chair), /$1 of the /g; $content =~ s/\&\#146\;/'/g; $content =~ s/\&\#14[78]\;/"/g;	return $content; }

1;