// User:Brighterorange/punctuation2.js

/* */
// Please don't use this experimental version of autopunctuation.
// I break it without worrying about potential users.
// The stable version is maintained at user:brighterorange/punctuation.js

// ---- script-wide state ----
var punctuationVersion = "19 April 2008 (EXP)";
// last id handed out to an edit chunk (puEditExt increments before use)
var punctuationID = 1;
// chunk list produced by the last run; read by puUndo / puAllHide
var punctuationEdits = undefined;
// edit summary as it was when autopunctuation ran
var punctuationOriginalSummary = undefined;
var punctuationPageOriginalSummary = undefined;
// number of characters of context shown on each side of an edit
var puCONTEXT = 40;
// delay (ms) handed to setTimeout between passes
var puWAIT = 1;

// ---- DOM ids of the elements we create ----
var puWORKSPACEID = 'pu_work';
var puTIMERID = 'pu_timer';

// ---- edit kinds; each value indexes puDESCRIPTIONS ----
var puENDASH = 0;
var puSPELL = 1;
var puEMDASH = 2;
var puCOMMA = 3;
var puPERCENT = 4;
var puBORN = 5;
var puLINKSPACE = 6;
var puDECADE = 7;
var puPAREN = 8;
var puXHTML = 9;
var puREF = 10;
var puSEMICOLON = 11;
var puCITYSTATE = 12;
var puDESCRIPTIONS = [
  "en dash",
  "spelling",
  "em dash",
  "comma",
  "percent",
  "born",
  "link space",
  "decade",
  "paren",
  "xhtml",
  "ref",
  "semicolon",
  "city-state"
];
// number of edit kinds (13)
var puNDESC = puDESCRIPTIONS.length;

// TODO:
// finish percent space
// http link with double brackets [like this]
// fake em dashes - like this - are pretty common
// multiple references in a row can screw up some puREF autofixes
//  mainly punctuation motion across ref
//  perhaps puGetRef should treat the whole sequence as one tag (but also remove interim spaces?)
//  (also we don't do any fixes inside a ref that's identified by puREF,
//    so I often run it twice.)
// identify external links as references in puGetRef? (convert to cite web??)
//  also templates like ((fact))
// allow disabling of a specific 'which' for all edits (implement puAllOn/AllOff)
// when showing changes, need to paint turned-off edits in fade out color, since
//  this currently only happens to the in-dom version, and not when we reshow changes
//  after eg. hide or allon/alloff
// lowercase words in headings that don't appear capitalized in the document anywhere
// false positive in linkspace for image tags.. could find the balanced open brackets
//  and check for image:
// commas out of links, like that or like this, too.
// (sometimes a false positive for URLs, since some editors like to put the comma
//   inside the link (ugly) to prevent it from coming after the external
//   link arrow graphic (uglier))
// (periods too, but many false positives like Monsters, Inc.)
// space before periods, or no space after periods (many false positives: urls, abbreviations, etc.)
// in link space, if there is no space following closing brackets, add one
// en dash false negatives: 500 BC - 400 BC, vii-xi
// allegedly, however I'm adding tabs blows away the tab for User:Lightmouse/monobook.js/script.js.

// Writes the number of milliseconds elapsed since `start` (a Date) into
// the timer element (id puTIMERID).
function puReportTime(start) {
  var te = document.getElementById(puTIMERID);
  // and measure the total time to do so
  te.innerHTML = '' + (0 + (new Date() - start)) + ' ms.';
}

// Entry point: runs every autopunctuation pass over the edit box.
//
// Creates a timer div and a workspace div under the 'siteSub' element,
// hides the textbox, then runs the passes one at a time. Each pass is
// scheduled with setTimeout(…, puWAIT) after its progress message has
// been written to the workspace. When the last pass finishes, the
// textbox and summary are rewritten, the undo interface is shown and
// the elapsed time is reported.
function doPunctuation() {
  // just need some prominent element to put our messages in.
  // We use the "From Wikipedia" header.
  var ss = document.getElementById('siteSub');
  var timeelt = document.createElement('div');
  timeelt.style.border = '2px solid #000000';
  timeelt.id = puTIMERID;
  var e = document.createElement('div');
  ss.appendChild(timeelt);
  ss.appendChild(e);
  e.id = puWORKSPACEID;
  e.innerHTML = ' Running autopunctuation... ';
  var start = new Date();

  puDisableEditing(true);

  // We'll represent the document as a list of chunks, where
  // a chunk can either be raw text (no replacement suggested)
  // or an edit (the suggested replacement text, the reason,
  // the original text, and a flag indicating whether the
  // change has been rejected).
  // start by producing the singleton raw chunk:
  var edits = puCons(puRaw(document.editform.wpTextbox1.value), undefined);

  // [progress message, pass], in execution order. The order matters:
  // later passes only look at the raw chunks earlier passes left over.
  var passes = [
    [' References... ',  function (l) { return puRawMapConcat(puRef, l); }],
    [' Spelling... ',    function (l) { return puSpell(l); }],
    [' Born style... ',  function (l) { return puBorn(l); }],
    [' Em dashes... ',   function (l) { return puRawMapConcat(puEmDash, l); }],
    [' En dashes... ',   function (l) { return puRawMapConcat(puEnDash, l); }],
    [' Commas... ',      function (l) { return puRawMapConcat(puComma, l); }],
    [' Semicolons... ',  function (l) { return puRawMapConcat(puSemicolon, l); }],
    [' Link space... ',  function (l) { return puRawMapConcat(puLinkSpace, l); }],
    [' Decade... ',      function (l) { return puRawMapConcat(puDecade, l); }],
    [' Parens... ',      function (l) { return puRawMapConcat(puParen, l); }],
    [' XHTML... ',       function (l) { return puXhtml(l); }],
    [' City-State... ',  function (l) { return puCityState(l); }]
  ];
  var next = 0;

  function finish() {
    punctuationEdits = edits;
    punctuationOriginalSummary = document.editform.wpSummary.value;
    document.editform.wpTextbox1.value = puRewrite(edits);
    document.editform.wpSummary.value = puSummary(edits);
    // finally, show interface for undos
    puShowChanges("", edits);

    puReportTime(start);
  }

  function runPass() {
    try {
      edits = passes[next][1](edits);
      next++;
      if (next < passes.length) {
        // announce the next pass, then yield so the message can be seen
        e.innerHTML = passes[next][0];
        setTimeout(runPass, puWAIT);
      } else {
        finish();
      }
    } catch (err) {
      // never strand the user with a hidden textbox if a pass blows up
      puDisableEditing(false);
      e.innerHTML = ' Autopunctuation failed. ';
      throw err;
    }
  }

  e.innerHTML = passes[0][0];
  setTimeout(runPass, puWAIT);
}

// Hides (flag true) or shows (flag false) the edit textbox.
//
// don't use textbox's "disable" field, since
// it makes the form submit an empty textbox,
// blanking the article!
function puDisableEditing(flag) {
  var e = document.editform.wpTextbox1;
  e.style.display = flag ? 'none' : 'block';
}

// Builds the edit summary for a chunk list.
//
// Counts the edit (non-raw) chunks per kind and appends
// "(auto: N kind; M kind…)" to punctuationOriginalSummary. With no edits
// the original summary is returned untouched.
// Side effect: when there are edits and the original summary equals
// punctuationPageOriginalSummary, the "minor edit" box is checked.
function puSummary(edits) {
  var counts = [];
  for (var i = 0; i < puNDESC; i++) counts.push(0);

  for (var l = edits; l != undefined; l = l.tail) {
    if (!l.head.israw) {
      counts[l.head.what]++;
    }
  }

  var s = "";
  for (var j = 0; j < puNDESC; j++) {
    if (counts[j] > 0) {
      if (s != "") s = s + "; ";
      s = s + counts[j] + " " + puDESCRIPTIONS[j];
    }
  }

  if (s == "") return punctuationOriginalSummary;

  if (punctuationOriginalSummary == punctuationPageOriginalSummary) {
    // user never did anything except run punctuation, so minor
    document.editform.wpMinoredit.checked = true;
  }
  return punctuationOriginalSummary +
         (punctuationOriginalSummary == "" ? "" : " ") +
         "(auto: " + s + ")";
}

// Returns HTML with one clickable control per edit kind that occurs in
// the chunk list; clicking a control calls puAllHide(kind).
//
// NOTE(review): the markup this function emitted was lost from the
// source (only an unterminated string literal survived). The counting
// loop is original; the generated HTML is a reconstruction, so the
// wording and styling should be checked against the stable script.
function puKindButtons(edits) {
  var counts = [];
  for (var i = 0; i < puNDESC; i++) counts.push(0);

  for (var l = edits; l != undefined; l = l.tail) {
    if (!l.head.israw) {
      counts[l.head.what]++;
    }
  }

  // now for any edit kind we did do, give buttons for them.
  var s = "";
  for (var k = 0; k < puNDESC; k++) {
    if (counts[k] > 0) {
      s = s + '<span style="cursor:pointer;border:1px solid #888888;' +
              'padding:0 3px;margin:0 2px" onclick="puAllHide(' + k + ')">' +
              'hide ' + puDESCRIPTIONS[k] + ' (' + counts[k] + ')</span> ';
    }
  }
  return s;
}

// Appends `ne` to the running context `ol` and keeps at most the last
// puCONTEXT characters of the result.
function puContextBefore(ol, ne) {
  var joined = ol + ne;
  var excess = joined.length - puCONTEXT;
  return (excess > 0) ? joined.substring(excess) : joined;
}

// Returns up to puCONTEXT characters of the text that the chunk list `l`
// produces: raw chunks contribute their text, edit chunks their
// replacement. Stops walking the list once enough text is gathered.
function puContextAfter(l) {
  var gathered = "";
  var cell = l;
  while (cell != undefined) {
    var chunk = cell.head;
    gathered = gathered + (chunk.israw ? chunk.text : chunk.rep);
    if (gathered.length >= puCONTEXT) {
      return gathered.substr(0, puCONTEXT);
    }
    cell = cell.tail;
  }
  return gathered;
}

// creates the menu for punctuation while in showchanges mode.
// for now just a 'done' button
//
// NOTE(review): the element markup was lost from the source; only the
// label text survived. The wrapper below, including the puDoneClick
// handler, is a reconstruction.
function puMenu() {
  return ('<span style="cursor:pointer;text-decoration:underline" ' +
          'onclick="puDoneClick()">click this when done with changes</span> ');
}

// when clicked, get rid of all the shown changes and re-enable
// the textbox.
function puDoneClick() {
  puDisableEditing(false);
  // not workspace, but parent. Need to kill the timer, too.
  // This empties the whole 'siteSub' element, not only what we added.
  var e = document.getElementById('siteSub');
  e.innerHTML = '';
}

// from a chunk list, give an HTML summary with edit buttons
// pass in the context c of some previous characters.
// The HTML is written into the workspace element (id puWORKSPACEID).
function puShowChanges(c, l) {
  var e = document.getElementById(puWORKSPACEID);
  // XXX actually, if all are deactivated too...
  if (l == undefined) {
    e.innerHTML = ' Punctuation: no changes. ';
  } else {
    e.innerHTML = puShowSomeChanges(c, l);
  }
}

// Renders a non-empty chunk list as HTML: the menu, the per-kind
// buttons, then one entry per chunk, then the menu again.
//   raw chunk    -> " (...) "
//   hidden edit  -> " (hidden) "
//   other edits  -> leading context, a clickable "orig -> replacement"
//                   element, trailing context
// `c` is the running context (text preceding the current chunk).
//
// NOTE(review): the markup of the clickable element was lost from the
// source. puUndo looks the element up as 'puEdit' + id and clears its
// background, cursor and onclick, so the reconstruction gives it exactly
// those; the colour is a guess.
function puShowSomeChanges(c, l) {
  var o = puMenu();
  o = o + puKindButtons(l) + " ";
  while (l != undefined) {
    var nc;
    if (l.head.israw) {
      nc = puContextBefore(c, l.head.text);
      o = o + ' (...) ';
      c = nc;
    } else if (l.head.hidden) {
      nc = puContextBefore(c, l.head.rep);
      o = o + ' (hidden) ';
      c = nc;
    } else {
      // XXX hover could select in edit box??
      nc = puContextBefore(c, l.head.rep);
      var ca = puContextAfter(l.tail);
      // an edit may carry display-only overrides for either side
      var src = (l.head.dispsrc == undefined) ? l.head.orig : l.head.dispsrc;
      var dst = (l.head.dispdst == undefined) ? l.head.rep : l.head.dispdst;
      o = o + ' (' + puHighlightContext(puEscape(c)) +
              '<span id="puEdit' + l.head.id + '"' +
              ' style="background:#FFFF99;cursor:pointer"' +
              ' onclick="puUndo(' + l.head.id + ')">' +
              puHighlight(puEscape(src)) + "&rarr;" + puHighlight(puEscape(dst)) +
              '</span> ' +
              puHighlightContext(puEscape(ca)) +
              ') ';
      c = nc;
    }
    l = l.tail;
  }
  return (o + puMenu());
}

// show spaces as light underscores, since many of these involve the
// deletion/insertion of spaces. The __PUREF__ placeholder is shown as a
// literal <REF>.
//
// NOTE(review): the wrapper markup was lost from the source; only "_"
// and "&lt;REF&gt;" survived. Element and colours are reconstructed.
function puHighlight(s) {
  // first or it will mess up spaces in our html
  s = s.replace(/ /g, '<span style="color:#BBBBBB">_</span>');
  return s.replace(/__PUREF__/g, '<span style="color:#888888">&lt;REF&gt;</span>');
}

// Marks up context text: wiki syntax characters ([ ] { } |) are
// wrapped so they stand out, and ISSN / ISBN / SCOTUS (matched in any
// case, emitted upper-case) are flagged.
//
// NOTE(review): the wrapper markup was lost from the source; only the
// bare characters and words survived. Elements and colours below are
// reconstructed.
function puHighlightContext(s) {
  var syntaxOpen = '<span style="color:#6666CC">';
  var warnOpen = '<span style="color:#CC0000;font-weight:bold">';
  var close = '</span>';

  s = s.replace(/\[/g, syntaxOpen + '[' + close);
  s = s.replace(/\]/g, syntaxOpen + ']' + close);
  s = s.replace(/\{/g, syntaxOpen + '{' + close);
  s = s.replace(/\}/g, syntaxOpen + '}' + close);
  s = s.replace(/\|/g, syntaxOpen + '|' + close);
  // these occur next to false positives for en dashes, commonly
  s = s.replace(/issn/gi, warnOpen + 'ISSN' + close);
  s = s.replace(/isbn/gi, warnOpen + 'ISBN' + close);
  // template requires literal dash
  s = s.replace(/scotus/gi, warnOpen + 'SCOTUS' + close);
  return s;
}

// Escapes text for insertion into HTML element content.
// '&' is escaped first so that entities present in the wikitext (such
// as &nbsp;) show up literally instead of being interpreted.
// NOTE(review): the surviving source only shows the '>' replacement;
// the '<' replacement is inferred and the '&' replacement is new.
function puEscape(s) {
  var s0 = s.replace(/&/g, "&amp;");
  var s1 = s0.replace(/</g, "&lt;");
  var s2 = s1.replace(/>/g, "&gt;");
  return s2;
}

// called from generated html; hides (just don't display) all
// from this kind
function puAllHide(k) {
  for (var h = punctuationEdits; h != undefined; h = h.tail) {
    if (h.head.what == k) {
      h.head.hidden = true;
    }
  }

  // always keep these up to date (actually this should never need a rewrite, right?)
  // document.editform.wpTextbox1.value = puRewrite(punctuationEdits);
  document.editform.wpSummary.value = puSummary(punctuationEdits);
  puShowChanges("", punctuationEdits);

  return;
}

// called from generated html above. undoes the specified edit, making
// the chunk into a raw chunk and rewriting the textarea.
// `i` is the id of the edit chunk; the matching element has the DOM id
// 'puEdit' + i. Alerts if no chunk carries that id.
function puUndo(i) {
  var start = new Date();
  for (var h = punctuationEdits; h != undefined; h = h.tail) {
    if (h.head.id == i) {
      h.head.text = h.head.orig;
      h.head.israw = true;
      // undo edit where it matters
      document.editform.wpTextbox1.value = puRewrite(punctuationEdits);
      document.editform.wpSummary.value = puSummary(punctuationEdits);
      var e = document.getElementById('puEdit' + i);
      e.style.background = '';
      // because clicking again would do nothing. XXX, we should be
      // able to reenable by clicking again!
      e.style.cursor = '';
      e.onclick = undefined;
      // e.style.opacity = "0.5";
      // e.style.filter = "Alpha(Opacity=50)";
      puReportTime(start);
      return;
    }
  }
  alert("Oops, can't undo? " + i + " ... " + punctuationEdits);
}

// generate the raw text from a chunk list
// Raw chunks contribute their text, edit chunks their replacement.
// A chunk that lacks the expected field contributes "???" so the
// problem is visible in the output instead of being dropped silently.
function puRewrite(l) {
  var o = "";
  while (l != undefined) {
    if (l.head.israw && l.head.text != undefined) {
      o = o + l.head.text;
    } else if (!l.head.israw && l.head.rep != undefined) {
      o = o + l.head.rep;
    } else {
      o = o + "???";
    }
    l = l.tail;
  }
  return o;
}

// given a function (f : string -> chunk list) and (l : chunk list)
// build a new list where each raw chunk within l has f applied to
// it and the result flattened. edit chunks are not modified.
// modifies the result of f(...).
//
// Implemented as a loop rather than recursion so that the call stack
// does not grow with the number of chunks. The cells returned by f are
// reused and linked in place; edit chunks get a fresh cell each.
function puRawMapConcat(f, l) {
  var first = undefined;  // first cell of the result
  var last = undefined;   // final cell of the result built so far

  for (var cur = l; cur != undefined; cur = cur.tail) {
    var piece;
    if (cur.head.israw) {
      piece = f(cur.head.text);
      // empty
      if (piece == undefined) continue;
    } else {
      piece = { head: cur.head, tail: undefined };
    }

    if (last == undefined) first = piece;
    else last.tail = piece;

    // make last point at the final cell of what was just linked in
    last = piece;
    while (last.tail != undefined) last = last.tail;
  }
  return first;
}

// XXX obsolete
// Returns the concatenation of chunk lists l1 and l2. The cells of l1
// are rebuilt through puCons; l2 is shared, not copied.
function puAppend(l1, l2) {
  if (l1 == undefined) return l2;
  else return puCons(l1.head, puAppend(l1.tail, l2));
}

// lists are represented as head/tail cons cells
// with nil = undefined
//
// puCons(h, t) puts chunk h in front of list t. When h and the first
// chunk of t are both raw, they are merged into one raw chunk, so a
// list built only with puCons never has two raw chunks in a row at the
// join. Neither h nor t is modified.
function puCons(h, t) {
  // if they are both raw, then flatten.
  if (t != undefined && t.head.israw && h.israw) {
    var merged = { israw: true, text: h.text + t.head.text };
    return { head: merged, tail: t.tail };
  }
  return { head: h, tail: t };
}

// Wraps the string s in a raw chunk (text with no suggested change).
function puRaw(s) {
  return { israw: true, text: s };
}

// puCleave(small, large)
// find the next match of small in large.
// return a two-element array of the
// string preceding the match, and the string
// following the match. If there are no matches,
// return undefined.
function puCleave(small, large) {
  var x = large.indexOf(small);
  if (x == -1) return undefined;
  return [large.substr(0, x), large.substring(x + small.length)];
}

// Pass that suggests replacing "(b. " with "(born " in every raw chunk.
function puBorn(edits) {
  var expandBorn = puSpellRep("(b. ", "(born ", puBORN);
  return puRawMapConcat(expandBorn, edits);
}

// Pass that normalises line-break tags to the XHTML form "<br />".
//
// NOTE(review): the tag literals were lost from the source (they
// survive only as " " and ""). "<br>" and "</br>" as the search strings
// and "<br />" as the replacement are a reconstruction; confirm against
// the stable script.
function puXhtml(edits) {
  edits = puRawMapConcat(puSpellRep("<br>", "<br />", puXHTML), edits);
  edits = puRawMapConcat(puSpellRep("</br>", "<br />", puXHTML), edits);
  return edits;
}

// Spelling pass: runs one substring replacement per known misspelling
// over the raw chunks, in the order listed. Fragments such as "seperat"
// are matched as plain substrings.
function puSpell(edits) {
  // [misspelling, correction]
  var fixes = [
    ["seperat", "separat"],
    ["embarass", "embarrass"],
    ["existance", "existence"],
    ["supercede", "supersede"],
    ["accomodat", "accommodat"],
    ["foreward", "foreword"],
    ["liason", "liaison"],
    ["millenium", "millennium"],
    ["accomoda", "accommoda"],
    ["occassion", "occasion"],
    ["occurrance", "occurrence"],
    ["privelege", "privilege"],
    ["priviledge", "privilege"],
    ["withold", "withhold"]
  ];
  var result = edits;
  for (var k = 0; k < fixes.length; k++) {
    var pair = fixes[k];
    result = puRawMapConcat(puSpellRep(pair[0], pair[1], puSPELL), result);
  }
  return result;
}

// Returns a function (string -> chunk list), suitable for
// puRawMapConcat, that suggests replacing every occurrence of src with
// dst as an edit of kind wh.
function puSpellRep(src, dst, wh) {
  return (function (t) {
    // spelling is kinda slow, and most misspellings never appear at all
    if (t.indexOf(src) == -1) return puCons(puRaw(t), undefined);
    else return puSpellOne(t, src, dst, wh);
  });
}

// Splits t at each occurrence of src and returns the chunk list
// raw, edit(src -> dst), raw, edit, …, raw. Recurses on the text that
// follows each match.
function puSpellOne(t, src, dst, wh) {
  var parts = puCleave(src, t);
  if (parts == undefined) {
    // no (further) occurrence: the remainder is a single raw chunk
    return puCons(puRaw(t), undefined);
  }
  var replacement = puEdit(src, dst, wh);
  var before = puRaw(parts[0]);
  var rest = puSpellOne(parts[1], src, dst, wh);
  return puCons(before, puCons(replacement, rest));
}

// City-state pass: runs puCityStateFn over the raw chunks once per
// state, in the order listed. An entry is the argument list for
// puCityStateFn: [state] or [state, link text to use for the state].
function puCityState(edits) {
  /* for every US State... (could do countries here, too.) */
  var states = [
    ["Alabama"], ["Alaska"], ["Arizona"], ["Arkansas"], ["California"],
    ["Colorado"], ["Connecticut"], ["Delaware"], ["Florida"],
    ["Georgia", "Georgia (U.S. state)|Georgia"],
    ["Hawaii"], ["Idaho"], ["Illinois"], ["Indiana"], ["Iowa"],
    ["Kansas"], ["Kentucky"], ["Louisiana"], ["Maine"], ["Maryland"],
    ["Massachusetts"], ["Michigan"], ["Minnesota"], ["Mississippi"],
    ["Missouri"], ["Montana"], ["Nebraska"], ["Nevada"],
    ["New Hampshire"], ["New Jersey"], ["New Mexico"], ["New York"],
    ["North Carolina"], ["North Dakota"], ["Ohio"], ["Oklahoma"],
    ["Oregon"], ["Pennsylvania"], ["Rhode Island"], ["South Carolina"],
    ["South Dakota"], ["Tennessee"], ["Texas"], ["Utah"], ["Vermont"],
    ["Virginia"], ["Washington"], ["West Virginia"], ["Wisconsin"],
    ["Wyoming"]
  ];
  var result = edits;
  for (var k = 0; k < states.length; k++) {
    // apply() keeps the call arity the same as a hand-written call
    result = puRawMapConcat(puCityStateFn.apply(null, states[k]), result);
  }
  return result;
}

// Returns a function (string -> chunk list), suitable for
// puRawMapConcat, that applies puCityStateOne for the given state.
// statelink is optional and is passed through to puCityStateOne.
function puCityStateFn(state, statelink) {
  return (function (t) {
    // citystate is kind of slow and there are 50 states; only run a state
    // if it appears at all...
    if (t.indexOf(', ' + state + ']]') == -1) return puCons(puRaw(t), undefined);
    else return puCityStateOne(t, state, statelink);
  });
}

// Splits s into [everything up to the last non-space character,
// the trailing run of spaces]. Only the space character counts as
// whitespace here; newlines and tabs do not.
function puSplitWhiteEnd(s) {
  for (var i = s.length - 1; i >= 0; i--) {
    if (s.charAt(i) != ' ') {
      return [s.substr(0, i + 1), s.substring(i + 1)];
    }
  }
  // all whitespace!
  return ["", s];
}

// Splits s into [the leading run of spaces, the rest]. Only the space
// character counts as whitespace here.
function puSplitWhiteStart(s) {
  var cut = 0;
  while (cut < s.length && s.charAt(cut) == ' ') {
    cut++;
  }
  // when s is all spaces, cut == s.length and the rest is ""
  return [s.substr(0, cut), s.substring(cut)];
}

// XXX allow decimal places
// Returns the run of digits and hyphens that s ends with, ignoring any
// square brackets met on the way. Returns "" when s does not end with
// such a run.
function puNumberEnd(s) {
  var n = "";
  for (var i = s.length - 1; i >= 0; i--) {
    var ch = s.charAt(i);
    if ((ch >= '0' && ch <= '9') || ch == '-') {
      n = ch + n;
    } else if (ch == '[' || ch == ']') {
      // years are often linked
      /* nothing */
    } else {
      return n;
    }
  }
  return n;
}

// XXX now just takes the next token up to whitespace or |, ignoring brackets
// Returns the leading characters of s up to (not including) the first
// space, newline or '|', with square brackets dropped.
function puNumberStart(s) {
  var n = "";
  for (var i = 0; i < s.length; i++) {
    var ch = s.charAt(i);
    if (ch == '[' || ch == ']') {
      /* nothing */
    } else if (ch != ' ' && ch != '\n' && ch != '|') {
      n = n + ch;
    } else {
      return n;
    }
  }
  return n;
}

// does this string end with a (partial) http link?
// True when the last "http" in s is not followed by a space or ']'.
function puEndsHTTP(s) {
  // only http since we want to catch https too
  var h = s.lastIndexOf('http');
  if (h == -1) return false;
  // is there a space or ] terminating the link, though?
  if (s.lastIndexOf(' ') > h || s.lastIndexOf(']') > h) return false;
  else return true;
}

// are we inside an HTML element?
// True when the last '&' in s is not followed by a space or ';', i.e.
// s ends part-way through something like "&nbsp".
function puIsElement(s) {
  var h = s.lastIndexOf('&');
  if (h == -1) return false;
  // is there a space or ; terminating the element?
  if (s.lastIndexOf(' ') > h || s.lastIndexOf(';') > h) return false;
  else return true;
}

// En dash pass (string -> chunk list, for puRawMapConcat).
// Looks at each hyphen in t; when it joins two numbers (or a number and
// one of the words puEnDashAftOK accepts) and is not inside a link or
// URL, suggests replacing the hyphen, together with the spaces around
// it, by an en dash.
function puEnDash(t) {
  // split on every dash
  var a = puCleave("-", t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // check if dash is preceded by a number and followed by
  // a number.
  var bef = puSplitWhiteEnd(a[0]);
  var aft = puSplitWhiteStart(a[1]);
  var befn = puNumberEnd(bef[0]);
  var aftn = puNumberStart(aft[1]);
  // numeric values; NaN when the token is not purely a number
  var befnn = befn * 1;
  var aftnn = aftn * 1;
  // exclude ISBNs and certain dates by making sure the number doesn't have dash in it
  if (befn.length > 0 && aftn.length > 0 &&
      puEnDashBefOK(befn) && puEnDashAftOK(aftn) &&
      !(puInLink(a[0], a[1])) &&
      !puEndsHTTP(bef[0]) &&
      // ranges are usually lo-hi, but sometimes we see 1987-8
      (isNaN(befnn) || isNaN(aftnn) || befnn <= aftnn ||
       (befnn >= 1000 && befnn <= 9999 && aftn <= 99))) {
    // src has whitespace around dash, replacement does not
    // (note unicode en dash)
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + "-" + aft[0], "–", puENDASH),
                         puEnDash(aft[1])));
  } else {
    // don't match. but if we found dashes to the right, we shouldn't look at those
    // again. (e.g. in ISBN 01-1234-6789, once we look at the first dash and reject it,
    // we don't want to then consider 1234-6789, which looks like a match.)
    var skip = puEnSkip(aft[1]);
    return puCons(puRaw(a[0] + "-" + aft[0] + skip[0]), puEnDash(skip[1]));
  }
}

// no more hyphens in the number (like when considering the second dash in ISBN 01-1234-6789)
// True when s contains no '-'.
function puEnDashBefOK(s) {
  return (s.indexOf('-') == -1);
}

// Sees if this is in a link. That means inside a ref tag, or inside a
// template name (but not in the argument part), or inside a wiki link,
// or inside the target of a piped link (but not when in the display
// portion).
// a is the text before the position being tested, b the text after it.
//
// NOTE(review): most of this function was lost from the source: the
// end of the left token list, the whole `bb` lookup and the ref-tag
// clause. The token lists and the two wiki-link clauses below are
// reconstructed from the surviving fragments and from the description
// above; confirm against the stable script.
function puInLink(a, b) {
  // nearest delimiter to the left and to the right of the position
  var aa = puFindAnyLeft(a, ["}}", "]]", "<ref", ">", "{{", "[[", "|"]);
  var bb = puFindAnyRight(b, ["}}", "<", ">", "]]", "{{", "[[", "|"]);

  return ( (aa == "{{" && bb == "}}") ||
           (aa == "{{" && bb == "|") ||
           (aa == "<ref" && bb == ">") ||
           (aa == "[[" && bb == "]]") ||
           (aa == "[[" && bb == "|") );
}

// Returns whichever string in `finds` has the right-most occurrence in
// str, or undefined when none of them occurs.
function puFindAnyLeft(str, finds) {
  var winner = undefined;
  var winnerPos = -1;
  var k = 0;
  while (k < finds.length) {
    var candidate = finds[k];
    var pos = str.lastIndexOf(candidate);
    // lastIndexOf gives -1 for "absent", which never beats winnerPos
    if (pos > winnerPos) {
      winnerPos = pos;
      winner = candidate;
    }
    k++;
  }
  return winner;
}

// Returns whichever string in `finds` has the left-most occurrence in
// str, or undefined when none of them occurs.
function puFindAnyRight(str, finds) {
  var earliest = undefined;
  var earliesti = str.length;
  for (var i = 0; i < finds.length; i++) {
    var x = str.indexOf(finds[i]);
    // indexOf gives -1 for "absent"; without the explicit check an
    // absent token would compare as earlier than every real match
    if (x != -1 && x < earliesti) {
      earliest = finds[i];
      earliesti = x;
    }
  }
  return earliest;
}

// Decides whether the token following a hyphen allows an en dash.
// A token that starts with a digit is accepted unless it contains a
// further hyphen or looks like a file name; any other token is accepted
// only when it starts (case-insensitively) with a month name, "today",
// "bc" or "present".
function puEnDashAftOK(s) {
  var k;
  // some prefix has to be a number...
  if (s.charCodeAt(0) >= '0'.charCodeAt(0) &&
      s.charCodeAt(0) <= '9'.charCodeAt(0)) {
    // but we should avoid certain stuff...
    var banned = ['-', '.htm', '.pdf', '.png', '.jpg', '.gif', '.svg', '.stm'];
    for (k = 0; k < banned.length; k++) {
      if (s.indexOf(banned[k]) != -1) return false;
    }
    return true;
  } else {
    // otherwise something special:
    var ss = s.toLowerCase();
    var words = ["january", "february", "march", "april", "may", "june",
                 "july", "august", "september", "october", "november",
                 "december", "today", "bc", "present"];
    for (k = 0; k < words.length; k++) {
      if (puStartswith(ss, words[k])) return true;
    }
    return false;
  }
}

// True when the string lng begins with the string sht.
function puStartswith(lng, sht) {
  // searching backwards from index 0 can only match at index 0
  var at = lng.lastIndexOf(sht, 0);
  return at === 0;
}

// after not matching a dash for en dash replacement,
// split a string into two parts: the first is what we
// should skip, the rest is what we should look for
// more dashes within.
// The skipped part is the leading run of digits, hyphens and square
// brackets.
function puEnSkip(s) {
  for (var i = 0; i < s.length; i++) {
    var ch = s.charAt(i);
    if ((ch >= '0' && ch <= '9') || ch == '-' || ch == '[' || ch == ']') {
      /* nothing */
    } else {
      return [s.substr(0, i), s.substring(i)];
    }
  }
  return [s, ""];
}

// Creates an edit chunk that suggests replacing src with dst; `what` is
// the edit kind. No display-only overrides are set.
function puEdit(src, dst, what) {
  var noOverride;  // stays undefined
  return puEditExt(src, dst, what, noOverride, noOverride);
}

// Creates an edit chunk.
//   src, dst         original text and suggested replacement
//   what             edit kind
//   dispsrc, dispdst optional text shown in the change list instead of
//                    src / dst (undefined = show src / dst)
// Each call increments punctuationID and stores the new value as the
// chunk's id.
function puEditExt(src, dst, what, dispsrc, dispdst) {
  var subst = {
    orig: src,
    rep: dst,
    israw: false,
    what: what,
    hidden: false,
    dispsrc: dispsrc,
    dispdst: dispdst
  };
  //    alert (src + "&rarr;" + dst);
  punctuationID++;
  subst.id = punctuationID;
  return subst;
}

/* Fix faux em dashes.
   "--" almost anywhere should almost always be a real em dash
   (unless there are four or as part of an html comment)
   TODO: " - " between words should usually be an em dash. */
// Em dash pass (string -> chunk list, for puRawMapConcat). Suggests
// replacing "--", together with the spaces around it, by an em dash
// when the characters on both sides pass puEmOKChar.
function puEmDash(t) {
  var a = puCleave("--", t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // must be preceded by a word and followed by a word
  var bef = puSplitWhiteEnd(a[0]);
  var aft = puSplitWhiteStart(a[1]);
  if (aft[1].length > 0 && puEmOKChar(aft[1].charAt(0)) &&
      bef[0].length > 0 && puEmOKChar(bef[0].charAt(bef[0].length - 1))) {
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + "--" + aft[0], "—", puEMDASH),
                         puEmDash(aft[1])));
  } else {
    /* not an em dash. */
    return puCons(puRaw(a[0] + "--"), puEmDash(a[1]));
  }
}

// May the character c sit directly next to a "--" that is turned into
// an em dash? False for '>', '!', '<', '-' and '|'.
function puEmOKChar(c) {
  //  alert ("check char: [" + c + "]");
  if (c == '>' || c == '!' || c == '<' || c == '-' || c == '|') return false;
  else return true;
}

// True when the first character of c is one of 0-9.
function puIsDigit(c) {
  var code = c.charCodeAt(0);
  var zero = '0'.charCodeAt(0);
  var nine = '9'.charCodeAt(0);
  // for an empty string code is NaN and both comparisons are false
  return (zero <= code && code <= nine);
}

// [[Pittsburgh, Pennsylvania]] to
// [[Pittsburgh, Pennsylvania|]], [[Pennsylvania]].
// For every ", <state>]]" in t, suggests closing the city link with a
// trailing pipe and adding a separate link for the state. statelink,
// when given, is used as the content of the state link in place of
// `state`.
//
// NOTE(review): the "[[" and "]]" around the state link were lost from
// the replacement string in the source and have been restored here.
function puCityStateOne(t, state, statelink) {
  var a = puCleave(", " + state + "]]", t);
  // XXX could be improved by generating pipe trick expansion automatically
  // (pipe trick doesn't work in ref tags, etc.)
  // but that makes it a little trickier because we have to find "Pittsburgh" in the above
  // and might fail (because of other edits)
  // XXX when doing that should detect Image: and Category:
  if (a == undefined) return puCons(puRaw(t), undefined);
  var st = (statelink == undefined) ? state : statelink;
  return puCons(puRaw(a[0]),
                puCons(puEdit(", " + state + "]]",
                              ", " + state + "|]], [[" + st + "]]",
                              puCITYSTATE),
                       puCityStateOne(a[1], state, statelink)));
}

// 1980's to 1980s (Manual of Style (dates and numbers))
// note this isn't always a mistake:
// "1981 was a cold year compared to 1980's record temperatures" would be okay
// so some context awareness is appropriate (but it is almost always wrong)
//
// Decade pass (string -> chunk list, for puRawMapConcat). Suggests
// "0's" -> "0s" when the "0" ends a 4-digit or 2-digit number and the
// character after the "'s" passes puDecadeOKChar.
// A number that starts at the very beginning of t counts as well; this
// happens when the chunk follows an edit, e.g. the "1980's" in
// "1970-1980's" once the hyphen has become an en dash edit.
function puDecade(t) {
  var a = puCleave("0's", t);
  if (a == undefined) return puCons(puRaw(t), undefined);

  var pre = a[0];
  var n = pre.length;
  // date before? (only do it for 4 or 2 digit dates)
  // three digits (plus the matched "0"), not preceded by another digit
  var fourDigit = n >= 3 &&
                  puIsDigit(pre.charAt(n - 1)) &&
                  puIsDigit(pre.charAt(n - 2)) &&
                  puIsDigit(pre.charAt(n - 3)) &&
                  (n == 3 || !puIsDigit(pre.charAt(n - 4)));
  // one digit (plus the matched "0"), not preceded by another digit
  var twoDigit = n >= 1 &&
                 puIsDigit(pre.charAt(n - 1)) &&
                 (n == 1 || !puIsDigit(pre.charAt(n - 2)));

  if ((fourDigit || twoDigit) &&
      // safe to correct?
      a[1].length > 0 && puDecadeOKChar(a[1].charAt(0))) {
    return puCons(puRaw(a[0]),
                  puCons(puEdit("0's", "0s", puDECADE),
                         puDecade(a[1])));
  } else {
    /* no problem. */
    return puCons(puRaw(a[0] + "0's"), puDecade(a[1]));
  }
}

// May the character c directly follow a "0's" that is changed to "0s"?
// True only for the characters listed below.
function puDecadeOKChar(c) {
  // should be the end of a word
  if (c == '\n' || c == ' ' || c == ',' || c == '.' ||
      c == '&' || c == '—' || c == '-' || c == '–' ||
      // text in tables?
      c == '|' || c == '\t' || c == '<' || c == ')' || c == ';' ||
      c == '!' || c == "'" || c == ':' || c == '/') return true;
  else return false;
}

// space before/around(parentheses )
// closing parens are basically the same as commas below.
//
// Paren pass (string -> chunk list, for puRawMapConcat). For a ")" that
// has spaces before it or no space after it, suggests ") " in place of
// the paren and its surrounding spaces, provided the characters on both
// sides pass puRParenOKChar.
function puParen(t) {
  var a = puCleave(")", t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // must be preceded by a word and followed by a word
  var bef = puSplitWhiteEnd(a[0]);
  var aft = puSplitWhiteStart(a[1]);
  // alert('paren: [' + bef[0] + '][' + bef[1] + ']***[' + aft[0] + '][' + aft[1] + ']');
  if (// needs correction?
      (bef[1].length > 0 || aft[0].length == 0) &&
      // safe to correct?
      aft[1].length > 0 && puRParenOKChar(aft[1].charAt(0)) &&
      bef[0].length > 0 && puRParenOKChar(bef[0].charAt(bef[0].length - 1))) {
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + ")" + aft[0], ") ", puPAREN),
                         puParen(aft[1])));
  } else {
    /* no problem. */
    return puCons(puRaw(a[0] + ")"), puParen(a[1]));
  }
}

// XXX perhaps should be okay-on-right and okay-on-left; this may be too conservative
// May the character c sit directly next to a ")" whose spacing is
// being corrected? False for the characters listed below.
function puRParenOKChar(c) {
  if (c == ")" || c == "(" || c == '|' ||
      // otherwise we undo our linkspace fix ;)
      c == ']' ||
      // title markup
      c == '=' ||
      // sometimes people do
      c == '&' ||
      // quotes, obviously
      c == '"' || c == '”' || c == '’' || c == "'" ||
      // History of Russia (1900-1950)#World War II
      c == "#" ||
      // other stuff
      c == '\n' || c == ':' || c == ';' || c == '.' || c == '-' ||
      c == '—' || c == ',' ||
      // '{' must be compared with c: a bare '{' in this chain is always
      // truthy and makes the function return false for every character
      c == '}' || c == '{' || c == '<') return false;
  else return true;
}

// Comma pass (string -> chunk list): puCommaLike specialised to ','.
function puComma(t) {
  var chunks = puCommaLike(',', puCOMMA, t);
  return chunks;
}

// Semicolon pass (string -> chunk list): puCommaLike specialised to ';'.
function puSemicolon(t) {
  var chunks = puCommaLike(';', puSEMICOLON, t);
  return chunks;
}

// TODO: very important to filter out URL hits, since comma appears in lots of news URLs
//
// Shared pass for comma-like punctuation (string -> chunk list).
//   ch    the punctuation character to look for
//   what  edit kind to record
//   t     raw text
// For each ch that has spaces before it or no space after it, suggests
// ch + ' ' in place of the character and its surrounding spaces, unless
// the text before it ends in a URL or an unfinished "&…" sequence, or a
// neighbouring character fails puCommaOKChar.
function puCommaLike(ch, what, t) {
  var a = puCleave(ch, t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // must be preceded by a word and followed by a word
  var bef = puSplitWhiteEnd(a[0]);
  var aft = puSplitWhiteStart(a[1]);
  // alert('comma: [' + bef[0] + '][' + bef[1] + ']***[' + aft[0] + '][' + aft[1] + ']');
  if (// needs correction?
      (bef[1].length > 0 || aft[0].length == 0) &&
      // safe to correct?
      !puEndsHTTP(bef[0]) &&
      !puIsElement(bef[0]) &&
      aft[1].length > 0 && puCommaOKChar(aft[1].charAt(0)) &&
      bef[0].length > 0 && puCommaOKChar(bef[0].charAt(bef[0].length - 1))) {
    // alert('fix!');
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + ch + aft[0], ch + ' ', what),
                         puCommaLike(ch, what, aft[1])));
  } else {
    /* no problem. */
    return puCons(puRaw(a[0] + ch), puCommaLike(ch, what, a[1]));
  }
}

// Link-space pass (string -> chunk list, for puRawMapConcat). Suggests
// removing the space(s) before a closing "]]", except when the text in
// front of " ]]" is empty or ends in '|'.
function puLinkSpace(t) {
  var a = puCleave(" ]]", t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // maybe multiple spaces...
  var bef = puSplitWhiteEnd(a[0]);
  // filter out the common idiom
  if (a[0].length > 0 && a[0].charAt(a[0].length - 1) != '|') {
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + " ]]", "]]", puLINKSPACE),
                         puLinkSpace(a[1])));
  } else {
    return puCons(puRaw(a[0] + " ]]"), puLinkSpace(a[1]));
  }
}

/// XXX not hooked up -- did I finish implementing this?
// between number and %, remove space.
//
// Percent pass (string -> chunk list). As written it suggests "% " in
// place of a '%' and the spaces around it.
// NOTE(review): puPercentBeforeChar and puPercentAfterChar are not
// defined in the part of the file this was restored from, and they are
// applied to the character AFTER and BEFORE the '%' respectively, which
// looks swapped relative to their names. Resolve both points before
// hooking this pass up.
function puPercent(t) {
  var a = puCleave("%", t);
  if (a == undefined) return puCons(puRaw(t), undefined);
  // must be preceded by a word and followed by a word
  var bef = puSplitWhiteEnd(a[0]);
  var aft = puSplitWhiteStart(a[1]);
  // alert('pct: [' + bef[0] + '][' + bef[1] + ']***[' + aft[0] + '][' + aft[1] + ']');
  if (// needs correction?
      (bef[1].length > 0 || aft[0].length == 0) &&
      // safe to correct?
      aft[1].length > 0 && puPercentBeforeChar(aft[1].charAt(0)) &&
      bef[0].length > 0 && puPercentAfterChar(bef[0].charAt(bef[0].length - 1))) {
    // alert('fix!');
    return puCons(puRaw(bef[0]),
                  puCons(puEdit(bef[1] + "%" + aft[0], "% ", puPERCENT),
                         puPercent(aft[1])));
  } else {
    /* no problem. */
    return puCons(puRaw(a[0] + "%"), puPercent(a[1]));
  }
}

// Decides whether character c may sit directly next to a comma-like mark
// that is about to be re-spaced.
//
//   c - a one-character string (only its first character is examined for
//       the digit test)
//
// Returns false for characters that suggest the mark is not ordinary prose
// punctuation, true otherwise.
function puCommaOKChar(c) {
  var code = c.charCodeAt(0);

  // definitely not inside numbers
  if (code >= '0'.charCodeAt(0) && code <= '9'.charCodeAt(0)) return false;

  // text in tables?
  if (c == '|') return false;

  // quotes, obviously
  if (c == '"' || c == '”' || c == '’' || c == "'") return false;

  // link w/ underscores instead of spaces
  if (c == '_') return false;

  if (c == '\n' || c == '&' || c == ',') return false;

  // ref tags
  if (c == '{' || c == '<') return false;

  return true;
}

// Decides whether a space should be inserted between a ref tag and the
// character c that follows it.
//
//   c - a one-character string
//
// Returns false for characters after which no space is wanted, true otherwise.
function puRefSpaceOKChar(c) {
  // text in tables?
  if (c == '|') return false;

  // parenthetical
  if (c == ')') return false;

  // or space already...
  if (c == ' ') return false;

  // ending image: tags
  if (c == ']') return false;

  // ending template text
  if (c == '}') return false;

  // before em dashes (see MOS)
  if (c == '—') return false;

  // ending quotes...
  if (c == '"' || c == '”' || c == '’' || c == "'") return false;

  if (c == '\n' || c == '&' || c == ',') return false;

  // ref tags
  if (c == '{' || c == '<') return false;

  return true;
}

// for references, we want to find the ref tags, but
// they can appear in several common forms:
//
//   <ref>...</ref>
//   <ref name="...">...</ref>
//   <ref name="..." />
//
// this function returns a three-element array consisting of
// [the text before the first ref tag, the ref tag, the text following]
// (or it returns undefined if there are no ref tags to be found)
//
// NOTE(review): the '<ref' / '</ref>' literals and the outer scan for the
// tag opener were missing from this copy of the source and have been
// reconstructed from the comments above and the surviving loop body --
// confirm against the stable version of the script.
// NOTE(review): assumes puCleave returns [before, after] around the first
// occurrence of its first argument, or undefined when absent.
function puGetRef(t) {
  var m = '<ref';
  for (var i = t.indexOf(m); i != -1; i = t.indexOf(m, i + 1)) {
    // found the start of a tag; it is either
    //   <ref ...>   (bracketing)
    // or
    //   <ref ... /> (unitary)
    for (var j = i + m.length; j < t.length; j++) {
      if (t.charAt(j) == '/') {
        if (j < (t.length - 1) && t.charAt(j + 1) == '>') {
          // unitary tag: it ends right after the "/>"
          return [t.substr(0, i),
                  t.substr(i, (j + 2) - i),
                  t.substr(j + 2, t.length - (j + 2))];
        } else {
          // XXX report problem?
          return undefined;
        }
      } else if (t.charAt(j) == '>') {
        // found bracketing ref tag.
        // so now eat until the closing tag is encountered.
        var rest = t.substr(j, t.length - j);
        var a = puCleave('</ref>', rest);
        if (a == undefined) {
          // XXX warn: unclosed ref tag??
          return undefined;
        }
        // opener (without its '>') + '>' and contents + closer
        return [t.substr(0, i),
                t.substr(i, j - i) + a[0] + '</ref>',
                a[1]];
      }
    }
  }
  // none found...
  return undefined;
}

// If we find a ref tag, we need to ensure the following:
// 1. there should never be any space before the tag.
// 2. the ref tag should appear after punctuation (except dashes)
//      UNLESS the reference is to a specific term rather than
//      to the sentence or comma/semicolon-separated phrase
//      (we'll leave it up to the user to reject these false positives)
// 3. there shouldn't be double punctuation before/after the ref
// 4. there should be space after the ref
//      UNLESS the reference is followed by another reference
//      (or a dash, or legal punctuation as above)
//
// (this is according to the manual of style at footnotes;
// and conforms to the Chicago Manual of Style)
//
// So, we grab any punctuation that follows the reference,
// erase all space before the reference,
// insert space after the ref if needed
// and insert any trailing punctuation before the reference,
// unless there is already punctuation there.
//
//   t - the text to scan
//
// Returns a puCons list of puRaw / puEditExt pieces terminated by undefined;
// each edit is tagged puREF.
function puRef(t) {
  var a = puGetRef(t);
  if (a == undefined) return puCons(puRaw(t), undefined);

  var bef = puSplitWhiteEnd(a[0]);
  var tag = a[1];
  var aft = puSplitWhiteStart(a[2]);

  // boolean flags
  // insist on two newlines since people frequently put refs on their own lines.
  var parend = aft[1].length > 1 &&
               aft[1].charAt(0) == '\n' && aft[1].charAt(1) == '\n';
  var nopuncbefore = bef[0].length == 0 ||
                     !(puRefPuncChar(bef[0].charAt(bef[0].length - 1)));
  var needspuncbefore = nopuncbefore && bef[0].length > 0 &&
                        puRefNeedsPunc(bef[0].charAt(bef[0].length - 1));

  // the punctuation char or undefined if none
  var puncafter = (aft[1].length > 0) ? aft[1].charAt(0) : undefined;
  if (puncafter != undefined && !puRefPuncChar(puncafter)) puncafter = undefined;
  if (puncafter != undefined) {
    // the punctuation moves in front of the ref, so drop it from what follows
    aft[1] = aft[1].substr(1, aft[1].length - 1);
  }

  var needspaceafter = aft[1].length > 0 && puRefSpaceOKChar(aft[1].charAt(0));

  if (// whitespace before?
      bef[1].length > 0 ||
      // missing necessary whitespace after?
      (aft[0].length == 0 && needspaceafter) ||
      // punctuation after?
      (puncafter != undefined) ||
      // or there is no punctuation at all and this is
      // the end of the paragraph
      (parend && needspuncbefore)) {
    // There's something to fix.

    // the before part will be whatever's before, plus any additional punctuation,
    // but minus any whitespace.
    var befplus;
    if (parend // implies no punctuation after ref
        && needspuncbefore) {
      // assume period at end of paragraph.
      // XXX note, this will put the period before only the last
      // reference in a series of references at the end of
      // a paragraph, sigh
      befplus = '.';
    } else if (nopuncbefore && puncafter != undefined) {
      befplus = puncafter;
    } else befplus = '';

    var aftoldplus = '';
    if (puncafter != undefined) aftoldplus = puncafter;

    // a single space after the ref when one is needed, otherwise nothing
    var spaceafter = needspaceafter ? ' ' : '';

    // XXX: should elide contents of ref in display somehow.
    return puCons(puRaw(bef[0]),
                  puCons(puEditExt(// old:
                                   bef[1] + tag + aft[0] + aftoldplus,
                                   // new:
                                   befplus + tag + spaceafter,
                                   puREF,
                                   // display versions elide the ref itself:
                                   bef[1] + '__PUREF__' + aft[0] + aftoldplus,
                                   befplus + '__PUREF__' + spaceafter),
                         puRef(aft[1])));
  } else {
    // no change
    return puCons(puRaw(a[0] + a[1]), puRef(a[2]));
  }
}

// Reports whether c is one of the punctuation marks that belong in front of
// a ref tag: period, semicolon, comma, question mark, exclamation mark, colon.
//
//   c - a one-character string
//
// Returns true or false.
function puRefPuncChar(c) {
  return c == '.' || c == ';' || c == ',' || c == '?' ||
         c == '!' || c == ':';
}

// Reports whether text ending in character c should be followed by
// punctuation: true when c is an ASCII letter, an ASCII digit, or ']'.
//
//   c - a one-character string (only its first character is examined)
//
// Returns true or false.
function puRefNeedsPunc(c) {
  // Compare char code against char code. Comparing a numeric code against a
  // one-character string such as 'a' coerces the string to NaN, which makes
  // every range test false.
  var code = c.charCodeAt(0);
  return (code >= 'a'.charCodeAt(0) && code <= 'z'.charCodeAt(0)) ||
         (code >= 'A'.charCodeAt(0) && code <= 'Z'.charCodeAt(0)) ||
         (code >= '0'.charCodeAt(0) && code <= '9'.charCodeAt(0)) ||
         c == ']';
}

// --

// install it..
// Registers a load hook that, on edit pages only, registers addPunctuation
// as a further load hook.
addOnloadHook(function () {
  // not on talk pages...
  // (compared case-insensitively so that the article "Talk:" namespace is
  // skipped as well as "User talk:", "Template talk:", etc.)
  if (document.title.toLowerCase().indexOf("talk:") != -1) {
    return;
  }
  // only when the page title says we are editing
  if (document.title.indexOf("Editing ") != -1) {
    addOnloadHook(addPunctuation);
  }
});

// Adds the "punctuation (exp.)" tab to the edit page.
// Side effects: stores the current edit-summary text in
// punctuationPageOriginalSummary, calls addTab, then calls akeytt.
function addPunctuation() {
  // need to see later if user has done any editing...
  punctuationPageOriginalSummary = document.editform.wpSummary.value;
  // the javascript: URL must CALL doPunctuation, not merely name it
  addTab("javascript:doPunctuation()", "punctuation (exp.)", "ca-punctuation", "Punctuation", "");
  akeytt();
}
/* */