User:Closeapple/RMFset.js

// Closeapple RMF set.js
// Functions used by Closeapple for
// http://meta.wikimedia.org/wiki/TemplateScript
//
// Copyright (C) 2009-2010 Closeapple
// You may copy and modify this file under your choice of these licenses:
// * GFDL: GNU Free Documentation License, version of your choice
// * CC-BY-SA: Creative Commons Attribution+ShareAlike, version 3.0
//   or later, U.S. or Unported
// * CC-BY-NC: Creative Commons Attribution+NonCommercialOnly, version 3.0
//   or later, U.S. or Unported
// * LGPL: Lesser GNU Public License, version of your choice
//
// This is a set of regex scripts to use on MediaWiki using TemplateScript,
// described at TemplateScript. (You don't need to have it loaded
// separately, but you can enable it by going to Special:Preferences and
// checking it in the Gadgets section.)
//
// Coding notes
//
// This JavaScript uses UTF-8 characters like arrows and lines, and even in a
// few regexes for dashes and things.
// Shame on you if you load it in a non-Unicode editor!
// Maybe JS supports some kind of \u syntax in regexes, but I don't know yet,
// so I haven't done it.
//
// Since this is written for JavaScript, it uses the Perl extensions that
// JS supports, like (?:) for non-storing, +? and *? for shortest
// matches (which are vital in some places!), and, in the future, (?!)
// for negative look-ahead.
// Also, / in a pattern is always written as \/ here.
//
// A plain editor.replace (rather than rmfCa_regex_reason) is used when
// affecting the edit summary is undesirable: for example, when a regex is
// likely to match even when nothing needs fixing.

// TODO: Rewrite most ([^whatever]|$) junk to use (?!whatever) instead
// Wishlist:
// * mnopqrs -> mnopqrs
// * mnopqr's -> mnopqr's (with weird apostrophes too)
// * link_with_spaces -> link with spaces (but maybe that's bad)
// Far-off wishlist:
// * -> (maybe)
// * Abcdef -> Abcdef
// * Abcdef, Illinois -> Abcdef, Illinois
// NOTE(review): the wishlist examples above appear to have lost their wiki
// markup (both sides of each arrow now read the same) - confirm the intent.
/* global $, pathoschild */

// Debug switch: true to debug; false to not debug.
// When true, rmfCa_regex_reason reports the outcome of every regex via alert().
// The flag is only defined here when nothing has defined it already, so a
// value set before this script runs is left alone.
if (typeof rmfCa_debug === 'undefined') {
	var rmfCa_debug =
	// true;
	false;
}

/**
 * TemplateScript adds configurable templates and scripts to the sidebar, and adds an example regex editor.
 * @see https://meta.wikimedia.org/wiki/TemplateScript
 * @update-token
 */
// Load TemplateScript, then register one sidebar entry per cleanup function.
// "Safe cleanups" and "Looser cleanups" run a whole group in order; the
// entries drawn underneath each (├ / └) run a single cleanup.
// NOTE(review): several labels look as if they lost markup text (two
// different entries both read "... spacing", one reads only an arrow);
// confirm the intended names before relying on them.
$.ajax('//tools-static.wmflabs.org/meta/scripts/pathoschild.templatescript.js', { dataType: 'script', cache: true }).then(function () {
	pathoschild.TemplateScript.add([
		/*TODO: It's not working!
		{
			name: 'Standardize',
			scriptUrl: 'meta:User:Pathoschild/standardise.js',
			script: function () { standardize(); }
		},
		*/
		{
			name: 'Safe cleanups',
			script: function (editor) {
				// do all the unquestionably appropriate cleanups here
				rmfCa_removetrailingspaces(editor);
				rmfCa_brcleaning(editor);
				rmfCa_refspacing(editor);
				rmfCa_moverefpunct(editor);
				rmfCa_wikilinkunderlines(editor);
				rmfCa_wikilinkspacing(editor);
				rmfCa_obviousdashdash(editor);
				rmfCa_htmltypos(editor);
				rmfCa_obviousreferences(editor);
				rmfCa_washington_obvious(editor);
			}
		},
		{ name: '├ trailing spaces', script: rmfCa_removetrailingspaces },
		{ name: '├ cleaning', script: rmfCa_brcleaning },
		{ name: '├ _|x→ |x', script: rmfCa_wikilinkunderlines },
		// Was the string 'rmfCa_wikilinkspacing'; every other entry passes the
		// function itself, so pass the function here too.
		{ name: '├  spacing', script: rmfCa_wikilinkspacing },
		{ name: '├ safe -- fixes', script: rmfCa_obviousdashdash },
		{ name: '├. → . ', script: rmfCa_moverefpunct },
		// NOTE(review): rmfCa_refspacing is not defined in the part of the
		// file visible here - confirm it exists elsewhere.
		{ name: '├ spacing', script: rmfCa_refspacing },
		{ name: '├ =Ref= section', script: rmfCa_obviousreferences },
		{ name: '├ HTML typos', script: rmfCa_htmltypos },
		{ name: '└ safe Washington', script: rmfCa_washington_obvious },
		{
			name: 'Looser cleanups',
			script: function (editor) {
				rmfCa_prosedashdash(editor);
				rmfCa_unlinkfulldates(editor);
				rmfCa_unsubstreflist(editor);
				rmfCa_washington_loose(editor);
			}
		},
		{ name: '├ Prose -- fixes', script: rmfCa_prosedashdash },
		{ name: '├ → ', script: rmfCa_unlinkfulldates },
		{ name: '├ unsubst ', script: rmfCa_unsubstreflist },
		{ name: '└ looser Washington', script: rmfCa_washington_loose }
	]);
});

// rmfCa_regex_reason: replaces pattern with replacement in the editor text,
//   then records a reason (via rmfCa_setreason) if the text actually changed.
// editor: object providing get(), replace(), appendEditSummary(), options()
// pattern, replacement: passed straight to editor.replace()
// detail, summary, major: passed to rmfCa_setreason (see below)
// return values (not used by other functions, so not necessary):
// * 0 if no match
// * 1 if match but no changes (from regexes that broadly match things they
//    don't necessarily need to fix)
// * 2 if match caused some kind of change
function rmfCa_regex_reason(editor, pattern, replacement, detail, summary, major) {
	// Tolerate the debug flag not having been defined at all.
	var debug = typeof rmfCa_debug !== 'undefined' && rmfCa_debug;
	var beforetext = editor.get();
	if (!beforetext.match(pattern)) {
		if (debug) alert('0 = no match for ' + detail);
		return 0;
	}
	editor.replace(pattern, replacement);
	if (beforetext === editor.get()) {
		if (debug) alert('1 = matches but doesn\'t change: ' + detail);
		return 1;
	}
	rmfCa_setreason(editor, detail, summary, major);
	if (debug) alert('2 = fixes: ' + (detail || summary));
	return 2;
}

// rmfCa_setreason: one-stop location to add edit summary
// detail: detailed description of change; not in edit summary unless
//   change is major AND there is no summary
// summary: edit summary to append; a minor change without one gets the
//   generic 'minor wikification fixups'
// major: if string "true" or "major" (or boolean true) then unset the
//   "minor edit" checkbox
function rmfCa_setreason(editor, detail, summary, major) {
	if (major === true || major === 'true' || major === 'major') {
		editor.options({ minor: false });
		var text = summary || detail;
		// Nothing useful to say: do not append "undefined" to the summary.
		if (text) editor.appendEditSummary(text);
	}
	else {
		editor.appendEditSummary(summary || 'minor wikification fixups');
	}
}

// //========== // Actual regexes start here! //========== //

// removetrailingspaces: strip spaces and tabs from the end of every line
// Aggressiveness: safe
function rmfCa_removetrailingspaces(editor) {
	// /m makes $ match at every line end, not only at the end of the text.
	rmfCa_regex_reason(editor, /[ \t]+$/mg, '', 'removed trailing spaces');
}

// brcleaning: normalises BR tags and the spacing around them.
// NOTE(review): this whole definition sits on one physical line behind a
// leading "//", so as written none of it executes.
// NOTE(review): the capture groups below are empty "()" and the replacement
// strings are empty or blank, which suggests a literal (presumably the
// normalised BR tag) was lost from the patterns and replacements. As written,
// /\s+()\s+/ would delete whitespace runs and the first two replaces would
// delete the tags - confirm the intended literals before re-enabling.
// NOTE(review): "(?:\/s*)" in the first pattern looks like a typo for
// "(?:\/\s*)" - confirm.
// Aggressiveness: safe function rmfCa_brcleaning(editor) { editor .replace(/<\s*BR\s*(?:\/s*)>/g, '') // if all caps, OK .replace(/<\s*(?:bR|[Bb]r)\s*(?:\/\s*)>/g, ' '); // otherwise, lower case! // completely surrounded -> no spaces rmfCa_regex_reason(editor, /\s+()\s+/ig, '$1', 'spacing on both sides of '); editor .replace(/[ \t]+[ \t]()/ig, ' $1') // allow only one leading space .replace(/()[ \t]+[ \t]/ig, '$1 '); // allow only one trailing space }

// NOTE(review): this whole definition sits on one physical line behind a
// leading "//", so as written none of it executes.
// NOTE(review): the text below looks truncated: the first rmfCa_regex_reason
// call has an unterminated string literal ('limit duplicate punctuation to
// before ...), its pattern contains a fragment "(?:]*\/|" with no opening
// bracket, and the step that actually moves the punctuation is not visible.
// The ref-tag text appears to have been stripped; recover the original
// patterns before re-enabling this function.
// moverefpunct: move punctuation to before (a sequence of) references // Aggressiveness: safe // Example:. -> . // Scope: always // Type: Perl/JavaScript (needs (?:) to do non-storing match and *? to select shortest) // Works even across lines if [^<] matches linefeeds in regex implementation. // Spacing within ref tags and reference content is passed and not modified. // /m parameter may or may not be useful - works both ways. // Does not deal with // "!" is not matched because refs show up in table headers and we don't // want the ref to swap with the header separators. Examples: // ! Header A // ! Header B // or ! Header A !! Header B // Note: will not see any // Will be skipped: // TODO: Possibly match, , etc. the same way. function rmfCa_moverefpunct(editor) { // TESTING next line rmfCa_regex_reason(editor, /\s*((?:[.?,:;]|°|&deg;)+)\s*((?:]*\/|(?:\s[^<>]*?)?\s*>[^<]*<\s*\/\s*ref)\s*>\s*?)+)[ \t]*\1+/ig, '$1$2', 'limit duplicate punctuation to before 	// Does not remove newline before - some people might like that.	//  Note: matches tags whether already cleaned or not	rmfCa_regex_reason(editor, /[ \t]<\s*\/\s*ref\s*>/ig, ' ', '< / ref > → '); }

// NOTE(review): the first physical line below is entirely behind a leading
// "//" and does not execute; the second physical line is live code whose
// first regex literal ("([[^ \t:)") has an unclosed group, which looks like a
// syntax error for the whole script - confirm and repair.
// NOTE(review): several patterns appear to have lost bracket text (e.g.
// "/\[\[a-vx-z][a-z]|" and "([^+[^]\s]*)" are unbalanced, and the
// 'category spacing 2' replacement '$1:$2' has no brackets). Recover the
// original patterns before re-enabling this function.
// The %%rmfCaSpace%% placeholder protects the single space after
// "[[Category:Something|" while the other spacing rules run, and is turned
// back into a space by the final replace.
// wikilinkspacing: removes extraneous spaces in   tags // Aggressiveness: safe function rmfCa_wikilinkspacing(editor) { // Category: eliminate spaces between  instead of moving outside	//  Also has side effect of capitalizing [[Category: correctly.	//  Note: matches whether already cleaned or not	rmfCa_regex_reason(editor, /\[\[[ \t]*Category[ \t]*:[ \t]*([^\]|]*[^\]|\s][ \t]*)/ig, '[[Category:$1', 'category spacing 1');	//  Interlanguage: two-letter codes other than WP	//  eliminate spaces instead of moving outside	//  TODO: add more three-letter language codes	rmfCa_regex_reason(editor, /\[\[[ \t]*([a-vx-z][a-z]|[a-z][a-oq-z]|simple|ang)[ \t]*:[ \t]/ig, '[[$1:', 'interwiki spacing 1');	//  Protect special case of "[[Category:Something| ".	//  Note: Matches based on assumption that "[[Category:Something|" has	//  already had its spaces cleaned out above.	//  Note: REQUIRES conversion back later in this fuction.	//  Note: Turns multiple prefix spaces into a single one.	//  Note: JavaScript in Firefox 3.5.5 parses this regex to mean something	//  different if we use the [^]|] as part of this pattern, so we use	//  [^\]|] instead to mean "anything but ] or |".	editor.replace(/(\[\[Category:[^\]|]+\|)[ \t]+/g, '$1%%rmfCaSpace%%');	//  Category/Interlanguage: eliminate spaces just before  instead of	//  moving outside	//  only matches when there's an actual space before ]]	//  Note: Depends on initial spacing being eliminated above already.	//  TODO: add more three-letter language codes	rmfCa_regex_reason(editor, /\[\[a-vx-z][a-z]|[a-z][a-oq-z]|simple|ang):([^+[^]\s]*)[ \t]+\]\]/ig, '$1:$2', 'category spacing 2');	// For other , move spaces to outside of brackets	// only matches when there's an actual space just inside  or [[: or 	// Depends on exceptions already being eliminated above.	// Skips +space+: because that acts strangely on Wikipedia.	
rmfCa_regex_reason(editor, /[ \t]*\[\[(:?)[ \t]+([[^ \t:)/g, ' $1$2', 'wikilinks starting with space');	rmfCa_regex_reason(editor, /[ \t]+\]\][ \t]*/g, ' ', 'wikilinks ending with space');	//  Turn protected spaces back into normal spaces.	editor.replace(/%%rmfCaSpace%%/g, ' '); }

// wikilinkunderlines: remove _ from targets in wikilinks
// Aggressiveness: intended to be safe
// Removes _ other than at the beginning, end, or next to another _.
// Designed to work ONLY if visible part of link does not contain _ also:
// the look-ahead requires a "|label]]" whose label has no underscore.
function rmfCa_wikilinkunderlines(editor) {
	var pattern = /\[\[\s*([^\]|]*[^\]|_])_(?=[^\]|_][^\]|]*\|[^\]_]+\]\])/g;
	var replacement = '[[$1 ';
	// One pass removes at most one _ per link, because each match consumes
	// the link's opening brackets. The first pass goes through
	// rmfCa_regex_reason so the change is reported once.
	rmfCa_regex_reason(editor, pattern, replacement, 'rm _ from wikilink target');
	// Remaining underscores: repeat silently until the text stops changing.
	// The counter is only a safety net; every pass shortens the remaining work.
	var remainingPasses = 100;
	var before;
	do {
		before = editor.get();
		editor.replace(pattern, replacement);
		remainingPasses--;
	} while (editor.get() !== before && remainingPasses > 0);
}

// obviousdashdash: multiple hyphens = &mdash; ("safe" matches)
// Aggressiveness: safe
// We're actually very limited here because we have to avoid these:
// Linux command line parameters:
//   space + -- + alphanum
// C language arithmetic:
//   alphanum/(/)/[/]/*/& + --
//   -- + alphanum/(/)/[/]/*/&
// Note: Also, some URL might be stupid enough to have two hyphens, so we
//  should exclude any non-space strings with / before the double hyphens.
// It calls commondashdash at the end for completeness.
// TODO: Could have lots more combinations
function rmfCa_obviousdashdash(editor) {
	// linestart/space + string of non-slash + digit + DASHES + digit
	// avoiding slash to avoid possible idiocy like 1--2 in URLs
	rmfCa_regex_reason(editor, /(^|\s[^\/\s]+)(\d\s?)\s*?--+(\s?)\s*?(\d)/g, '$1$2&mdash;$3$4', 'digit--digit→&mdash;');
	// pretty safe: stuff in QUOTE or BLOCKQUOTE with no other <>
	// The pattern consumes the closing tag (group 4), so the replacement has
	// to write it back; otherwise the element is left unclosed.
	rmfCa_regex_reason(editor, /<\s*((?:BLOCK)?QUOTE)\s*>([^<>]*)--([^<>]*)<\s*\/\s*((?:BLOCK)?QUOTE)\s*>/ig, '<$1>$2 &mdash; $3</$4>', '-- → &mdash; in quote');
	rmfCa_commondashdash(editor);
}

// prosedashdash: multiple hyphens = &mdash; (when all prose)
// Aggressiveness: prose; assumes no command lines or programming code
// This one assumes that there are no command lines or C programming
// code embedded in the text, so it can be a little more aggressively
// general.
// It calls commondashdash at the end for completeness.
// TODO: Find a more clever way than [^>][^>] to make sure that a "--"
//  directly followed by ">" is left alone (presumably to protect the
//  "-->" that ends an HTML comment - confirm).
function rmfCa_prosedashdash(editor) {
	// Earlier, broader attempt, kept for reference:
	// rmfCa_regex_reason(editor, /\s+--+\s*([^>][^>]|$)/g, ' &mdash; $1');
	// linestart/space + string of non-/ non-! + hyphens is dash
	// Avoiding ! so that text such as "<!--" is not mangled.
	// Avoiding > on the end for the same reason.
	// Avoiding slash to avoid possible idiocy like a--b in URLs.
	// Allows one space to survive after the dash.
	rmfCa_regex_reason(editor, /(^|\s[^\/!\s]+)--+(\s?)\s*([^>][^>]|$)/g, '$1&mdash;$2$3', '-- → &mdash;', '-- → &mdash; in prose');
	rmfCa_commondashdash(editor);
}

// commondashdash: multiple hyphens = &mdash; (whether safe or prose)
// Aggressiveness: safe
// This has the regexes that both obviousdashdash and prosedashdash
// (safe and less safe) call, so that we don't have to double-process
// "safe" things in obviousdashdash that have already been handled by
// more generalized matches in other dashdash handlers.
// Currently a deliberate no-op: its only rule is disabled (see below).
function rmfCa_commondashdash(editor) {
	// NOT pretty safe: at least one mdash/ndash in a row of mdash/ndash/hyphens
	// with no inner spaces
	// (We'll be brave and assume anyone who uses the table construct
	// |- + mdash/ndash with no space between deserves what they get.)
	// TODO: Find a way for this to NOT match text that must be left alone.
	// NOTE(review): the original TODO named what must not match, but that
	// text is missing here - confirm before enabling.
	// Commented out until then.
	// rmfCa_regex_reason(editor, /\s*(?:(?:&[mn]dash;|-|–|—)+(?:&[mn]dash;|–|—)+|(?:&[mn]dash;|–|—)+(?:&[mn]dash;|-|–|—)+)\s*/g, ' &mdash;', 'string of dashes → single &mdash;', 'string of dashes → single &mdash;');
}

// htmltypos: fix common HTML typos and screwups
// Aggressiveness: safe
function rmfCa_htmltypos(editor) {
	// forgotten semicolon on HTML entities
	// The look-ahead consumes nothing, so the character after the entity is
	// kept (the previous pattern matched and dropped it). An entity name that
	// runs straight into a letter or digit (e.g. "&ltimes;") is left alone,
	// since it may be a different, longer entity.
	rmfCa_regex_reason(editor, /&([gl]t|[mn]dash|nbsp)(?![;\w])/ig, '&$1;', 'missing ; on HTML entity');
	// wrong capitalization on HTML entities
	// /g so every occurrence is fixed, not just the first.
	rmfCa_regex_reason(editor, /&M(?:dash|DASH);/g, '&mdash;', '&MDASH→&mdash;');
	rmfCa_regex_reason(editor, /&N(?:dash|DASH);/g, '–', '&NDASH→–');
	// Write the entity itself: a literal blank here is indistinguishable from
	// an ordinary space in the source.
	rmfCa_regex_reason(editor, /&N(?:bsp|BSP);/g, '&nbsp;', '&NBSP→&nbsp;');
}

// NOTE(review): this whole definition sits on one physical line behind a
// leading "//", so as written none of it executes.
// NOTE(review): the text below looks truncated and is not valid JavaScript:
// "rmfCa_unlinkfulldates_template(editor, , );" has empty argument slots
// (presumably two empty strings for prefix/suffix - confirm), and the active
// patterns begin "/\[\u(?:", "/\[\3[01])" and "/\[\1[012]?)" with unbalanced
// groups, so their opening portions are missing. Recover the original
// patterns before re-enabling this function.
// unlinkfulldates_template builds each replacement as prefix + date + suffix.
// unlinkfulldates: remove   from full dates // UNTESTED but complete // Aggressiveness: mostly OK if it's this MediaWiki's house policy // Does not recognize any way of marking dates as "supposed to be linked". // However, it only unlinks FULL dates, not isolated Year or Month-Day. // Note: Only converts month names if English // TODO: Make it skip prefix/suffix for when on a date= parameter. function rmfCa_unlinkfulldates(editor) { rmfCa_unlinkfulldates_template(editor, , ); } function rmfCa_unlinkfulldates_template(editor, prefix, suffix) { // U.S. style: January 1(of) 2345 or 2345 rmfCa_regex_reason(editor, /\[\u(?:ne?|ly?))|Feb(?:ruary)?|Ma(?:r(?:ch)?|y)|A(?:pr(?:il)?|ug(?:ust)?)|Sep(?:t(?:ember)?)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)[ \t]+([0-2]?\d|3[01])(?:st|[nr]?d|th)?(?:[ \t]*\]\])?[,\s]+(?:(?:in|of)\s+)?(?:\[\[[ \t]*)?(\d{3,4})[ \t]*\]\]/mig, prefix+'$1 $2, $3'+suffix, 'unlink U.S. dates', '[[MOS:UNLINKDATES');	// European style: 1 January(of) 2345 or 2345	rmfCa_regex_reason(editor, /\[\3[01])(?:st|[nr]?d|th)?[ \t]+(J(?:an(?:uary)?|u(?:ne?|ly?))|Feb(?:ruary)?|Ma(?:r(?:ch)?|y)|A(?:pr(?:il)?|ug(?:ust)?)|Sep(?:t(?:ember)?)?|Oct(?:ober)?|(?:Nov|Dec)(?:ember)?)(?:[ \t]*\]\])?[,\s]+(?:(?:in|of)\s+)?(?:\[\[[ \t]*)?(\d{3,4})[ \t]*\]\]/mig, prefix+'$1 $2 $3'+suffix, 'unlink European dates', '[[MOS:UNLINKDATES');	//  ISO 8601: 2345-06-07 or 2345-06-07	//  rmfCa_regex_reason(editor, /\[\[\s*(\d\d\d\d)(?:\s*\]\])?-(?:\[\[\s*)(\d\d?)(?:\s*\]\])?-(?:\[\[\s*)(\d\d?)\s*\]\]/g, prefix+'$1-$2-$3'+suffix, 'unlink ISO dates', 'MOS:UNLINKDATES');	rmfCa_regex_reason(editor, /\[\1[012]?)[ \t]*-[ \t]*([0-2]?\d|3[01])[ \t]*\]\]/mg, prefix+'$1-$2-$3'+suffix, 'unlink ISO-8601 dates', '[[MOS:UNLINKDATES'); }

// obviousreferences: references section changes when obviously correctable
// Scope: Wikipedia - requires the {{reflist}} template to exist on wiki
// Aggressiveness: safe
// UNTESTED
function rmfCa_obviousreferences(editor) {
	// "References" or "Sources" section header, followed immediately by
	// either {{reflist}} (with or without parameters) or a <references ...>
	// tag that has attributes, gets the following cleanups:
	// * "Sources" section name changes to "References"
	// * equals signs and spaces get balanced on that section header
	// Whatever follows the header (group 3) is passed through unchanged.
	// May match even when not making changes, so it deliberately bypasses
	// rmfCa_regex_reason and leaves the edit summary alone.
	editor.replace(/^(=+)([ \t]?)[ \t]*(?:Refe?ren|Sour)ces?[ \t]*=+[ \t]*$\s*(\{\{reflist[^<>}]*\}\}|<\s*references(?:\s+[^\/<>\s][^>]+>))/gim, '$1$2References$2$1\n$3');
	// "References" or "Sources" section header, followed immediately by a
	// <references/> tag with no parameters, gets the same as above, plus the
	// tag is turned into {{reflist}}. The pattern consumes the tag, so the
	// replacement must emit its substitute; otherwise the section is left
	// with no reference list at all.
	rmfCa_regex_reason(editor, /^(=+)([ \t]?)[ \t]*(?:Refe?ren|Sour)ces?[ \t]*=+[ \t]*$\s*<\s*references[\/\s]*>/gim, '$1$2References$2$1\n{{reflist}}', 'References section fixup');
}

// unsubstreflist: strip the <div ...> / <small> wrappers that sit directly
//  around a <references/> tag or a {{reflist}} call, leaving just the
//  reference list itself
// Scope: Wikipedia - requires the {{reflist}} template to exist on wiki
// Aggressiveness: slightly; see warning below
// UNTESTED
//
// Warning: This pattern doesn't know how to BALANCE start and ending
// div and small tags - it just removes matching ones contiguous
// before and after references. So if someone is silly enough to put
// those tags contiguous to the references on one side but not the other,
// this pattern will blast the ones contiguous to the references anyway,
// causing the tags to become unbalanced!
// Note: The reference list itself (group 1), including any parameters, is
//  passed through unchanged; no parameter conversion is attempted.
function rmfCa_unsubstreflist(editor) {
	// Two repairs to the pattern:
	// * "references-[-\w]+" - the class suffix may be longer than one
	//   character (e.g. references-small)
	// * "\/\s*>" - optional whitespace before ">", where the old "[\s*]"
	//   demanded exactly one character and so never matched "<references/>"
	rmfCa_regex_reason(editor, /(?:<(?:div(?:\s+(?:class="(?:\s*references-[-\w]+)+"|style="(?:[-\w]*column-[-\w]*:[\w\s]*;)\s*"))*|small)>\s*)+(<\s*references[^\/<>]*\/\s*>|\{\{reflist[^\}<>]*\}\})(?:\s*<\s*\/\s*(div|small)\s*>)+/ig, '$1', 'unsubst/cleanup ');
	// optional: run rmfCa_obviousreferences now that References section
	// might have gotten cleaned up.
	rmfCa_obviousreferences(editor);
}

// NOTE(review): the first physical line below is entirely behind a leading
// "//" and does not execute; the second physical line begins in the middle
// of a string literal ("state)');"), so the text is not valid JavaScript as
// it stands.
// NOTE(review): the replacement strings contain plain "Washington" and some
// summaries contain an unbalanced "[[", which suggests the wikilink targets
// were lost (presumably links to "Washington (U.S. state)" - confirm). As
// written, a match would drop the link rather than retarget it.
// NOTE(review): "[^]]" in several patterns does not mean "anything but ]"
// in JavaScript ("[^]" matches any character); "[^\]]" is probably intended.
// washington_obvious: disambig obvious meanings of Washington // IN TESTING function rmfCa_washington_obvious(editor) { // State of Washington // Warning: Assumes Washington is an old link to the state; could nail // "state of Washington, D.C." or "state of Washington's mind" // in a sentence if Washington was improperly linked to begin with. rmfCa_regex_reason(editor, /([Ss])tate(s?) of \[\[Washington\]\]/g, '$1tate$2 of Washington', 'state of Washington (U.S. state)', 'disambig Washington (U.S. state)'); // Governor of Washington // Same as previous one, basically, but not much chance of false positives. rmfCa_regex_reason(editor, /([Gg])overnor(s?) of \[\[Washington\]\]/g, '$1overnor$2 of Washington', 'governor of Washington (U.S. state)', 'disambig Washington (U.S. state)'); // State of Wa(sh(ington)) // rmfCa_regex_reason(editor, /\[\[Washington\s*\|\s*([Ss]tate of W[Aa](?:sh(?:ington))\.?)/ig, '$1', 'Washington|State of $1', 'disambig [[Washington (U.S. state)');	//  something containing "state"	rmfCa_regex_reason(editor, /\[\[Washington\s*\|\s*([^]]*state)/ig,  '$1', 'Washington|state', 'disambig [[Washington (U.S. state)');	//  Washington state	rmfCa_regex_reason(editor, /\[\[Washington\]\] ([Ss])tate\b/g, 'Washington $1tate', 'Washington state', 'disambig Washington state');	//  WA or Wa. or Wa(sh). State	// (but not "Wash." by itself)	rmfCa_regex_reason(editor, /\[\[Washington\s*\|\s*(W(?:[Aa](?:\.?|(?:sh(?:ington))\.? [Ss]tate))(\s+[^]]+)?)\s*\]\]/g, '$1', 'disambig Washington (U.S. state) abbreviation', 'disambig Washington (U.S. state)'); // Seattle, Washington (a U.S. city not requiring state name) rmfCa_regex_reason(editor, /\[\[\s*Seattle(?:,? ?(?:WA|Washington)?(?:\|\s*Seattle\s*)?)?\]\]\s*(,|\sin)\s*\[\[Washington\]\]/g, 'Seattle$1 Washington', 'disambig Seattle, Washington (U.S. 
state)'); // Something,/in Washington besides Seattle rmfCa_regex_reason(editor, /\[\[\s*([^]]+), W(?:A|ashington)\|\s*\1\s*\]\]\s*(,|\sin)\s*\[\[Washington\]\]/g, '$1$2 Washington', 'X,/in Washington (U.S. state)', 'disambig Washington (U.S. state)'); // some western state "&"/"and"/"or" some direction in Washington // Working: tested on Oregon Penutian languages rmfCa_regex_reason(editor, /((?:(?:Alask|British Columbi|Montan)a|Idaho|Oregon|Utah)(?:\]\])?,?\s+(?:&|and|or)\s+(?:(?:[Nn]or|[Ss]ou)th-?)?(?:(?:[Ee]a|[Ww]e)st)?ern)\s+\[\[Washington\]\]/g, '$1 Washington', 'state and E/N/S/W Washington', 'disambig Washington (U.S. state)'); }

// NOTE(review): this whole definition sits on one physical line behind a
// leading "//", so as written none of it executes.
// NOTE(review): both active replacements emit plain "Washington"
// ('$1Washington$2' and '$1 Washington') although the summaries say
// "disambig"; presumably the wikilink targets were lost from these strings
// (the state in the first rule, Washington, D.C. in the second - confirm).
// As written, a match would unlink the word instead of retargeting it.
// The third rule (TOCStates header) is commented out by its author as failing.
// washington_loose: disambiguations with less reliability // IN TESTING function rmfCa_washington_loose(editor) { // fix parameters like // |*state=Washington -> |*state=Washington rmfCa_regex_reason(editor, /(\|\s*\w+\s*[Ss]tate\d*\s*=\s*)(?:Washington|\[\[Washington\]\])(\s|\|)/g, '$1Washington$2', 'Washington in state parameter', 'disambig Washington (U.S. state) in parameters'); // Washington D.C.: Ambassador of/embassy in Washington rmfCa_regex_reason(editor, /([Aa]mbassador\s+(?:in|of|to)|(?:[Cc]onsulate|[Ee]mbassy)\s+in)\s+\[\[Washington\]\]/g, '$1 Washington', 'ambassador/consulate/embassy in Washington, D.C.', 'disambig Washington, D.C.'); // { { TOCStates } } using Washington as a title // Failing: Can't seem to get newline to work before == // rmfCa_regex_reason(editor, /(\{\{TOCStates\}\}.*\s)(==+)\s*\[\[\s*Washington\s*\]\]\s*\2/, '$1$2Washington$2', 'TOCStates header means Washington (U.S. state)', 'disambig Washington, D.C.'); } //