// User:V111P/js/wikiParserV.js

/* * wikiParserV.js * ver. 2013-11-02 * Home: http://en.wikipedia.org/wiki/User:V111P/js/wikiParserV * * This is a library of useful functions, mostly for working with wiki code. * Includes functions for removing html tags. * * You can use the code in this script under the * Creative Commons Attribution 3.0 Unported License (CC-BY 3.0) * http://creativecommons.org/licenses/by/3.0/ * If you do use it, please let me know. Thanks. */

mediaWiki.libs.wikiParserV = window.wikiParser = (function {	"use strict";

var version = 1000; var re = { escForRegExpG: /[.*+?^$|[\]{\\^$]/g, testRe: /<(?!\/?(a|b)>)/g, nonAlphanumericAndHyphenCharsG: /[^A-Za-z0-9_-]/g, htmlCommentsG: /(\n)?\1?/g // replace it with $1 };	var locale = {}; // used in removeElements var $tempDiv = $(' '); // used in unescapeCharEntities var wgScriptPath; var sectionNameUriEncodingAdditionalReplacements;

// Decodes the character references in str (for example "&amp;") by loading
// the string into the shared $tempDiv element and reading its text back.
// Every "<" and ">" is escaped first, so nothing in str is parsed as markup.
// Returns the decoded string.
function unescapeCharEntities(str) {
	// a string pattern would only replace the first occurrence, so use
	// global regular expressions
	var markupFree = str.replace(/</g, '&lt;').replace(/>/g, '&gt;');
	return $tempDiv.html(markupFree).text();
}

// Builds a URL for a wiki page.
//   article - page title, optionally followed by "#section"
//   noredir - if truthy, return an index.php URL with redirect=no
//   edit    - if truthy (and noredir is not), return an index.php edit URL
// Otherwise a "/wiki/..." URL is returned.
// Spaces become underscores. When a "#section" part is present, the title
// and the section are URI-encoded and the "%" signs of the section are
// turned into "." characters.
function formatUrl(article, noredir, edit) {
	wgScriptPath = mw.config.get('wgScriptPath');
	article = article.replace(/ /g, '_');

	// keep the fragment apart from the title so that query parameters
	// can be placed before it
	var fragment = '';
	var pagePlusHash = article.match(/(.+)#(.+)/);
	if (pagePlusHash) {
		article = encodeURIComponent(pagePlusHash[1]);
		fragment = '#' + encodeURIComponent(pagePlusHash[2]).replace(/%/g, '.');
	}

	if (noredir)
		return wgScriptPath + '/index.php?title=' + article + '&redirect=no' + fragment;
	if (edit)
		return wgScriptPath + '/index.php?title=' + article.replace(/#.*/, '') + '&action=edit';
	return '/wiki/' + article + fragment;
} // formatUrl

// Encodes a section name for use as the fragment part of a URL:
// spaces become underscores, the result is URI-encoded, a few characters
// that encodeURIComponent leaves alone are replaced by ".XX" codes,
// "%3A" is turned back into ":" and every remaining "%" becomes ".".
function encodeSectionNameForUrl(str) {
	// the replacement table is built on first use and then shared
	var replacements = sectionNameUriEncodingAdditionalReplacements;
	if (!replacements) {
		replacements = sectionNameUriEncodingAdditionalReplacements = [
			{re: /~/g, newVal: '.7E'},
			{re: /!/g,  newVal: '.21'},
			{re: /\*/g, newVal: '.2A'},
			{re: /\(/g, newVal: '.28'},
			{re: /\)/g, newVal: '.29'},
			{re: /\'/g, newVal: '.27'},
			{re:/%3A/g, newVal: ':'}
		];
	}

	var encoded = encodeURIComponent(str.replace(/ /g, '_'));
	for (var k = 0; k < replacements.length; k++) {
		encoded = encoded.replace(replacements[k].re, replacements[k].newVal);
	}

	return encoded.replace(/%/g, '.');
} // encodeSectionNameForUrl

// Encodes a section name into a string made only of letters, digits,
// "_" and "-".
// Literal dots are written as "_46" before the name goes through
// encodeSectionNameForUrl, every colon is written as "_3A", and any other
// remaining character outside the allowed set is replaced by "_".
function encodeSectionNameForId(str) {
	str = encodeSectionNameForUrl(str.replace(/\./g, '_46'))
		// global, so that every colon (not only the first) is encoded alike
		.replace(/:/g, '_3A')
		.replace(re.nonAlphanumericAndHyphenCharsG, '_');
	return str;
} // encodeSectionNameForId

// Returns str with a backslash inserted in front of every character
// matched by re.escForRegExpG, so that the result can be embedded
// in a regular expression source string.
function escapeForRegExp(str) {
	return str.replace(re.escForRegExpG, function (specialChar) {
		return '\\' + specialChar;
	});
} // escapeForRegExp

// Builds the regular expressions used by removeEls to remove elements
// delimited by startTag and endTag.
//   startTag, endTag    - literal delimiters; startTag may contain the
//                         "<<<a@@@b>>>" alternation placeholder produced
//                         by removeElRegExpStartArr
//   startTagOfEmbededEl - (optional) start tag of elements that can be
//                         embedded and that share the same closing tag
// Returns {pretreat, main}:
//   pretreat - null unless startTagOfEmbededEl is given; matches an element
//              up to and including one embedded element, capturing the part
//              before the embedded element as $1
//   main     - matches an innermost element, with an optional preceding
//              newline captured as $1 and an optional identical trailing one
function removeElRegExp(startTag, endTag, startTagOfEmbededEl) {
	var res = {pretreat: null, main: null};
	// NOTE(review): the placeholder expansion below was partly missing from
	// the reviewed copy; it has been restored to match the "<<<", "@@@" and
	// ">>>" markers written by removeElRegExpStartArr - confirm upstream.
	var startTagEsc = escapeForRegExp(startTag)
		.replace(/<<</g, '(?:')
		.replace(/@@@/g, '|')
		.replace(/>>>/g, ')');
	var endTagEsc = escapeForRegExp(endTag);

	// pretreat for embeded elements with the same closing tag
	if (startTagOfEmbededEl) {
		var startTagOfEmbededElEsc = escapeForRegExp(startTagOfEmbededEl);
		res.pretreat = new RegExp('(' + startTagEsc + '(?:(?!' + endTagEsc + ')[\\S\\s])*?)'
			+ startTagOfEmbededElEsc + '(?:(?!' + startTagOfEmbededElEsc + ')[\\S\\s])*?'
			+ endTagEsc, 'gi');
	}

	res.main = new RegExp('(\\n)?' + startTagEsc + '((?!' + startTagEsc + '|' + endTagEsc + ')[\\S\\s])*'
		+ endTagEsc + '\\1?', 'gi');
	return res;
} // removeElRegExp

// Like removeElRegExp, but for elements whose start tag has several
// alternative forms: startTagPre + (one of startTagArr) + startTagPost.
// The alternatives are joined into a "<<<a@@@b>>>" placeholder and passed
// on to removeElRegExp as the start tag.
// startTagOfEmbededEl - needed because for example files and wiki links have
// the same closing tags, so to remove files, pass '[[File:' as startTag and
// '[[' as startTagOfEmbededEl
function removeElRegExpStartArr(startTagPre, startTagArr, startTagPost,
								endTag, startTagOfEmbededEl) {
	var st = startTagPre + '<<<' + startTagArr.join('@@@') + '>>>' + startTagPost;
	return removeElRegExp(st, endTag, startTagOfEmbededEl);
} // removeElRegExpStartArr

// Repeatedly applies the regular expressions in res to data until the text
// stops changing, replacing every match with its first capture group.
//   data           - the text to process
//   res            - object with a `main` RegExp and an optional `pretreat`
//                    RegExp (as returned by removeElRegExp)
//   iterationLimit - (optional) maximum number of passes per expression,
//                    1000 if omitted or falsy
// Returns the processed text.
function removeEls(data, res, iterationLimit) {
	var prev, cntr;
	iterationLimit = iterationLimit || 1000;

	if (res.pretreat) {
		cntr = iterationLimit;
		do {
			cntr--; // anti infinite-loop var just in case...
			prev = data;
			data = data.replace(res.pretreat, '$1');
		} while (data !== prev && cntr > 0);
	}

	cntr = iterationLimit;
	do {
		cntr--;
		prev = data;
		data = data.replace(res.main, '$1');
	} while (data !== prev && cntr > 0);

	return data;
} // removeEls

// saves all versions of some namespace names
// Reads the 'wgNamespaceIds' map from mw.config and stores the names that
// map to the special (-1), file/media (6 and -2) and category (14)
// namespace ids in locale.specialNsArr, locale.fileNsArr and
// locale.categoryNsArr.
function saveNsNames() {
	locale.specialNsArr = [];
	locale.fileNsArr = [];
	locale.categoryNsArr = [];

	$.each(mw.config.get('wgNamespaceIds'), function (key, val) {
		var id = Number(val);
		var target = null;
		if (id === -1) // 'special'
			target = locale.specialNsArr;
		else if (id === 6 || id === -2) // 'file'/'image' or 'media'
			target = locale.fileNsArr;
		else if (id === 14) // 'category'
			target = locale.categoryNsArr;

		if (target && $.inArray(key, target) == -1)
			target.push(key);
	});
} // saveNsNames

// Replaces nowiki markup in data with character references:
//  - an empty nowiki element placed next to a character listed in
//    nowikiCharTranslMap escapes that character;
//  - <nowiki>...</nowiki> and <pre>...</pre> elements are replaced by their
//    contents with every listed character escaped.
// won't work in all cases
// See en.wikipedia.org/wiki/Help:Nowiki#WP:NOWIKI
function escCharsForNowikiTags(data) {
	var nowikiCharTranslMap = {
		'[': '&#91;', ']': '&#93;', '{': '&#123;', '}': '&#125;',
		'<': '&lt;', '>': '&gt;', ':': '&#58;', '*': '&#42;', '#': '&#35;'
	};

	// $1 - the character before the empty nowiki element (or nothing at the
	// start of the text), $2 - the character after it.
	// NOTE(review): the "<" that opens the tag was missing from this pattern
	// in the reviewed copy and has been restored - confirm against upstream.
	var singleCharEscReG = re.singleCharEscG
		|| (re.singleCharEscG = /(.|^)<(?:nowiki ?\/|nowiki><\/nowiki)>(.)/g);
	data = data.replace(singleCharEscReG, function (m, $1, $2) {
		if ($1 == '<')
			return '&lt;' + $2;
		else if (nowikiCharTranslMap[$2])
			return $1 + nowikiCharTranslMap[$2];
		else if (nowikiCharTranslMap[$1])
			return nowikiCharTranslMap[$1] + $2;
		// nothing to escape: drop the tag but keep both characters
		// (returning nothing here would insert the text "undefined")
		return $1 + $2;
	});

	var noWikiElReG = re.noWikiElG
		|| (re.noWikiElG = /<(nowiki|pre)>([\S\s]*?)<\/\1>/g);
	var noWikiReplaceCharsReG = re.noWikiReplG
		|| (re.noWikiReplG = /\[|]|\{|}|<|>|:|\*|#/g);
	data = data.replace(noWikiElReG, function (match, $1, $2) {
		return $2.replace(noWikiReplaceCharsReG, function (ch) {
			// look up the matched character, not the whole element content
			return nowikiCharTranslMap[ch];
		});
	});

	return data;
} // escCharsForNowikiTags

// Removes the kinds of elements listed in elStr from the wiki code in data.
//   data  - wiki code
//   elStr - comma+space separated list of any of: 'comments', 'tables',
//           'templates', 'references', 'files', 'categories', 'bold/italic',
//           'behavior switches', 'others'
// Returns the processed text.
// NOTE(review): several delimiter strings and pattern fragments in this
// function were missing from the reviewed copy (markup-like text had been
// stripped). They have been restored from the names of the cached patterns
// and the surrounding calls - confirm each one against the upstream script.
function removeElements(data, elStr) {
	var arr = elStr.split(', ');

	if ($.inArray('comments', arr) > -1)
		data = data.replace(re.htmlCommentsG, '$1');

	if ($.inArray('tables', arr) > -1) {
		data = removeEls(data, re.wikiTable
			|| (re.wikiTable = removeElRegExp('{|', '|}')));
		data = removeEls(data, re.htmlTable
			|| (re.htmlTable = removeElRegExp('<table', '</table>')));
	}

	if ($.inArray('templates', arr) > -1)
		data = removeEls(data, re.templates
			|| (re.templates = removeElRegExp('{{', '}}')));

	if ($.inArray('references', arr) > -1)
		data = data.replace(re.refs
			|| (re.refs = /<ref[^>]*?(\/>|>[\S\s]*?<\/ref\s*>)/ig), '');

	if ($.inArray('files', arr) > -1) {
		if (!locale.fileNsArr)
			saveNsNames();
		// '[[' is also passed as the embedded start tag because links
		// inside a file element end with the same ']]'
		data = removeEls(data, re.files
			|| (re.files = removeElRegExpStartArr('[[', locale.fileNsArr, ':', ']]', '[[')));
		data = data.replace(re.gallery
			|| (re.gallery = /(\n)?<gallery[^>]*>[\S\s]*?<\/gallery>\1?/gi), '$1');
	}

	if ($.inArray('categories', arr) > -1) {
		if (!locale.categoryNsArr)
			saveNsNames();
		data = removeEls(data, re.category
			|| (re.category = removeElRegExpStartArr('[[', locale.categoryNsArr, ':', ']]')));
	}

	if ($.inArray('bold/italic', arr) > -1) {
		// html tags, two or three apostrophes, or the same as &#39; references
		data = data.replace(re.boldItalicG
			|| (re.boldItalicG = /<\/?(i|b|strong|em)>|'''?|(&#39;){2,3}/gi), '');
	}

	if ($.inArray('behavior switches', arr) > -1) {
		data = data.replace(re.behaviorSwitchesG
			|| (re.behaviorSwitchesG = /(\n)?__[^\s]+?__\1?/g), '$1');
	}

	if ($.inArray('others', arr) > -1) {
		data = data.replace(re.timelineG
			|| (re.timelineG = /(\n)?<timeline>[\S\s]*?<\/timeline>\1?/gi), '$1');
	}

	return data;
} // removeElements;

// Removes wikilink markup from data, leaving only the link labels:
// the target and the pipe of piped links are dropped first, then the
// enclosing double square brackets of every link are removed.
// all files must be removed BEFORE calling this function
function unlink(data) { // remove all wikilinks and files
	var prev, cntr = 1000;
	// NOTE(review): this pattern was unreadable in the reviewed copy and has
	// been rebuilt from the comment below ("remove addresses from all links")
	// and the '[[' replacement - confirm against the upstream script.
	// The cache key now matches the one that is read, so the pattern is
	// really reused between calls.
	var remAddrReG = re.remAddrG
		|| (re.remAddrG = /\[\[[^\[\]|]*\|/g);
	var unlinkLinksReG = re.unlinkLinksReG
		|| (re.unlinkLinksReG = /\[\[([^\]\[]+)\]\]/g);

	do {
		cntr--;
		prev = data;
		// remove addresses from all links:
		data = data.replace(remAddrReG, '[[');
	} while (data != prev && cntr > 0);

	// unlink all links:
	data = data.replace(unlinkLinksReG, '$1');
	return data;
} // unlink

// Converts wiki bold (three apostrophes) and italic (two apostrophes)
// markup in data into <b> and <i> elements. Markup that is not closed on
// its line ends at the line break.
// NOTE(review): the apostrophe runs in the three patterns and the tags in
// the replacements were missing from the reviewed copy. They have been
// rebuilt from the comment on the first pattern and from the function name;
// in particular, keeping the line break after unclosed markup is an
// assumption - confirm against the upstream script.
function boldAndItalicToHtml(data) {
	if (!re.boldAndItalicToHtml1) {
		// the first regex removes four, six, or more apostrophes
		re.boldAndItalicToHtml1 = /(^|[^'])''''('{2,})?([^']|$)/g;
		re.boldAndItalicToHtml2 = /'''([^'\n][^\n]*?)('''|\n)/g;
		re.boldAndItalicToHtml3 = /''([^\n]+?)(''|\n)/g;
	}

	// Builds a replacer that wraps the captured text in the given tag and
	// puts back the line break if that is what ended the match.
	function wrapIn(tag) {
		return function (match, text, terminator) {
			return '<' + tag + '>' + text + '</' + tag + '>'
				+ (terminator === '\n' ? '\n' : '');
		};
	}

	return data.replace(re.boldAndItalicToHtml1, '$1$3')
		.replace(re.boldAndItalicToHtml2, wrapIn('b'))
		.replace(re.boldAndItalicToHtml3, wrapIn('i'));
} // boldAndItalicToHtml

// Returns the part of data that precedes the first section heading
// (section titles start with = at the beginning of a line).
// If there are no sections, the whole text is returned - with the
// categories removed if removeCategories is truthy.
function beforeTheFirstSection(data, removeCategories) {
	// the heading may be on the very first line or follow a line break
	var beforeFirstSectRe = re.beforeFirstSect
		|| (re.beforeFirstSect = /^([\S\s]*?)(?=(?:^|\n)(=+).+?\2[^\S\n]*(?:\n|$))/);
	var tempArr = beforeFirstSectRe.exec(data);

	// test the match itself: the text before the first heading can
	// legitimately be an empty string
	if (tempArr)
		return tempArr[1];

	return (removeCategories ? removeElements(data, 'categories') : data);
} // beforeTheFirstSection

// Splits wiki code into sections.
// Returns an array of objects {eq, level, heading, contents}:
//   eq       - the run of "=" characters of the heading ('' for the lead)
//   level    - the length of eq (0 for the lead)
//   heading  - the trimmed heading text ('' for the lead)
//   contents - the text of the section; for the lead this is the result of
//              beforeTheFirstSection, for the other sections it is trimmed
// At most 1000 headings are processed.
function divideSections(data) {
	var sections = [];
	sections.push({
		eq: '',
		level: 0,
		heading: '',
		contents: beforeTheFirstSection(data, false)
	});

	var match;
	var regex = re.divSectionsG
		|| (re.divSectionsG = /(^|\n)(=+)(.+?)\2[^\S\n]*(?=\n)([\S\s]*?)(?=\n(=+).+?\5[^\S\n]*(?:\n|$)|$)/g);
	// the cached regex is global: start from the beginning even if an
	// earlier call stopped at the iteration limit
	regex.lastIndex = 0;

	var cntr = 1000;
	while (cntr > 0 && (match = regex.exec(data))) {
		cntr--;
		sections.push({
			eq: match[2],
			level: match[2].length,
			heading: $.trim(match[3]),
			contents: $.trim(match[4])
		});
	}

	return sections;
} // divideSections

// Returns true if re.testRe (a pattern with a negative lookahead) gives the
// expected result on a probe string, i.e. if lookaheads work as needed.
// NOTE(review): the probe string and the tags in the expected result were
// missing from the reviewed copy; the pair below was rebuilt to agree with
// the text that survived and with re.testRe - confirm against upstream.
function checkRegexSupport() {
	return ('<a><b<b>d</e>'.replace(re.testRe, '&lt;') == '<a>&lt;b<b>d&lt;/e>');
}

// removes html tags and comments, except
// for the tags in the comma+space-separated whiteListTagsStr list
// Removes all the attributes from the white-listed tags.
// Unless leaveSpecialChars is truthy, the remaining special characters are
// html-escaped: < and > that are not part of a white-listed tag, & that
// does not start a character reference, and the ", ' and ` quotes.
// Throws 1 if the needed regex features are not supported and 2 if the
// text does not stop changing within the iteration limit.
// The tag names in whiteListTagsStr are inserted into regular expressions
// as they are, so they must not come from untrusted input.
// NOTE(review): three pattern literals (grThanNotAndAfterWLTagG, ltReG and
// gtReG) were missing from the reviewed copy and have been rebuilt from the
// way they are used below - confirm against the upstream script.
function sanitizeHtml(data, whiteListTagsStr, leaveSpecialChars) {

	if (!checkRegexSupport())
		throw 1; // no (lookahead) regex support

	var whiteList = (whiteListTagsStr || '').split(', ').join('|');
	var nonWhiteListedTagsReG, allTagsG;
	var lessThanNotBeforeWLTagG;
	var grThanNotAndAfterWLTagG;
	var tagAttributesReG;
	var oldData, cntr;

	if (whiteList !== '') {
		// the patterns depend on the white list, so cache them per list
		var byAll = re.resByWhitelist = (re.resByWhitelist || {});
		var by = byAll[whiteListTagsStr] || (byAll[whiteListTagsStr] = {});

		nonWhiteListedTagsReG = by.nonWhiteListedTagsG
			|| (by.nonWhiteListedTagsG
				= new RegExp('<(?!/?(' + whiteList + ')(\\b|/))[^>]*>', 'gi'));
		lessThanNotBeforeWLTagG = by.lessThanNotBeforeWLTagG
			|| (by.lessThanNotBeforeWLTagG
				= new RegExp('<(?!/?(' + whiteList + ')/?>)', 'gi'));
		// group 1 is set only when the > closes a white-listed tag
		grThanNotAndAfterWLTagG = by.grThanNotAndAfterWLTagG
			|| (by.grThanNotAndAfterWLTagG
				= new RegExp('(</?(' + whiteList + ')/?)?>', 'gi'));
		tagAttributesReG = re.tagAttributesG
			|| (re.tagAttributesG = /<(\/?[a-z][a-z0-9]*)[^>]*?(\/)?>/gi);
	}
	else
		allTagsG = re.allTagsG || (re.allTagsG = /<(\b|\/)[^>]*>/g);

	// repeat until stable: removing one tag can expose another
	cntr = 1000;
	do {
		oldData = data;
		cntr--;
		// remove comments:
		data = data.replace(re.htmlCommentsG, '$1');
		// remove all tags except the white-listed ones
		if (whiteList !== '') {
			data = data.replace(nonWhiteListedTagsReG, '');
			// remove all attributes from the remaining tags:
			data = data.replace(tagAttributesReG, '<$1$2>');
		}
		else
			data = data.replace(allTagsG, '');
	} while (oldData != data && cntr > 0);
	if (cntr <= 0)
		throw 2;

	if (!leaveSpecialChars) {
		var ampNotInCharRefReG = re.ampReG
			|| (re.ampReG = /&(?!#?[xX]?[a-zA-Z0-9]+;)/g);
		var ltReG = /</g;
		var gtReG = />/g;
		var quoteReG = /"/g;
		var aposReG = /'/g;
		var graveReG = /`/g;
		cntr = 1000;
		do {
			oldData = data;
			cntr--;
			if (whiteList !== '') {
				// html-escape all except if part of a whitelisted tag
				data = data.replace(lessThanNotBeforeWLTagG, '&lt;');
				data = data.replace(grThanNotAndAfterWLTagG, function ($0, $1) {
					return $1 ? $0 : '&gt;';
				});
			}
			else { // html-escape all chars
				data = data.replace(ltReG, '&lt;').replace(gtReG, '&gt;');
			}
			// escape & to &amp; if obviously not a part of a char ref:
			data = data.replace(ampNotInCharRefReG, '&amp;');
			// escape all quotes (` is used in old IE)
			data = data.replace(quoteReG, '&quot;').replace(aposReG, '&#39;')
				.replace(graveReG, '&#96;');
		} while (oldData != data && cntr > 0);
		if (cntr <= 0)
			throw 2;
	}

	return data;
} // sanitizeHtml

// Finds the segment of a known kind around the selection/cursor.
//   bsa          - [text_before_the_selection/cursor, selection, text_after]
//   segmentNames - array of segment names, or a comma+space separated string
// Only 'wikilink' is handled: if it is among the names, the result of
// focusedCustomSegment for double square brackets is returned; otherwise
// nothing (undefined) is returned.
// NOTE(review): the two delimiter arguments were missing from the reviewed
// copy; '[[' and ']]' are restored because the segment is a wikilink.
function focusedSegment(bsa, segmentNames) {
	segmentNames = (typeof segmentNames == 'object')
		? segmentNames
		: segmentNames.split(', ');
	for (var i = 0; i < segmentNames.length; i++) {
		if (segmentNames[i] == 'wikilink')
			return focusedCustomSegment(bsa, '[[', ']]', '', '[]<>{}');
	}
}

// Expands (or finds) the segment delimited by startChars and endChars that
// the selection or the cursor is in.
// bsa - an array with 3 elements: [text_before_the_selection/cursor, selection, text_after]
// the other arguments - the char(s) indicating the start/end of the segment
// otherStartChars (optional) - start chars of other segments with the same endChars,
//    needed only for some elements, for example if startChars is [[File:,
//    otherStartChars needs to be [[ because links can be embeded in file elements.
// invalidBeforePipe - a string with individual illegal characters. Illegal only if before
//    the first pipe character "|" (or anywhere, if there is no pipe character).
// Returns [before, segment, after], or nothing (undefined) if no valid
// segment is found.
function focusedCustomSegment(bsa, startChars, endChars, otherStartChars, invalidBeforePipe) {

	function endMatches(str, endChars) {
		return (str.slice(-endChars.length) === endChars);
	}

	function startMatches(str, startChars) {
		return (str.slice(0, startChars.length) === startChars);
	}

	var before = bsa[0];
	var selection = bsa[1]; // the selection
	var after = bsa[2];
	var spaces;
	var i;

	if (!startChars || !endChars)
		return;

	if (selection) { // there is some selected text
		spaces = selection.match(/^\s+/);
		if (spaces) { // spaces at the beginning of the selected text
			if (endMatches(before, startChars)) {
				selection = startChars + selection;
				before = before.slice(0, -startChars.length);
			}
			else {
				// move the spaces to the end of the text-before-the-selection:
				before += spaces[0];
				selection = selection.slice(spaces[0].length);
				// check for startChars at beginning of selection:
				if (!startMatches(selection, startChars))
					return;
			}
		}
		else {
			// while no (complete) startChars string at beginning of selection:
			// move a char from the end of textBefore to the beginning of selection
			var startCharsFound = false;
			for (i = 0; i <= startChars.length; i++) {
				if (startMatches(selection, startChars)) {
					startCharsFound = true;
					break;
				}
				if (before.length == 0)
					break;
				selection = before.slice(before.length - 1) + selection;
				before = before.slice(0, before.length - 1);
			}
			if (!startCharsFound)
				return;

			// TODO: check if selection contains only one outer element,
			//       and the start-end chars are balanced
		}

		spaces = selection.match(/\s+$/);
		if (spaces) { // spaces at the end of the selected text
			if (startMatches(after, endChars)) {
				selection = selection + endChars;
				after = after.slice(endChars.length);
			}
			else {
				// move spaces to the beginning of the text-after-the-selection:
				after = spaces[0] + after;
				selection = selection.slice(0, -spaces[0].length);
				if (!endMatches(selection, endChars))
					return;
			}
		}
		else {
			// while no (complete) endChars string found at end of selection:
			// move a char from the beginning of textAfter to the end of selection
			var endCharsFound = false;
			for (i = 0; i <= endChars.length; i++) {
				if (endMatches(selection, endChars)) {
					endCharsFound = true;
					break;
				}
				if (after.length == 0)
					break;
				selection = selection + after.charAt(0);
				after = after.slice(1);
			}
			if (!endCharsFound)
				return;
		}
	} // if (selection)
	else { // no text selected
		var text = before + after;
		// TODO: add a loop to allow the cursor to be after an embeded element
		var startCharsAt = text.lastIndexOf(startChars, before.length + startChars.length - 3);
		if (startCharsAt == -1)
			return;
		var closing = startCharsAt;
		var opening = startCharsAt;
		var openingOther;
		var segmentFound = false;
		i = 0;
		while (i++ < 10) {
			closing = text.indexOf(endChars, closing + 1);
			if (closing == -1) {
				return;
			}
			if (otherStartChars) {
				openingOther = text.indexOf(otherStartChars, opening);
			}
			opening = text.indexOf(startChars, opening + 1);
			if (opening == -1)
				opening = text.length;
			if (otherStartChars) {
				if (openingOther > -1)
					opening = (openingOther < opening ? openingOther : opening);
			}
			if (closing < opening) {
				if (closing < before.length - endChars.length) {
					return;
				}
				// the segment ends after the closing chars, so their length
				// (not the length of the start chars) is what must be added
				selection = text.slice(startCharsAt, closing + endChars.length);
				before = text.slice(0, startCharsAt);
				after = text.slice(closing + endChars.length);
				segmentFound = true;
				break;
			}
		}
		// nothing found within the iteration limit: there is no segment
		if (!segmentFound)
			return;
	}

	if (invalidBeforePipe) {
		var invalidEscForRe = escapeForRegExp(invalidBeforePipe);
		var beforePipe = selection.slice(startChars.length, -endChars.length).match(/[^|]*/)[0];
		if (beforePipe.match('[' + invalidEscForRe + ']'))
			return;
	}

	return [before, selection, after];
} // focusedCustomSegment

return { version: version, unescapeCharEntities: unescapeCharEntities, formatUrl: formatUrl, encodeSectionNameForUrl: encodeSectionNameForUrl, encodeSectionNameForId: encodeSectionNameForId, checkRegexSupport: checkRegexSupport, escCharsForNowikiTags: escCharsForNowikiTags, removeElRegExp: removeElRegExp, removeElRegExpStartArr: removeElRegExpStartArr, removeElements: removeElements, unlink: unlink, sanitizeHtml: sanitizeHtml, boldAndItalicToHtml: boldAndItalicToHtml, beforeTheFirstSection: beforeTheFirstSection, divideSections: divideSections, focusedCustomSegment: focusedCustomSegment, // incomplete implementation focusedSegment: focusedSegment // works only for wikilinks right now }; });