Wikipedia:AutoEd/unicodify.js

/**
 * AutoEd module: replaces HTML character references and stray Windows-1252
 * control characters in a piece of text with literal Unicode characters.
 *
 * The stages run in this order:
 *   1. named entities (&mdash;, &alpha;, ...);
 *   2. ASCII arrows ("<==", "<--", "==>");
 *   3. numeric references (&#233;, &#xE9;);
 *   4. C1 control characters U+0080-U+009F, reinterpreted as Windows-1252.
 *
 * &lt;, &gt; and &amp; are never decoded (see the notes in the body), and any
 * entity name that is not listed in the tables below is left untouched.
 *
 * @param {string} str Text to clean up.
 * @returns {string} The text with the replacements applied.
 */
function autoEdUnicodify(str) { //MAIN FUNCTION describes list of fixes
    var hasOwn = Object.prototype.hasOwnProperty;

    // Task 1: Replace named html entities with unicode

    // Entity names matched in any letter case, keyed by their lower-case
    // spelling. &hellip; maps to three ASCII full stops, not to U+2026.
    var anyCaseEntities = {
        // Most common replacements
        mdash: '—', ndash: '–',
        // XML and HTML symbols
        hellip: '...', plus: '+', plusmn: '±', minus: '−', times: '×',
        divide: '÷', ne: '≠', asymp: '≈', le: '≤', ge: '≥',
        quot: '"', apos: "'",
        iexcl: '¡', cent: '¢', pound: '£', curren: '¤', yen: '¥',
        brvbar: '¦', sect: '§', uml: '¨', copy: '©', ordf: 'ª',
        laquo: '«', not: '¬', reg: '®', macr: '¯', deg: '°',
        sup2: '²', sup3: '³', acute: '´', micro: 'µ', para: '¶',
        middot: '·', cedil: '¸', sup1: '¹', ordm: 'º', raquo: '»',
        frac14: '¼', frac12: '½', frac34: '¾', iquest: '¿',
        circ: 'ˆ', tilde: '˜',
        lsquo: '‘', rsquo: '’', sbquo: '‚', ldquo: '“', rdquo: '”',
        bdquo: '„', bull: '•', permil: '‰', lsaquo: '‹', rsaquo: '›',
        oline: '‾', frasl: '⁄', euro: '€', image: 'ℑ', weierp: '℘',
        real: 'ℜ', trade: '™', alefsym: 'ℵ', crarr: '↵',
        forall: '∀', part: '∂', exist: '∃', empty: '∅', nabla: '∇',
        isin: '∈', notin: '∉', ni: '∋', prod: '∏', sum: '∑',
        lowast: '∗', radic: '√', prop: '∝', infin: '∞', ang: '∠',
        and: '∧', or: '∨', cap: '∩', cup: '∪', int: '∫',
        there4: '∴', sim: '∼', cong: '≅', sub: '⊂', sup: '⊃',
        nsub: '⊄', sube: '⊆', supe: '⊇', oplus: '⊕', otimes: '⊗',
        perp: '⊥', sdot: '⋅', lceil: '⌈', rceil: '⌉', lfloor: '⌊',
        rfloor: '⌋', lang: '〈', rang: '〉', loz: '◊',
        spades: '♠', clubs: '♣', hearts: '♥', diams: '♦'
    };

    // Entity names where letter case selects the character (&Alpha; vs
    // &alpha;, &larr; vs &lArr;); these must match exactly as spelled here.
    var exactCaseEntities = {
        // Greek capitals
        Alpha: 'Α', Beta: 'Β', Gamma: 'Γ', Delta: 'Δ', Epsilon: 'Ε',
        Zeta: 'Ζ', Eta: 'Η', Theta: 'Θ', Iota: 'Ι', Kappa: 'Κ',
        Lambda: 'Λ', Mu: 'Μ', Nu: 'Ν', Xi: 'Ξ', Omicron: 'Ο',
        Pi: 'Π', Rho: 'Ρ', Sigma: 'Σ', Tau: 'Τ', Upsilon: 'Υ',
        Phi: 'Φ', Chi: 'Χ', Psi: 'Ψ', Omega: 'Ω',
        // Latin capitals
        Agrave: 'À', Aacute: 'Á', Acirc: 'Â', Atilde: 'Ã', Auml: 'Ä',
        Aring: 'Å', AElig: 'Æ', Ccedil: 'Ç', Egrave: 'È', Eacute: 'É',
        Ecirc: 'Ê', Euml: 'Ë', Igrave: 'Ì', Iacute: 'Í', Icirc: 'Î',
        Iuml: 'Ï', Ntilde: 'Ñ', Ograve: 'Ò', Oacute: 'Ó', Ocirc: 'Ô',
        Otilde: 'Õ', Ouml: 'Ö', Oslash: 'Ø', Ugrave: 'Ù', Uacute: 'Ú',
        Ucirc: 'Û', Uuml: 'Ü', Yacute: 'Ý', Scaron: 'Š', Yuml: 'Ÿ',
        ETH: 'Ð', THORN: 'Þ', OElig: 'Œ',
        // Capitalised symbols
        Dagger: '‡', Prime: '″',
        // Greek lower case
        alpha: 'α', beta: 'β', gamma: 'γ', delta: 'δ', epsilon: 'ε',
        zeta: 'ζ', eta: 'η', theta: 'θ', iota: 'ι', kappa: 'κ',
        lambda: 'λ', mu: 'μ', nu: 'ν', xi: 'ξ', omicron: 'ο',
        pi: 'π', rho: 'ρ', sigmaf: 'ς', sigma: 'σ', tau: 'τ',
        upsilon: 'υ', phi: 'φ', chi: 'χ', psi: 'ψ', omega: 'ω',
        thetasym: 'ϑ', upsih: 'ϒ', piv: 'ϖ',
        // Latin lower case
        szlig: 'ß', agrave: 'à', aacute: 'á', acirc: 'â', atilde: 'ã',
        auml: 'ä', aring: 'å', aelig: 'æ', ccedil: 'ç', egrave: 'è',
        eacute: 'é', ecirc: 'ê', euml: 'ë', igrave: 'ì', iacute: 'í',
        icirc: 'î', iuml: 'ï', eth: 'ð', ntilde: 'ñ', ograve: 'ò',
        oacute: 'ó', ocirc: 'ô', otilde: 'õ', ouml: 'ö', oslash: 'ø',
        ugrave: 'ù', uacute: 'ú', ucirc: 'û', uuml: 'ü', yacute: 'ý',
        thorn: 'þ', yuml: 'ÿ', oelig: 'œ', scaron: 'š', fnof: 'ƒ',
        // Lower-case symbols
        dagger: '†', prime: '′',
        // Arrows
        larr: '←', rarr: '→', uarr: '↑', darr: '↓',
        lArr: '⇐', rArr: '⇒', uArr: '⇑', dArr: '⇓',
        harr: '↔', hArr: '⇔'
    };

    // False positives, deliberately absent from both tables:
    //   &lt; / &gt; - decoding breaks large amounts of text which discusses
    //                 programming/scripting;
    //   &amp;       - decoding breaks a large number of URLs and discussion
    //                 of programming/scripting.

    // One pass over every "&name;" token. hasOwnProperty keeps names such as
    // "constructor" from resolving to members of Object.prototype.
    str = str.replace(/&([A-Za-z][A-Za-z0-9]*);/g, function (entity, name) {
        if (hasOwn.call(exactCaseEntities, name)) {
            return exactCaseEntities[name];
        }
        var lowered = name.toLowerCase();
        if (hasOwn.call(anyCaseEntities, lowered)) {
            return anyCaseEntities[lowered];
        }
        return entity; // unknown entity: leave as written
    });

    // ASCII arrows
    str = str.replace(/<==|<--/g, '←');
    str = str.replace(/==>/g, '→');

    // Task 2: Replace numeric html entities with unicode ( User:CharlotteWebb )
    // Symbols for which there may be a good reason to obfuscate/escape.
    // "&" is included because decoding &#38; would turn "&#38;lt;" into the
    // live entity "&lt;" - the same breakage that rules out &amp; above.
    var dont_replace = "|!{}[]=<>&";

    // Returns the character for code point `num`, or `entity` unchanged when
    // the code point must not be written out literally.
    function decodeCodePoint(entity, num) {
        // Keep the reference for NUL and other C0 controls (including tab and
        // line breaks), DEL, UTF-16 surrogates and values beyond U+10FFFF.
        // C1 controls (U+0080-U+009F) are let through so that Task 3 can
        // reinterpret them as Windows-1252.
        var writable = num >= 0x20 && num !== 0x7F && num <= 0x10FFFF &&
            !(num >= 0xD800 && num <= 0xDFFF);
        if (!writable) {
            return entity;
        }
        var chr;
        if (num > 0xFFFF) {
            // see UTF-16 for chars outside the BMP: encode as a surrogate pair
            var offset = num - 0x10000;
            chr = String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
        } else {
            chr = String.fromCharCode(num);
        }
        return dont_replace.indexOf(chr) === -1 ? chr : entity;
    }

    // Decimal references first, then hexadecimal ones ("&#x" or "&#X").
    str = str.replace(/&#(\d+);/g, function (entity, digits) {
        return decodeCodePoint(entity, parseInt(digits, 10));
    });
    str = str.replace(/&#x([\da-f]+);/gi, function (entity, hexDigits) {
        return decodeCodePoint(entity, parseInt(hexDigits, 16));
    });

    // Task 3: Unprintable control characters Windows-1252 from User:CharlotteWebb
    // Entry i is the replacement for U+0080 + i; positions with no
    // replacement character use failstr, i.e. the control character is removed.
    var failstr = "";
    var cp1252 = [
        '€', failstr, '‚', 'ƒ', '„', '…', '†', '‡',         // U+0080-U+0087
        'ˆ', '‰', 'Š', '‹', 'Œ', failstr, 'Ž', failstr,     // U+0088-U+008F
        failstr, '‘', '’', '“', '”', '•', '–', '—',         // U+0090-U+0097
        '˜', '™', 'š', '›', 'œ', failstr, 'ž', 'Ÿ'          // U+0098-U+009F
    ];
    str = str.replace(/[\u0080-\u009f]/g, function (ctrl) {
        return cp1252[ctrl.charCodeAt(0) - 0x80];
    });

    return str;
}