User:Verdy p/tablesort.js

/* Please DON'T delete, this is only in MY own user space, and it is NOT a test. See rule: CSD */ /* */ /* ===================================================================== Support for sorting text cells of tables in a much more consistent way. (c) 2010 Philippe Verdy : fr:Utilisateur:Verdy_p (http://fr.wikipedia.org/wiki/Utilisateur:Verdy_p).

Available under the following licences: (see details about these licence templates on http://commons.mediawiki.org)

This implements a 2-level collation, nearly compatible with UCA (see UTS#10 and CLDR for tailoring), but: (instead of their preferred collation weight). (actually it uses the ordering of UTF-16 code units, instead of code point values of UTF-32). This has the effect of sorting "a-a" before "aa", because they have the same primary and secundary subkeys, and they differ only on the ternary level. a very limited support for French reverse ordering of the ternary level.
 * It is still using a the binary ordering of characters used in subkeys at each level
 * Ternary differences are all left in the last default level that uses the binary ordering of characters
 * It is incomplete in the decomposition mappings of graphemes.
 * It is almost completely locale-neutral (not tailored for any different locales), except that it contains

But despite these defects, it gives much better results than the default binary order of UTF-16 code units (the only option natively implemented in Javascript which makes it a level-0 collation), or than a simple 1-level collation with case mapping only (currently used in wikibits.js).

A complete UCA implementation, using true weights from the Unicode DUCET and possible per-locale tailoring would require much more work (but would produce more compact sort keys). This implementation is very fast and its code is simple enough and small, that it can be a full replacement of the current ts_toLowerCase function used to generate sort keys for text cells.

It does not impact the existing non-text (number, currency, date) ordering types of Mediawiki tables.

=
========================================================== */

/* Utility */ function ts_toOtherCase(s) { var c, d;  s = s.split(''); for (var i = s.length; --i >= 0;) s[i] = (c = s[i]) == (d = c.toLowerCase) ? c.toUpperCase : d;  return s.join(''); }

/* ===================================================================== */

ts_textToKeyReversed = mw.config.get("wgContentLanguage") == 'fr';

/* Grapheme_Cluster_Break property of Unicode: (see UAX #29 3.1 Default Grapheme Cluster Boundary Specification) This includes Grapheme_Extend = true, i.e.: * General_Category = Nonspacing_Mark * General_Category = Enclosing_Mark * U+200C ZERO WIDTH NON-JOINER (ZWNJ) // TODO: can't be unsupported here * U+200D ZERO WIDTH JOINER (ZWJ) // TODO: can't be unsupported here * plus a few Spacing Marks needed for canonical equivalence. as well as the following prepended characters (that don't follow the logical order): * U+0E30 ( ะ ) THAI CHARACTER SARA A * U+0E32 ( า ) THAI CHARACTER SARA AA  * U+0E33 ( ำ ) THAI CHARACTER SARA AM  * U+0E45 ( ๅ ) THAI CHARACTER LAKKHANGYAO * U+0EB0 ( ະ ) LAO VOWEL SIGN A * U+0EB2 ( າ ) LAO VOWEL SIGN AA  * U+0EB3 ( ຳ ) LAO VOWEL SIGN AM  FIXME: need support for graphemes longer than 2 characters (notably with ZWJ/ZWNJ). FIXME: check Unicode 5/6 completeness. ts_textToKeyGraphemeExtends = '\u0300-\u036F' + '\u0483-\u0489' + '\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7' + '\u0610-\u061A\u064B-\u065E\u0670\u06D6-\u06DC\u06DE-\u06E4\u06E7\u06E8\u06EA-\u06ED' + '\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3' + '\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962\u0963' + '\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3' + '\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75' + '\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3' + '\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63' + '\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7' + '\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63' + '\u0C82\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3' + '\u0D02\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63' + '\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3' + '\u0E30-\u0E3A\u0E45\u0E47-\u0E4E' + /* including '\u0E30\u0E32\u0E33\u0E45' (see notice above) */ '\u0EB0-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD' + /* including '\u0EB0\u0EB2\u0EB3' (see notice above) */ '\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F90-\u0F97\u0F99-\u0FBC\u0FC6' + '\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F' + '\u135F' + '\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B6-\u17D3\u17DD' + '\u180B-\u180D\u18A9' + '\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8\u19C9' + '\u1A17-\u1A1B' + '\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAA' + '\u1C24-\u1C37' + '\u1DC0-\u1DE6\u1DFE\u1DFF' + '\u20D0-\u20F0' + '\u2DE0-\u2DFF' + '\u302A-\u302F\u3099\u309A' + '\uA66F-\uA672\uA67C\uA67D' + '\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C4' + '\uA926-\uA92D\uA947-\uA953' + '\uAA29-\uAA36\uAA43\uAA4C\uAA4D' + '\uFB1E' + '\uFE00-\uFE0F\uFE20-\uFE26';

/* The "Prepend" characters (not obeying the logical structure) should be handled. (see UAX #29 3.1 Default Grapheme Cluster Boundary Specification). ts_textToKeyGraphemePrepends = '\u0E40-\u0E44\u0EC0-\u0EC4';

/* FIXME: check the missing currency symbols or non-separating punctuations to ignore. TODO: add the missing format controls to ignore. ts_textToKeyDropped = new RegExp(  '[-\'\x7F-\xBF\xAD' +    '⁃·´`‚‘’~^†‡¶§©®$£€¢₩฿₨₧₪₣₡₢' +   ts_textToKeyGraphemeExtends +   ']+', 'g' );

/* FIXME: add the missing whitespaces and word-separating punctuation signs. ts_textToKeyBlanked = new RegExp(  '[\\s\\"\\[\\]\\\\ \xA0' +   '{}|+*/±×÷%<=>°„“”«».,;:¡!‼¿?…•_–—#&' +   'ƧƨƻƼƽ' + /* tone letters ignored for sort */   'ʔʡʕʢʖƾ' + /* glottal letters */   'ǀǁǂǃʘ' + /* click letters */   ']+', 'g' );

/* Note: For performance reason in browsers, the values in this map MUST be precompiled regexps, ready for use as parameters of "string.replace(regexp, subst)" in loops.

TODO: add more mappings for more languages... This mapping is nearly complete for modern languages written in Latin. Should add mappings for precomposed characters in Greek, Cyrillic... Hangul precomposed syllables should be decomposed algorithmically into jamos. ts_textToKeyRemapped = { '0': /[⁰₀]/g, '1': /[¹₁]/g, '2': /[²₂]/g, '3': /[³₃]/g, '4': /[⁴₄]/g, '5': /[⁵₅]/g, '6': /[⁶₆]/g, '7': /[⁷₇]/g, '8': /[⁸₈]/g, '9': /[⁹₉]/g, 'a': /[áàãâấầẫẩäǟȁảǎăắằẵẳȃāǡåǻẚạậặḁąɐɑɒª@]/g, 'A': /[ÁÀÃÂẤẦẪẨÄǞȀẢǍĂẮẰẴẲȂĀǠÅǺẠẬẶḀĄ]/g, 'ae': /[æǽǣ]/g, 'AE': /[ÆǼǢ]/g, 'b': /[ḃḅḇƀƃƅɓʙ]/g, 'B': /[ḂḄḆƁƂƄ]/g, 'c': /[ćĉčċçḉƈɕʗɔ]/g, 'C': /[ĆĈČĊÇḈƇƆ]/g, 'd': /[ďḋḍḑḏḓƌɗɖ]/g, 'D': /[ĎḊḌḐḎḒƉƊƋ]/g, 'dh': /[ðđ]/g, 'DH': /[ÐĐ]/g, 'dz': /[ʣǳǆʤʥ]/g, 'Dz': /[ǲǅ]/g, 'DZ': /[ǱǄ]/g, 'e': /[éèẽêếềễểëȅẻěĕȇḗḕḛḙḝėẹệęɘəɚɛʚɜɝɞ]/g, 'E': /[ÉÈẼÊẾỀỄỂËȄẺĚĔȆḖḔĖẸỆḘḚḜĘƏƎƐ]/g, 'f': /[ḟƒ]/g, 'F': /[ḞƑ]/g, 'fi': /[ﬁ]/g, 'fl': /[ﬂ]/g, 'g': /[ǵĝǧğḡġģǥɠɡɢʛɣɤ/g, 'G': /[ǴĜǦĞḠĠĢǤƓƔ]/g, 'gh': /[ƣ]/g, 'GH': /[Ƣ]/g, 'h': /[ĥḧḣḫḥẖḩħɥɦɧʜ]/g, 'H': /[ĤḦḢḪḤḨĦ]/g, 'hv': /[ƕ]/g, 'i': /[íìĩîïḯǐĭȋīȉỉıḭịįɨɩɪ]/g, 'I': /[ÍÌĨÎÏḮǏĬȊĪȈỈİḬỊĮƗƖ]/g, 'ij': /[ĳ]/g, 'IJ': /[Ĳ]/g, 'j': /[ĵǰɟʝ]/g, 'J': /[Ĵ]/g, 'k': /[ḱǩḳķḵƙĸʞ]/g, 'K': /[ḰǨḲĶḴƘ]/g, 'l': /[ĺḻḷḹḽļľŀłƚɫɬɭʟƛ]/g, 'L': /[ĽḺḶḸḼĻĿŁ]/g, 'lj': /[ǉ]/g, 'Lj': /[ǈ]/g, 'LJ': /[Ǉ]/g, 'lz': /[ɮ]/g, 'm': /[ḿṁṃɯɰɱ]/g, 'M': /[ḾṀṂƜ]/g, 'n': /[ńñňṅṇņṉṋŉɲŋɳƞɴ]/g, 'N': /[ŃÑŇṄṆŅṈṊƝŊ]/g, 'nj': /[ǌ]/g, 'Nj': /[ǋ]/g, 'NJ': /[Ǌ]/g, 'o': /[óòõṍṏôốồỗổǒöǒŏȏōṓṑőȍỏọộǫǭơớờỡởợøǿɵɷº]/g, 'O': /[ÓÒÕṌṎÔỐỒỖỔÖǑŎȎŌṒṐŐȌỎỌỘǪǬƠỚỜỠỞỢØǾƟ]/g, 'oe': /[œɶ]/g, 'OE': /[Œ]/g, 'p': /[ṕṗƥ]/g, 'P': /[ṔṖƤ]/g, 'ph': /[ɸ]/g, 'q': /[ʠ]/g, 'r': /[ŕřȓṙȑŗṛṝṟɼɽɾɹɺɻɿʅʀʁ]/g, 'R': /[ŔŘȒṘȐŖṚṜṞƦ]/g, 's': /[śṥŝšṧṡṣṩşʂſẛʃʄʆƪ]/g, 'S': /[ŚṤŜŠṦṠṢṨŞƩ]/g, 'ss': /[ß]/g, 't': /[ẗťṫṭṯṱţƫƭŧʈʇ]/g, 'T': /[ŤṪṬṮṰŢƮƬŦ]/g, 'tc': /[ʨ]/g, 'ts': /[ʦʧ]/g, 'th': /[þ]/g, 'TH': /[Þ]/g, 'u': /[úùũṹûǔüǘǜǖǚŭȗūṻůűȕủṵṳụṷųưứừữửựʉʊ]/g, 'U': /[ÚÙŨṸÛǓÜǗǛǕǙŬȖŪṺŮŰȔỦṴṲỤṶŲƯỨỪỮỬỰƱ]/g, 'v': /[ṽṿʋʌ]/g, 'V': /[ṼṾƲ]/g, 'w': /[ẃẁŵẅẇẘẉʍƿ]/g, 'W': /[ẂẀŴẄẆẈ]/g, 'x': /[ẍẋ]/g, 'X': /[ẌẊ]/g, 'y': /[ýỳŷỹÿẏỷẙỵƴʎʏ]/g, 'Y': /[ÝỲŶỸŸẎỶỴƳ]/g, 'z': /[źẑžżẓẕƶʐʑʒǯƺʓƹ]/g, 'Z': /[ŹẐŽŻẒẔƵƷǮƸ]/g, };

/* Utility for reversing the ternary key containing differences such as accents (mostly for French).

Note that with a full UCA implementation, this should be the secondary level, before case differences. However we currently don't convert letters to sortable collation weights, so this would not work as expected, and letters with all accent would sort after all letters without accents. So this implementation swaps levels 2 and 3, to make sure that the same base letters with the same case will be sorted together. Difference of accents are left in the final level, binary-ordered here according to their UTF-16 code units, along with other ignorable differences.

Ideally, this should correctly use the Grapheme Cluster Boundary Rules of UAX#29, i.e. all legacy grapheme clusters plus rule GB9b of extended grapheme clusters (but probably not rule GB9a here for string reversal, because spacing marks such as '^' or '~' should be treated isolately, just like apostrophes; note that  these spacing marks are ignored here in the primary and secondary subkeys).

BUG: cannot handle string reversal so that it will detect and not break all extended grapheme clusters that are longer than 2 characters (especially Hangul L+V+T* clusters), or longer than 1 character in in supplementary planes. If Hangul is handled later, these clusters should be preserved

As this is just used here for French, this bug should not cause any practical problem for sorting actual tables.

Don't use this function for meaningful display ! A true grapheme cluster breaker, needed for rendering and text layout, would have to handle all these cases (including rule GB9a or extended grapheme clusters)

A more precise detection of grapheme clusters for collation would require a huge and complex map of collation weights (at least the DUCET). ts_textToKeyReverseGraphemeExtends = new RegExp( '(.)([' + '\uDC00-\uDFFF' + /* include trailing surrogates */ ts_textToKeyGraphemeExtends + '])', 'g' ); ts_textToKeyReverseGraphemeExtends = new RegExp( '([' + ts_textToKeyGraphemePrepends + '])(.)', 'g' ); function ts_textToKeyReverse(s) { return s   .replace(ts_textToKeyReverseGraphemeExtends, '$2$1') .replace(ts_textToKeyReverseGraphemePrepends, '$2$1') .split().reverse.join(); }

function ts_textToKeyFilter(s) { var t = s.replace(ts_textToKeyDropped, ).replace(ts_textToKeyBlanked, ' ').replace(/^ /, ).replace(/ $/, ''); for (var k in ts_textToKeyRemapped) t = t.replace(ts_textToKeyRemapped[k], k); /* TODO: Hangul precomposed syllables should be decomposed algorithmically into jamos. */ return t; }

/* Main function. function ts_textToKey(s) { var f = ts_toOtherCase(ts_textToKeyFilter(s)), k = f.toLowerCase, x = '\t'; if (ts_textToKeyReversed) s = ts_textToKeyReverse(s); /* mainly for French */ return f != s ? k + x + f + x + s : k != f ? k + x + f : k; }

/* Override the existing "wikibits.js" for table sorts, when processing text cells into sort keys. ts_toLowerCase = ts_textToKey;

/* ===================================================================== */ /* */