// User:Opencooper/bindKana.js

// This script takes kanji with ruby text over it and removes repeated parts.
// It's called automatically by showKanji.js if any furigana was added.

// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and the reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).

// License: CC0

/**
 * Entry point. Reads the kanji (first child node of the first
 * `#kanjiInfo ruby`) and its furigana (`#kanjiInfo rt`), tries to bind
 * them, and displays the bound result if at least one split was made.
 *
 * Errors thrown by bindKana() are not caught here, so they abort the run.
 */
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none"
        || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }

    // Nothing to do if the page has no ruby element (or it is empty)
    const rubyElement = $("#kanjiInfo ruby")[0];
    if (!rubyElement || !rubyElement.childNodes[0]) {
        return;
    }

    const kanji = rubyElement.childNodes[0].nodeValue;
    const kana = $("#kanjiInfo rt").text();
    if (!kanji || !kana) {
        return;
    }

    // Both arrays are split in place by bindKana()
    const bases = [kanji];
    const readings = [kana];
    bindKana(bases, readings);

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}

/**
 * Repeatedly splits `bases` and `readings` (in place) until tryBind() finds
 * nothing more to split or the iteration cap is reached, then validates
 * the result.
 *
 * @param {string[]} bases - Base text segments; modified in place.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @throws {Error} If the arrays end up with different lengths, if any
 *     non-empty base/reading pair has a length ratio of 6 or more in
 *     either direction, or if the cap is hit without a single split.
 */
function bindKana(bases, readings) {
    const maxIterations = 25;
    let iterations = 0;
    let foundBindings = true;
    while (foundBindings && iterations < maxIterations) {
        iterations++;
        foundBindings = tryBind(bases, readings);
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }

    // Check kanji:kana ratio
    for (let i = 0; i < bases.length; i++) {
        const kanjiLength = bases[i].length;
        const kanaLength = readings[i].length;
        if (kanjiLength === 0 || kanaLength === 0) {
            continue;
        }

        const ratio = kanaLength / kanjiLength;
        if (ratio >= 6 || ratio <= 1 / 6) {
            throw new Error("bindKana.js: kanji:kana ratio greater than 6 for `"
                            + bases[i] + "` and `" + readings[i] + "`.");
        }
    }

    // The loop was cut off by the cap only if the last pass still found a
    // binding. Comparing against `maxIterations - 1` warned after a natural
    // stop at 24 passes and never at the real cap.
    if (iterations >= maxIterations && foundBindings) {
        console.warn("bindKana.js: Encountered maximum iterations.");
        // Defensive: a pass that reports a binding should have split something
        if (bases.length === 1) {
            throw new Error("bindKana.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }
}

/**
 * Makes one pass over the bases, trying each regex class (katakana,
 * alphanumerics, hiragana, misc) via searchBase() to split a base and its
 * reading in place.
 *
 * @param {string[]} bases - Base text segments; modified in place.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @returns {boolean} true if at least one split happened during this pass.
 * @throws {Error} If a split left a kanji base without a reading, or an
 *     empty base with a reading.
 */
function tryBind(bases, readings) {
    const regexes = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe
    ];
    // Only the segments that existed at the start of the pass are visited
    const baseLength = bases.length;

    for (let i = 0; i < baseLength; i++) {
        // Segments with an empty reading need no further splitting
        if (readings[i] === "") {
            continue;
        }

        for (const regex of regexes) {
            searchBase(bases, readings, i, regex);

            // Stop trying further regexes once anything has been split
            if (bases.length !== baseLength) {
                break;
            }
        }
    }

    if (bases.length === baseLength) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    for (let j = 0; j < bases.length; j++) {
        if (kanaRegexes.kanjiRe.test(bases[j]) && readings[j] === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                            + bases[j] + "` at index " + j);
        } else if (bases[j] === "" && readings[j]) {
            throw new Error("bindKana.js: Blank base with reading: `"
                            + readings[j] + "` at index " + j);
        }
    }

    return true;
}

/**
 * Looks for runs matching `re` in `bases[index]` and tries to split the
 * base and its reading (in place) on the first run that works.
 *
 * @param {string[]} bases - Base text segments; modified in place.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Index of the base/reading pair to examine.
 * @param {RegExp} re - One of the `kanaRegexes` patterns.
 * @throws {Error} If a split did not add exactly two new segments.
 */
function searchBase(bases, readings, index, re) {
    const baseLength = bases.length;
    const substrings = bases[index].match(re);
    if (!substrings) {
        return;
    }

    for (let j = 0; j < substrings.length; j++) {
        // Handle case where the furigana is just a hiragana version of the katakana.
        // Only works if whole thing is split along the reading.
        // The prolonged sound mark is allowed here because katakanaRe matches
        // it too; without it, bases such as "コーヒー" never took this path.
        if (re === kanaRegexes.katakanaRe
            && /^[ァ-ヴー]+$/.test(bases[index])
            && bases[index] === readings[index].hiraganaToKatakana()) {
            readings[index] = readings[index].hiraganaToKatakana();
        }

        // Misc stuff like whitespace should be split searching forward
        if (re !== kanaRegexes.miscRe) {
            splitFuriganaReverse(bases, readings, index, substrings[j]);
        } else {
            splitFuriganaForward(bases, readings, index, substrings[j]);
        }

        // Check if we split on the substring
        if (bases.length !== baseLength) {
            // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
            if (bases.length !== baseLength + 2) {
                throw new Error("bindKana.js: Splitting added more than two new parts.");
            }

            return;
        }
    }
}

/**
 * Returns a copy of the string with every hiragana character in
 * U+3041..U+3096 shifted up by 0x60 to the corresponding katakana code
 * point. All other characters are left unchanged.
 *
 * @returns {string}
 */
String.prototype.hiraganaToKatakana = function () {
    return this.replace(/[\u3041-\u3096]/g, function (s) {
        return String.fromCharCode(s.charCodeAt(0) + 0x0060);
    });
};

/**
 * Splits `bases[index]` and `readings[index]` (in place) around the LAST
 * occurrence of `substring` in each, if it occurs in both.
 *
 * We search from the end because particles are suffixes. On success the
 * base becomes [left, substring, right] and the reading becomes
 * [left, "", right]; otherwise both arrays are left untouched.
 *
 * @param {string[]} bases - Base text segments; modified in place.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Index of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaReverse(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // First make sure substring is in both the base and its reading.
    // lastIndexOf finds the same occurrence as matching the reversed
    // substring against the reversed string, without building a RegExp.
    const baseAt = base.lastIndexOf(substring);
    const readingAt = reading.lastIndexOf(substring);
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the three parts
    bases.splice(index, 1,
        base.slice(0, baseAt),
        substring,
        base.slice(baseAt + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingAt),
        "",
        reading.slice(readingAt + substring.length));
}

/**
 * Returns `str` with its characters in reverse order.
 *
 * Splits by code point rather than by UTF-16 code unit, so characters
 * outside the Basic Multilingual Plane keep their surrogate pairs intact.
 *
 * @param {string} str
 * @returns {string}
 */
function reverseString(str) {
    return Array.from(str).reverse().join("");
}

// TODO: Generalize this with reverse somehow
/**
 * Splits `bases[index]` and `readings[index]` (in place) around the FIRST
 * occurrence of `substring` in each, if it occurs in both.
 *
 * On success the base becomes [left, substring, right] and the reading
 * becomes [left, "", right]; otherwise both arrays are left untouched.
 *
 * @param {string[]} bases - Base text segments; modified in place.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Index of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // The substring has to be present in both the base and its reading
    const baseAt = base.indexOf(substring);
    const readingAt = reading.indexOf(substring);
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the three parts
    bases.splice(index, 1,
        base.slice(0, baseAt),
        substring,
        base.slice(baseAt + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingAt),
        "",
        reading.slice(readingAt + substring.length));
}

/**
 * Hides the existing ruby inside `#kanjiInfo` and appends a new ruby
 * element built from the bound bases and readings, then tidies its ends.
 *
 * NOTE(review): the markup string literals of this function were lost when
 * the source was extracted. The <ruby>/<rb>/<rt> structure is reconstructed
 * from the selectors prettifyEnds() uses ("#kanjiInfo rb" and the element
 * following each rb) -- confirm against the live script, including any
 * class attributes the original may have set.
 *
 * @param {string[]} bases - Bound base segments.
 * @param {string[]} readings - Reading for each base ("" for none).
 */
function displayBoundKana(bases, readings) {
    // Mark and hide the original ruby before the new one is added, so the
    // new element is not caught by either selector.
    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");

    // Build new ruby element from the two bases and readings arrays.
    // Text is set with .text() so page content is never parsed as HTML.
    const $ruby = $("<ruby>");
    for (let i = 0; i < bases.length; i++) {
        $ruby.append($("<rb>").text(bases[i]), $("<rt>").text(readings[i]));
    }

    // The surviving source ends the markup with a single space; keep it
    // as a text node after the ruby.
    $("#kanjiInfo").append($ruby, document.createTextNode(" "));
    prettifyEnds();
}

/**
 * Cleans up every `<rb>` inside `#kanjiInfo`: removes empty base/reading
 * pairs and moves a leading or trailing misc character (per
 * `kanaRegexes.miscRe`) out of the base, for nicer formatting.
 *
 * NOTE(review): the markup literals passed to .before()/.after() were lost
 * when the source was extracted. They are reconstructed as an <rb> holding
 * the character followed by an empty <rt>, which keeps bases and readings
 * paired -- confirm against the live script.
 */
function prettifyEnds() {
    // Exclude misc characters from base; for nicer formatting
    $("#kanjiInfo rb").each(function () {
        const $rb = $(this);
        let baseText = $rb.text();

        // Rm empty ruby base and readings
        if (baseText === "") {
            $rb.next().remove();
            $rb.remove();
            return;
        } else if (baseText === " ") {
            return;
        }

        const start = baseText[0];
        // miscRe is global, so .test() is stateful; reset before each use
        kanaRegexes.miscRe.lastIndex = 0;
        if (kanaRegexes.miscRe.test(start)) {
            $rb.text(baseText.slice(1));
            $rb.before($("<rb>").text(start), $("<rt>"));
        }

        baseText = $rb.text();
        kanaRegexes.miscRe.lastIndex = 0;
        const end = baseText.slice(-1);
        if (kanaRegexes.miscRe.test(end)) {
            $rb.text(baseText.slice(0, -1));
            // Insert after the element following this base, so the pair stays together
            $rb.next().after($("<rb>").text(end), $("<rt>"));
        }
    });
}

// Character-class patterns used to find splittable runs.
// kanjiRe is not global, so .test() on it is stateless. The others are
// global: String.prototype.match returns every run, but .test() advances
// lastIndex and callers must reset it.
var kanaRegexes = {
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/,
    // Alternative that was left disabled: kanjiRe: /[一-龯]+/g,
    hiraganaRe: /[ぁ-ゔ]+/g,
    katakanaRe: /[ァ-ヴー]+/g,
    alphanumRe: /[A-Za-z0-9]+/g,
    miscRe: /[- !.?・、「」×〜&/]/g
};

// Run as soon as the script is loaded
getKanjiInfo();