// User:Opencooper/bindKana-dev.js

// This script takes kanji with ruby text over it and removes repeated parts.
// It's called automatically by showKanji.js if any furigana was added.

// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).

// License: CC0

// References:
//     https://www.w3.org/International/articles/ruby/markup.en
//     https://w3c.github.io/i18n-drafts/articles/ruby/styling.en.html
//     https://www.w3.org/TR/css-ruby-1/#break-between

// FIXME: Throws out whitespace, e.g. https://en.wikipedia.org/wiki/Days_of_Youth

// Fails on okurigana: https://en.wikipedia.org/wiki/I_Am_a_Cat
// Possible bug on: https://en.wikipedia.org/wiki/Douglas%E2%80%93Grumman_scandal
// Overcapturing: https://en.wikipedia.org/wiki/Kare_Kano
//                https://en.wikipedia.org/wiki/Nobunaga_no_Shinobi
//                https://en.wikipedia.org/wiki/Musashino-sen_no_Shimai
//                https://en.wikipedia.org/wiki/Hatfield%E2%80%93McCoy_feud
// Missing part of furigana: https://en.wikipedia.org/wiki/Tsuki_wa_Higashi_ni_Hi_wa_Nishi_ni
//                           https://en.wikipedia.org/wiki/Kawaii
// Katakana can't match extraneous hiragana: https://en.wikipedia.org/wiki/Gompertz_function
// Broken because of spaces: https://en.wikipedia.org/wiki/Go_for_It,_Baby_(Kioku_no_Sanmyaku)
// Midpoint replacing special chars: https://en.wikipedia.org/wiki/Doki_Doki_Wildcat_Engine

/* Test pages: https://en.wikipedia.org/wiki/Lear_on_the_Shore - mixed hiragana/katakana https://en.wikipedia.org/wiki/One_Cut_of_the_Dead - failed capture blocking later https://en.wikipedia.org/wiki/Dog%C3%97Police - partial block https://en.wikipedia.org/wiki/Otome_wa_Boku_ni_Koishiteru - partial capture https://en.wikipedia.org/wiki/Sacrificial_Princess_and_the_King_of_Beasts - fails https://en.wikipedia.org/wiki/Clamp_no_Kiseki - Latin isn't consumed https://en.wikipedia.org/wiki/Cape_St._George - interpunct is a space https://en.m.wikipedia.org/wiki/Chūshingura:_Hana_no_Maki,_Yuki_no_Maki - whitespace ignored on mobile https://en.wikipedia.org/wiki/Hageshisa_to,_Kono_Mune_no_Naka_de_Karamitsuita_Shakunetsu_no_Yami https://en.wikipedia.org/wiki/Ai_to_Makoto https://en.wikipedia.org/wiki/Aru_yo_no_Tonosama https://en.wikipedia.org/wiki/Cho_Kamen_Rider_Den-O_%26_Decade_Neo_Generations:_The_Onigashima_Warship https://en.wikipedia.org/wiki/Doubutsu_Sentai_Zyuohger_vs._Ninninger_the_Movie:_Super_Sentai%27s_Message_from_the_Future https://en.wikipedia.org/wiki/Light_Novel_no_Tanoshii_Kakikata https://en.wikipedia.org/wiki/YU-NO:_A_Girl_Who_Chants_Love_at_the_Bound_of_this_World - overcapture https://en.wikipedia.org/wiki/Suppose_a_Kid_From_the_Last_Dungeon_Boonies_Moved_to_a_Starter_Town - overcapture https://en.wikipedia.org/wiki/Bloom_in_the_Moonlight https://en.wikipedia.org/wiki/A_Scene_at_the_Sea - period isn't stripped https://en.wikipedia.org/wiki/Kamen_Rider_Reiwa_The_First_Generation https://en.wikipedia.org/wiki/Mr._Thank_You https://en.wikipedia.org/wiki/Sakura_no_Uta https://en.wikipedia.org/wiki/I_Married_My_Best_Friend_To_Shut_My_Parents_Up https://en.wikipedia.org/wiki/Papillon_(manga) https://en.wikipedia.org/wiki/Ten_(manga) - deletes space in base+reading https://en.wikipedia.org/wiki/La_Dolce_Vita https://en.wikipedia.org/wiki/O-Parts_Hunter -- needs some kind of check https://en.wikipedia.org/wiki/Two_or_Three_Things_I_Know_About_Her 
https://en.wikipedia.org/wiki/The_%22Hentai%22_Prince_and_the_Stony_Cat.
*/

/**
 * Entry point: reads the base text and its furigana from #kanjiInfo, tries to
 * split them into aligned base/reading pairs, and displays the result if at
 * least one split happened.
 *
 * Does nothing when #kanjiInfo or its <rt> is hidden, when there is no <ruby>
 * element, or when either the base text or the reading is empty.
 * Errors thrown by bindKana() are not caught here.
 */
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none"
        || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }

    // Guard against a missing <ruby> (or one without children) instead of
    // failing with a TypeError on the property chain below.
    var ruby = $("#kanjiInfo ruby")[0];
    if (!ruby || ruby.childNodes.length === 0) {
        return;
    }

    // The base text is read from the first child node of the ruby element
    var kanji = ruby.childNodes[0].nodeValue;
    var kana = $("#kanjiInfo rt").text();

    if (!kanji || !kana) {
        return;
    }

    // Both arrays are mutated in place by bindKana()
    var bases = [kanji];
    var readings = [kana];
    logTable(readings, bases);

    bindKana(bases, readings);

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}

/**
 * Repeatedly calls tryBind() until it stops finding new splits or the
 * iteration cap is reached, then validates the result.
 *
 * @param {string[]} bases - Base strings; mutated in place.
 * @param {string[]} readings - Readings parallel to `bases`; mutated in place.
 * @throws {Error} If the two arrays end up with different lengths, if any
 *     non-empty base/reading pair has a length ratio >= 6 or <= 1/6, or if
 *     the iteration cap was hit without a single split.
 */
function bindKana(bases, readings) {
    var maxIterations = 25;
    var iterations = 0;
    var foundBindings = true;

    while (foundBindings && iterations !== maxIterations) {
        iterations++;
        foundBindings = tryBind(bases, readings);

        if (foundBindings) {
            logTable(readings, bases);
        }
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana-dev.js: Bases and readings arrays don't have same lengths.");
    }

    // Check kanji:kana ratio; a very lopsided pair suggests a bad split
    for (var i = 0; i < bases.length; i++) {
        var kanjiLength = bases[i].length;
        var kanaLength = readings[i].length;
        if (kanjiLength === 0 || kanaLength === 0) {
            continue;
        }

        var ratio = kanaLength / kanjiLength;
        if (ratio >= 6 || ratio <= 1/6) {
            throw new Error("bindKana-dev.js: kanji:kana ratio greater than 6 for `"
                            + bases[i] + "` and `" + readings[i] + "`.");
        }
    }

    // The loop stops with iterations === maxIterations when the cap is hit.
    // (Comparing against maxIterations - 1 fired one pass early and never
    // when the cap was actually reached.)
    if (iterations === maxIterations) {
        console.warn("bindKana-dev.js: Encountered maximum iterations.");
        if (bases.length === 1) {
            throw new Error("bindKana-dev.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }
}

/**
 * Runs one pass over the bases that exist when the pass starts, asking
 * searchBase() to split each one on katakana, alphanumeric, hiragana and
 * misc runs (tried in that order).
 *
 * Entries whose reading is already "" are skipped. Once any split has
 * happened in this pass, each remaining index only gets the first regex
 * tried before its inner loop stops; bindKana() calls this function again
 * to pick up the rest.
 *
 * @param {string[]} bases - Base strings; mutated in place.
 * @param {string[]} readings - Readings parallel to `bases`; mutated in place.
 * @returns {boolean} true if the arrays grew during this pass.
 * @throws {Error} If a split left a kanji base with an empty reading, or an
 *     empty base with a non-empty reading.
 */
function tryBind(bases, readings) {
    var regexes = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe
    ];
    // Length before this pass; used both as loop bound and as change detector
    var baseLength = bases.length;

    for (var i = 0; i < baseLength; i++) {
        if (readings[i] === "") {
            continue;
        }

        for (var regex of regexes) {
            searchBase(bases, readings, i, regex);

            if (bases.length !== baseLength) {
                break;
            }
        }
    }

    if (bases.length === baseLength) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    for (var j = 0; j < bases.length; j++) {
        if (kanaRegexes.kanjiRe.test(bases[j]) && readings[j] === "") {
            throw new Error("bindKana-dev.js: Kanji base with no reading: `"
                            + bases[j] + "` at index " + j);
        } else if (bases[j] === "" && readings[j]) {
            throw new Error("bindKana-dev.js: Blank base with reading: `"
                            + readings[j] + "` at index " + j);
        }
    }

    return true;
}

/**
 * Finds every run in bases[index] that matches `re` and tries to split the
 * base/reading pair on each run in turn, stopping at the first run that
 * actually produces a split.
 *
 * @param {string[]} bases - Base strings; mutated in place on a split.
 * @param {string[]} readings - Readings parallel to `bases`; mutated in place.
 * @param {number} index - Which base/reading pair to examine.
 * @param {RegExp} re - One of the kanaRegexes patterns. The callers in this
 *     file pass global regexes, so match() returns all matching runs.
 * @throws {Error} If a split changed the array length by anything other
 *     than +2 (one element replaced by left | match | right).
 */
function searchBase(bases, readings, index, re) {
    var baseLength = bases.length;
    var substrings = bases[index].match(re);
    if (!substrings) {
        return;
    }

    // Handle case where the furigana is just a hiragana version of the katakana
    // Only works if whole thing is split along the reading.
    // Nothing here depends on the individual match, so do it once up front.
    if (re === kanaRegexes.katakanaRe && /^[ァ-ヴ]+$/.test(bases[index])) {
        var readingAsKatakana = readings[index].hiraganaToKatakana();
        if (bases[index] === readingAsKatakana) {
            readings[index] = readingAsKatakana;
        }
    }

    for (var j = 0; j < substrings.length; j++) {
        // Misc stuff like whitespace should be split searching forward
        if (re !== kanaRegexes.miscRe) {
            splitFuriganaReverse(bases, readings, index, substrings[j]);
        } else {
            splitFuriganaForward(bases, readings, index, substrings[j]);
        }

        // Check if we split on the substring
        if (bases.length !== baseLength) {
            // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
            if (bases.length !== baseLength + 2) {
                throw new Error("bindKana-dev.js: Splitting added more than two new parts.");
            }

            return;
        }
    }
}

/**
 * Returns a copy of the string with every hiragana character in
 * U+3041..U+3096 shifted by 0x60 code points, which maps it onto the
 * corresponding katakana character. All other characters are left as-is.
 *
 * @returns {string} The converted string.
 */
String.prototype.hiraganaToKatakana = function () {
    return this.replace(/[\u3041-\u3096]/g, function (s) {
        return String.fromCharCode(s.charCodeAt(0) + 0x0060);
    });
};

// We search for everything reversed because particles are suffixes
/**
 * Splits bases[index] and readings[index] around the LAST occurrence of
 * `substring`, provided it occurs in both. On success the single element at
 * `index` is replaced by three: [left, substring, right] in `bases` and
 * [left, "", right] in `readings`. Otherwise both arrays are left untouched.
 *
 * Searching from the end with lastIndexOf() gives the same split point as
 * matching the reversed strings from the front, without building a regex,
 * and keeps any text across line breaks that `.` would not match.
 *
 * @param {string[]} bases - Base strings; mutated in place on a split.
 * @param {string[]} readings - Readings parallel to `bases`; mutated in place.
 * @param {number} index - Which base/reading pair to split.
 * @param {string} substring - The run of text to split on.
 */
function splitFuriganaReverse(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];

    // First make sure substring is in both the base and its reading
    var baseAt = base.lastIndexOf(substring);
    var readingAt = reading.lastIndexOf(substring);
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    console.log("bindKana-dev.js: string found in both `" + readings[index]
                + "` and `" + bases[index] + "`: `" + substring
                + "` at index " + index);

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
                 base.slice(0, baseAt),
                 substring,
                 base.slice(baseAt + substring.length));
    readings.splice(index, 1,
                    reading.slice(0, readingAt),
                    "",
                    reading.slice(readingAt + substring.length));
}

/**
 * Returns `str` with its characters in reverse order.
 *
 * Iterates by code point (Array.from) rather than by UTF-16 code unit, so
 * characters outside the BMP stay intact instead of having their surrogate
 * halves swapped.
 *
 * @param {string} str - The string to reverse.
 * @returns {string} The reversed string.
 */
function reverseString(str) {
    return Array.from(str).reverse().join("");
}

// TODO: Generalize this with reverse somehow
/**
 * Splits bases[index] and readings[index] around the FIRST occurrence of
 * `substring`, provided it occurs in both. On success the single element at
 * `index` is replaced by three: [left, substring, right] in `bases` and
 * [left, "", right] in `readings`. Otherwise both arrays are left untouched.
 *
 * Uses indexOf()/slice() rather than a regex so that text across line
 * breaks (which `.` would not match) is kept.
 *
 * @param {string[]} bases - Base strings; mutated in place on a split.
 * @param {string[]} readings - Readings parallel to `bases`; mutated in place.
 * @param {number} index - Which base/reading pair to split.
 * @param {string} substring - The run of text to split on.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];

    // The substring has to be in both the base and its reading
    var baseAt = base.indexOf(substring);
    var readingAt = reading.indexOf(substring);
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    console.log("bindKana-dev.js: string found in both "+ bases[index]
                + " and " + readings[index] + ": `" + substring
                + "` at index " + index);

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
                 base.slice(0, baseAt),
                 substring,
                 base.slice(baseAt + substring.length));
    readings.splice(index, 1,
                    reading.slice(0, readingAt),
                    "",
                    reading.slice(readingAt + substring.length));
}

/**
 * Hides the existing ruby inside #kanjiInfo and appends a new one built
 * from the split bases and readings, then tidies it with prettifyEnds().
 * Also wires up a dev-only meta-click toggle between the two versions.
 *
 * @param {string[]} bases - Split base strings.
 * @param {string[]} readings - Readings parallel to `bases`.
 */
function displayBoundKana(bases, readings) {
    // Hide (rather than remove) the original so the click handler below can
    // switch back to it
    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");

    // Build new ruby element from the two bases and readings arrays.
    // NOTE(review): the markup in the string literals appears to have been
    // lost from this copy of the script; the element names and the "bound"
    // class are taken from the selectors used elsewhere in this file
    // ("#kanjiInfo rb", ".bound", "#kanjiInfo .bound rt") -- confirm.
    // Text is set with .text() so page content is never parsed as HTML.
    var $bound = $("<ruby>").addClass("bound");
    for (var i = 0; i < bases.length; i++) {
        $bound.append($("<rb>").text(bases[i]));
        $bound.append($("<rt>").text(readings[i]));
    }

    $("#kanjiInfo").append($bound, " ");

    prettifyEnds();

    // Extra dev-only stuff: meta-click toggles between the bound and
    // unbound versions
    $("#kanjiInfo").click(function (e) {
        if (!e.metaKey) {
            return;
        }

        var unboundVisible = $(".unbound").is(":visible");
        if (!unboundVisible) {
            $(".bound").hide();
            $(".unbound").show();
        } else {
            $(".unbound").hide();
            $(".bound").show();
        }
    });

    // Purely cosmetic, so don't let a missing or inaccessible first
    // stylesheet abort the script
    try {
        const styleSheet = window.document.styleSheets[0];
        styleSheet.insertRule("#kanjiInfo .bound rt { user-select: none; }");
    } catch (err) {
        console.warn("bindKana-dev.js: Couldn't insert stylesheet rule: " + err);
    }
}

/**
 * Post-processes every <rb> inside #kanjiInfo:
 *  - an empty <rb> is removed together with the element that follows it;
 *  - a <rb> that is exactly one space is left alone;
 *  - a leading and/or trailing character matching kanaRegexes.miscRe is
 *    moved out of the <rb>, to just before it or to just after the element
 *    that follows it, respectively.
 */
function prettifyEnds() {
    // Exclude misc characters from base; for nicer formatting
    $("#kanjiInfo rb").each(function () {
        var $rb = $(this);
        var baseText = $rb.text();

        // Rm empty ruby base and readings
        if (baseText === "") {
            $rb.next().remove();
            $rb.remove();
            return;
        } else if (baseText === " ") {
            return;
        }

        // NOTE(review): the moved characters are inserted as plain text
        // nodes. This copy of the script shows them wrapped in empty
        // strings, so the original may have wrapped them in markup -- confirm.
        var start = baseText[0];
        kanaRegexes.miscRe.lastIndex = 0; // reset regex (it has the g flag)
        if (kanaRegexes.miscRe.test(start)) {
            var startRemainder = baseText.slice(1);
            $rb.text(startRemainder);
            $rb.before(document.createTextNode(start));
        }

        // Re-read the text; the start may just have been stripped
        baseText = $rb.text();
        kanaRegexes.miscRe.lastIndex = 0;
        var end = baseText.slice(-1);
        if (kanaRegexes.miscRe.test(end)) {
            var endRemainder = baseText.slice(0, baseText.length - 1);
            $rb.text(endRemainder);
            // Goes after the following element so base and reading stay adjacent
            $rb.next().after(document.createTextNode(end));
        }
    });
}

// The table logging is done asynchronously, so we make a copy
/**
 * Logs the current readings and bases with console.table().
 * Copies of both arrays are passed so that later in-place splicing doesn't
 * change what the console shows.
 *
 * @param {string[]} readings - Current readings.
 * @param {string[]} bases - Current bases.
 */
function logTable(readings, bases) {
    console.table({
        readings: readings.slice(0),
        bases: bases.slice(0)
    });
}

// Character-class patterns used to find splittable runs.
// All but kanjiRe carry the g flag, so reset lastIndex before using
// .test()/.exec() on them (String.prototype.match does this itself).
var kanaRegexes = {
    // Single-character test for a kanji
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/,
    // kanjiRe: /[一-龯]+/g,
    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,
    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,
    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,
    // Single punctuation/whitespace characters
    miscRe: /[- !.?・、「」★×〜&/]/g
};

// Run on load. The bare identifier was a no-op expression; it has to be called.
getKanjiInfo();