// User:Opencooper/showKanji-dev.js

// This script shows, if found, the kanji and kana for an article // It then calls another script, bindKana.js, to clean up the display of ruby // For configuration, please see the documentation

// TODO: Reject if any base/reading has too low/high of a ratio.
// TODO: Reject if unbalanced parenthesis count
// TODO: <rb> is not actually in the WHATWG standard...

// License: CC0

/* Sample pages: https://en.wikipedia.org/wiki/Tamio_Kawachi - kana on wikidata https://en.wikipedia.org/wiki/A_Fantastic_Tale_of_Naruto - kanji from wikidata only https://en.wikipedia.org/wiki/What_a_Wonderful_World! - kana from wikidata only https://en.wikipedia.org/wiki/Asako_I_%26_II - from redirect https://en.wikipedia.org/wiki/Bokura_ga_Ita_(film) - interwiki to subsection https://ja.wikipedia.org/wiki/%E7%B4%AF - kana part of bolded title https://en.wikipedia.org/wiki/Bokutachi_no_Koukan_Nikki - kana not in first sentence https://en.wikipedia.org/wiki/Domestic_Girlfriend - first full sentence not lead https://ja.wikipedia.org/wiki/SCP%E8%B2%A1%E5%9B%A3 - bolded term w/ kana past first sentence https://en.wikipedia.org/wiki/Nuclear_fusion - different term w/ kana in lead https://en.wikipedia.org/wiki/Oedipus_Rex - other stuff in kana https://en.wikipedia.org/wiki/20th_Century_Boys - overcapturing because title is subset https://en.wikipedia.org/wiki/Seiza - kana not at start of parenthesis https://en.wikipedia.org/wiki/Indentation_style - other kana in disambiguation https://en.wikipedia.org/wiki/Haven%27t_You_Heard%3F_I%27m_Sakamoto - Halfwidth-fullwidth difference https://en.wikipedia.org/wiki/Kanji_Furutachi - kanji only https://en.wikipedia.org/wiki/Anata_e - hiragana only https://en.wikipedia.org/wiki/Anatahan_(film) - katakana only https://en.wikipedia.org/wiki/A.LI.CE - latin only https://en.wikipedia.org/wiki/0.5_mm - numeric https://en.wikipedia.org/wiki/Truth_Coming_Out_of_Her_Well - angle brackets https://en.wikipedia.org/wiki/South_of_the_Border,_West_of_the_Sun - kana contains comma https://en.wikipedia.org/wiki/Leap_year - multiple kana separated by comma https://en.wikipedia.org/wiki/Do_You_Love_Your_Mom_and_Her_Two-Hit_Multi-Target_Attacks%3F - question mark https://en.wikipedia.org/wiki/Comic_Magazine - exclamation point https://en.wikipedia.org/wiki/Tsurune - dash https://en.wikipedia.org/wiki/Flare_(film) - wave dash 
https://en.wikipedia.org/wiki/Dog%C3%97Police - multiplication sign https://en.wikipedia.org/wiki/Foreboding_(film) - spaces https://en.wikipedia.org/wiki/Age_12 - period in title https://en.wikipedia.org/wiki/Suzukake_Nanchara - very long kanji https://en.wikipedia.org/wiki/After_the_Rain_(manga) - kanji + hiragana https://en.wikipedia.org/wiki/Afro_Tanaka - kanji + katakana https://en.wikipedia.org/wiki/Battle_Girl:_The_Living_Dead_in_Tokyo_Bay - katakana + latin https://en.wikipedia.org/wiki/Calling_You_(short_story_collection) - kanji + hiragana + latin https://en.wikipedia.org/wiki/Ashita_no_Joe - hiragana + katakana https://en.wikipedia.org/wiki/Arcadia_of_My_Youth - kanji + hiragana + katakana https://en.wikipedia.org/wiki/Haou_Airen - special character https://ja.wikipedia.org/wiki/%E6%98%A0%E7%94%BB_%E8%81%B2%E3%81%AE%E5%BD%A2 - reference in between https://en.wikipedia.org/wiki/Ninjō - No interlanguage, but wiktionary https://en.wikipedia.org/wiki/Seiza - Interlanguage failed, but wiktionary https://en.wikipedia.org/wiki/Epsomite - No interlanguage, but wiktionary "see" Table https://en.wikipedia.org/wiki/Bakayaro!_I%27m_Plenty_Mad - only part of parenthesis extracted

https://en.wikipedia.org/wiki/ORCID https://en.wikipedia.org/wiki/Survive_Style_5%2B - fails due to + https://en.wikipedia.org/wiki/Ko-Shint%C5%8D https://ja.wikipedia.org/wiki/Terminate_and_Stay_Resident https://en.wikipedia.org/wiki/Ikk%C5%8D-sh%C5%AB https://en.wikipedia.org/wiki/Kakegoe - doesn't find jawiki interlanguage https://en.wikipedia.org/wiki/Love_Live!_The_School_Idol_Movie - interpunct in reading https://en.wikipedia.org/wiki/Lupin_the_Third:_The_Woman_Called_Fujiko_Mine - hyphen in kanji https://en.wikipedia.org/wiki/Sunscreen https://en.wikipedia.org/wiki/Flag_of_China https://en.wikipedia.org/wiki/W3m https://en.wikipedia.org/wiki/Magnum_Collection_1999_%22Dear%22 https://en.wikipedia.org/wiki/EC_Comics https://en.wikipedia.org/wiki/CJK_characters https://en.wikipedia.org/wiki/My_Girlfriend_is_Shobitch https://en.wikipedia.org/wiki/Immaculate_Conception_Cathedral,_Nagasaki - partial match https://en.wikipedia.org/wiki/USA-224 - または https://en.wikipedia.org/wiki/Milk - bad match https://en.wikipedia.org/wiki/Not_invented_here
*/

// Entry point, run on DOM ready. Bails out unless we are viewing the current
// revision of a non-main-page article on a non-Japanese wiki that has a
// Wikidata item, then reserves the #kanjiInfo placeholder in the page heading
// and requests the item's Japanese label from Wikidata (handled by
// parseJaLabel).
function setup() {
    // If we're not reading an article, do nothing
    var isReadingArticle = mw.config.get( 'wgAction' ) === 'view'
        && mw.config.get( 'wgIsArticle' )
        && !location.search.split('oldid=')[1]
        && !mw.config.get("wgIsMainPage")
        && mw.config.get("wgContentLanguage") !== "ja";
    if (!isReadingArticle) {
        return;
    }

    // Assuming that if there's no wikidata, there're no 1:1 interlanguage links,
    // and we don't want cases where a page links to a subsection of a jawiki
    // article
    if (wikidataId == null) {
        return;
    }

    // Placeholder so other elements don't push it down later
    var header;
    if ($('#firstHeading').length) { // Vector
        header = $('#firstHeading');
    } else if ($('.page-heading').length) { // Minerva
        header = $('.page-heading');
    } else {
        console.error("showKanji-dev.js: Couldn't find a page heading. This skin ("
                      + mw.config.get( 'skin' ) + ") might not be supported.");
        return;
    }
    // Every other function in this script targets "#kanjiInfo", so the
    // placeholder has to carry that id.
    // NOTE(review): the placeholder's original markup was lost (only " "
    // survived); the element type is reconstructed - confirm against the
    // script's documented CSS hooks.
    if (!$("#kanjiInfo").length) {
        header.append($("<div>").attr("id", "kanjiInfo").html("&nbsp;"));
    }

    // Get the Japanese label from wikidata
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: {
            action: "wbgetentities",
            ids: wikidataId,
            props: "labels",
            languages: "ja",
            format: "json",
            origin: "*"
        },
        success: parseJaLabel
    });
}

// Success handler for the wbgetentities request made in setup().
// Extracts the Japanese label of the current Wikidata item, normalises
// fullwidth characters, builds the matching regexes, displays the label and,
// unless the label is kana-only, goes on to look for a reading.
//
// response: parsed JSON body of the wbgetentities call.
function parseJaLabel(response) {
    // Guard every level: an error response has no `entities`, and an item
    // without a Japanese label has no `labels.ja`.
    var entities = response && response.entities;
    var wikidataInfo = entities && entities[wikidataId];
    var labels = wikidataInfo && wikidataInfo.labels;
    var jaEntry = labels && labels.ja;
    var jaLabel = jaEntry && jaEntry.value;

    if (!jaLabel) {
        return;
    }

    jaLabel = jaLabel.toHalfWidth();
    console.log("showKanji-dev.js: kanji: `" + jaLabel + "`");
    buildRegexes(jaLabel);
    displayKanji(jaLabel);

    // If the japanese title is not just only kana, get the reading
    if (!kanjiRegexes.kanaOnly.test(jaLabel)) {
        requestKana();
    }
}

// Populates the shared `kanjiRegexes` object with the patterns used to
// classify the Japanese title and to find its reading in a jawiki lead:
//   latinOnly / kanaOnly / hiraganaOnly / katakanaOnly - whole-string tests
//   lead / leadUnspaced - the title followed by "(" and a captured reading
//
// kanji: the (halfwidth-normalised) Japanese title.
function buildRegexes(kanji) {
    // Strip $kanji of all kanji and kana, adding whatever is left to the regex
    var reKanjiKana = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴー-]/g;
    var kanjiStripped = kanji.replace(reKanjiKana, "");
    // The leftovers are spliced into character classes below, so "]" and "\"
    // must be escaped or they would close the class early / escape the
    // following character.
    kanjiStripped = kanjiStripped.replace(/[\\\]]/g, "\\$&");
    kanjiStripped += " ";
    // Need to add hyphen escaped since it has special behavior in regex classes
    // TODO: Just escape $kanji early instead, like we did before?
    kanjiStripped += "\\-";
    // Same leftovers minus word characters: punctuation/symbols only
    var kanjiAuxillary = kanjiStripped.replace(/\w/g, "");

    kanjiRegexes.latinOnly = /^[A-Za-z0-9\-.?!/,:;@#$%&+=*'"・ ]+$/;
    kanjiRegexes.kanaOnly = new RegExp("^[ぁ-ゔァ-ヴー" + kanjiAuxillary + "]+$");
    kanjiRegexes.hiraganaOnly = new RegExp("^[ぁ-ゔーA-Za-z" + kanjiAuxillary + "]+$");
    kanjiRegexes.katakanaOnly = new RegExp("^[ァ-ヴーA-Za-z" + kanjiAuxillary + "]+$");

    // Add midpoint for Latin in titles
    if (/\w/.test(kanji)) {
        kanjiStripped += "・";
    }
    console.log("showKanji-dev.js: stripped: `" + kanjiStripped + "`");

    var leadReBase = "([ぁ-ゔァ-ヴー" + kanjiStripped + "]+)";
    var kanjiEscaped = mw.util.escapeRegExp(kanji);
    // Account for spaces, but ignore backslash and other misc characters
    var reKanjiKanaLatin = /([\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴーA-Za-z0-9])/g;
    var kanjiSpaced = kanjiEscaped.replace(/ /g, " ?");
    kanjiSpaced = kanjiSpaced.replace(reKanjiKanaLatin, "$1 ?");

    // Add kanji to regex to make sure we're not getting the reading of some
    // other term
    kanjiRegexes.leadUnspaced = new RegExp(kanjiEscaped + "[^(\n)]*?\\(" + leadReBase);
    kanjiRegexes.lead = new RegExp(kanjiSpaced + "[^(\n)]*?\\(" + leadReBase, "i"); // brittle
}

// Shows the Japanese title inside #kanjiInfo and tags the element with a
// script-type class and tooltip. Also records the title in the shared
// `wikidataKanji` variable for the later reading lookups.
//
// kanji: the (halfwidth-normalised) Japanese title.
function displayKanji(kanji) {
    wikidataKanji = kanji;

    var info = $("#kanjiInfo");
    // displayKana() appends the reading to "#kanjiInfo ruby", so the title has
    // to live inside a <ruby> element. The label comes from a remote API, so
    // it is inserted as text rather than concatenated into HTML.
    // NOTE(review): the original markup string was lost (only
    // " " + kanji + " " survived); the <ruby><rb> structure is reconstructed -
    // confirm against what bindKana.js expects.
    info.append($("<ruby>").append($("<rb>").text(kanji)));

    // Add some classes so users can choose to not display for example
    // katakana-only kanji in their CSS
    if (kanjiRegexes.latinOnly.test(kanji)) {
        info.addClass("kanjiInfo-latin-only");
        info.prop("title", "Japanese title in Latin script");
        info.css("display", "none");
    } else if (kanjiRegexes.hiraganaOnly.test(kanji)) {
        info.addClass("kanjiInfo-hiragana-only");
        info.prop("title", "Japanese title in hiragana");
    } else if (kanjiRegexes.katakanaOnly.test(kanji)) {
        info.addClass("kanjiInfo-katakana-only");
        info.prop("title", "Japanese title in katakana");
    } else {
        info.prop("title", "Japanese title in kanji");
    }
}

// Requests all claims of the current Wikidata item; parseKanaClaim() then
// looks for a kana reading among them.
function requestKana() {
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetclaims
    // We have to wholesale get all the claims instead of just one because the
    // kana might be present as a qualifier to another claim
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: {
            action: "wbgetclaims",
            entity: wikidataId,
            format: "json",
            origin: "*"
        },
        success: parseKanaClaim
    });
}

// Success handler for the wbgetclaims request made in requestKana().
// Looks for a "name in kana" (P1814) value, first as a qualifier on one of the
// name-like claims whose text equals the displayed title, then as a claim of
// its own. Falls back to getInterlanguage() when none is found.
//
// response: parsed JSON body of the wbgetclaims call.
function parseKanaClaim(response) {
    var kana;
    var properties = {
        title: "P1476",
        nativeLabel: "P1705",
        officialName: "P1448",
        nameInNativeLanguage: "P1559"
    };
    var nameInKana = "P1814";
    // An error response carries no `claims`
    var claims = (response && response.claims) || {};
    var wantedKanji = wikidataKanji.replace(/ /g, "");

    // Try getting nameInKana as a qualifier to some properties
    var prop;
    for (prop in properties) {
        var pnum = properties[prop];
        var claim = claims[pnum] && claims[pnum][0];
        // A snak may have no datavalue at all, so check each level before
        // dereferencing
        var value = claim && claim.mainsnak && claim.mainsnak.datavalue
            && claim.mainsnak.datavalue.value;
        if (!value || typeof value.text !== "string") {
            continue;
        }

        var qualifier = claim.qualifiers
            && claim.qualifiers[nameInKana]
            && claim.qualifiers[nameInKana][0];
        if (value.text.replace(/ /g, "") === wantedKanji
            && qualifier && qualifier.datavalue) {
            kana = qualifier.datavalue.value;
            break;
        }
    }

    // Try getting nameInKana as a general claim
    if (!kana && claims[nameInKana]) {
        var kanaSnak = claims[nameInKana][0] && claims[nameInKana][0].mainsnak;
        if (kanaSnak && kanaSnak.datavalue) {
            prop = "nameInKana";
            kana = kanaSnak.datavalue.value;
        }
    }

    // We couldn't find nameInKana
    if (!kana) {
        getInterlanguage();
        return;
    }

    kana = kana.toHalfWidth();
    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-wikidata");
    // `prop` names the property the reading was taken from
    $("#kanjiInfo").addClass("kanjiInfo-wikidata-" + prop);
}

// Asks the local wiki for this page's Japanese interlanguage link. If one
// exists, its title (minus any #anchor) is passed to scrapeKana(); otherwise
// falls back to getWiktionary().
function getInterlanguage() {
    var apiUrl = location.origin + "/w/api.php";
    // Documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
    $.ajax({
        url: apiUrl,
        data: {
            action: "query",
            format: "json",
            prop: "langlinks",
            lllang: "ja",
            titles: mw.config.get( 'wgTitle' )
        },
        success: function(response) {
            var pageId = mw.config.get( 'wgArticleId' );
            // An error response carries no `query`
            var pages = response && response.query && response.query.pages;
            var page = pages ? pages[pageId] : undefined;
            var langlinks = page ? page.langlinks : undefined;

            if (!langlinks || !langlinks.length) {
                getWiktionary();
                return;
            }

            var jaLabel = langlinks[0]["*"];
            jaLabel = jaLabel.replace(/(.*)#.*/, "$1"); // rm anchors
            scrapeKana(jaLabel);
        }
    });
}

// Requests the plain-text start of a Japanese Wikipedia article's intro;
// getFirstSentence() then searches it for the reading.
//
// jaLabel: title of the jawiki article.
function scrapeKana(jaLabel) {
    // Get jawiki article's lead wikitext
    // API docs: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bextracts
    $.ajax({
        url: "https://ja.wikipedia.org/w/api.php",
        data: {
            action: "query",
            prop: "extracts",
            format: "json",
            redirects: true,
            exintro: true,
            exsentences: 2,
            exlimit: 1,
            explaintext: true,
            titles: jaLabel,
            origin: "*"
        },
        success: getFirstSentence
    });
}

// Success handler for the extracts request made in scrapeKana().
// Searches the jawiki lead for the title followed by a parenthesised reading
// (kanjiRegexes.lead) and displays the reading; falls back to
// getWiktionary() when the lead is missing, nothing matches, or the match
// is not a usable reading.
//
// response: parsed JSON body of the extracts call.
function getFirstSentence(response) {
    // An error response carries no `query`
    var responsePart = (response && response.query && response.query.pages) || {};
    // Have to split parsing into two parts since jawiki pageid is unknown
    var pageId = Object.keys(responsePart)[0];
    var introText = pageId === undefined ? undefined : responsePart[pageId].extract;

    if (!introText) {
        console.error("showKanji-dev.js: TextExtracts failed to get a lead for the Japanese article.");
        getWiktionary();
        return;
    }

    var wikitext = introText.toHalfWidth();

    console.log("showKanji-dev.js: lead: `" + wikitext + "`");
    console.log("showKanji-dev.js: regex: `" + kanjiRegexes.lead + "`");
    console.log("showKanji-dev.js: regex (unspaced): `" + kanjiRegexes.leadUnspaced + "`");

    // A match has exactly two entries: the whole match and the reading
    var kanaSearch = wikitext.match(kanjiRegexes.lead);
    if (!kanaSearch || kanaSearch.length !== 2) {
        getWiktionary();
        return;
    }
    var kana = kanaSearch[1];

    // Rm trailing characters
    kana = kana.replace(/[・、 ]$/, "");

    // Abort if our reading is only katakana (for non-Latin) or Latin
    var titleIsLatin = kanjiRegexes.latinOnly.test(wikidataKanji);
    if ((!titleIsLatin && kanjiRegexes.katakanaOnly.test(kana))
        || kanjiRegexes.latinOnly.test(kana)) {
        console.log("showKanji-dev.js: throwing away reading: " + kana);
        getWiktionary();
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-jawiki");
}

// Adapted from:
//    http://ilog4.blogspot.com/2015/09/javascript-convert-full-width-and-half.html
//    https://stackoverflow.com/a/20488304/1995949
//    https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms
//
// Returns a copy of the string with fullwidth forms U+FF01-U+FF5E shifted
// down by 0xFEE0 to their ASCII counterparts and ideographic spaces (U+3000)
// replaced by regular spaces. All other characters are left untouched.
String.prototype.toHalfWidth = function () {
    var halfWidth = this.replace(/[\uff01-\uff5e]/g, function (s) {
        return String.fromCharCode(s.charCodeAt(0) - 0xFEE0);
    });
    halfWidth = halfWidth.replace(/\u3000/g, " ");
    return halfWidth;
};

// We use the English Wiktionary because it has more terms and better structure
//
// Last-resort lookup: requests the section list of the Wiktionary entry named
// after the Japanese title; findJapaneseSection() continues from there.
function getWiktionary() {
    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: {
            action: "parse",
            format: "json",
            page: wikidataKanji,
            prop: "sections",
            origin: "*"
        },
        success: findJapaneseSection
    });
}

// Success handler for the sections request made in getWiktionary().
// Finds the section titled "Japanese" and requests its rendered HTML, which
// parseWiktionary() then processes. Stops quietly when the entry or the
// section doesn't exist.
//
// response: parsed JSON body of the parse/sections call.
function findJapaneseSection(response) {
    if (response.error) {
        console.log("showKanji-dev.js: No Wiktionary item for " + wikidataKanji);
        return;
    }

    // Tolerate a response that has neither `error` nor `parse.sections`
    var sections = (response.parse && response.parse.sections) || [];
    var sectionIndex;
    for (let i = 0; i < sections.length; i++) {
        if (sections[i].line === "Japanese") {
            sectionIndex = sections[i].index;
            break;
        }
    }
    if (sectionIndex == null) {
        console.log("showKanji-dev.js: Wiktionary entry doesn't have a section titled 'Japanese'");
        return;
    }

    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: {
            action: "parse",
            format: "json",
            page: wikidataKanji,
            prop: "text",
            section: sectionIndex,
            origin: "*"
        },
        success: parseWiktionary
    });
}

// Success handler for the section-text request made in findJapaneseSection().
// Pulls the reading out of the Wiktionary headword's furigana (or, failing
// that, out of the first ".Jpan ruby" element), displays it, and uses the
// first definition line as the tooltip when one is found.
//
// response: parsed JSON body of the parse/text call.
function parseWiktionary(response) {
    if (!response.parse || !response.parse.text) {
        return;
    }
    var html = response.parse.text["*"];
    var parsed = $($.parseHTML(html));

    // Wiktionary adds readings as furigana
    var headword = parsed.find(".headword:lang(ja)").first();
    var seeTable = parsed.find(".Jpan ruby").first();
    var kanji = "";
    var kana = "";
    if (headword.length) {
        // Wiktionary already binds their kana, so we have to undo the process to get
        // the constituent parts, at least with the current markup
        var childNodes = headword[0].childNodes;
        for (let i = 0; i < childNodes.length; i++) {
            var node = childNodes[i];
            if (node.nodeName == "RUBY") {
                var ruby = $(node); // convert back to JQuery for convenience
                // Order matters: drop the <rp> fallbacks, pull the reading out
                // of <rt>, and only then read what's left as the base text
                ruby.children("rp").remove();
                kana += ruby.children("rt").detach().text();
                kanji += ruby.text();
            } else if (node.nodeType == 3) { // "#text"
                kanji += node.nodeValue;
                kana += node.nodeValue;
            }
        }
        // The headword must spell exactly the title we are displaying
        if (kanji != wikidataKanji) {
            return;
        }
    } else if (seeTable.length) {
        kanji = seeTable.children("rb").text();
        kana = seeTable.children("rt").text();
    } else {
        return;
    }

    if (kana) {
        displayKana(kana);
        $("#kanjiInfo").addClass("kanjiInfo-wiktionary");

        // Extra stuff just for fun
        var definition = headword.parent().siblings("ol").children("li").first().text();
        definition = definition.split('\n', 1)[0];
        definition = definition.replace(/\[[0-9]{1,2}\]/g, "");
        // Don't blank the existing tooltip when no definition was found
        // (e.g. the reading came from the ".Jpan ruby" fallback)
        if (definition) {
            $("#kanjiInfo").prop("title", definition);
        }
    }
}

// Attaches the reading to the displayed title as ruby text and, unless the
// title consists solely of kanji, loads the companion bindKana script to tidy
// up the ruby display.
//
// kana: the reading to show.
function displayKana(kana) {
    // The reading comes from remote APIs / scraped text, so insert it as text
    // rather than concatenating it into an HTML string.
    // NOTE(review): the original string's tags were lost (only "" + kana + ""
    // survived); <rt> is the ruby-text element the target selector implies.
    $("#kanjiInfo ruby").append($("<rt>").text(kana));

    // Cleanup redundant furigana with another script
    var kanjiOnlyRe = /^[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]+$/;
    if (!kanjiOnlyRe.test(wikidataKanji)) {
        mw.loader.load( '//en.wikipedia.org/w/index.php?title=User:Opencooper/bindKana-dev.js&action=raw&ctype=text/javascript' );
    }
}

// Shared state for the whole script.

// Wikidata item ID of the current page, as reported by MediaWiki's config
var wikidataId = mw.config.get( 'wgWikibaseItemId' );

// Japanese title currently shown; assigned by displayKanji()
var wikidataKanji;

// Regexes derived from the Japanese title; filled in by buildRegexes()
var kanjiRegexes = {};

// Kick everything off through jQuery's ready shorthand
$(setup);