-- Module:Sandbox/DarmaniLink

require('strict'); local utf8 = require("Module:Unicode data") -- Converts romanji kana to modified hepburn, I recommend subst:ing

-- standard long vowel patterns: doubled-vowel romaji spelling -> macron vowel
local diacritics = {
	["aa"] = "ā", ["uu"] = "ū", ["ee"] = "ē", ["oo"] = "ō", ["ou"] = "ō",
	["Aa"] = "Ā", ["Uu"] = "Ū", ["Ee"] = "Ē", ["Oo"] = "Ō", ["Ou"] = "Ō",
	["AA"] = "Ā", ["UU"] = "Ū", ["EE"] = "Ē", ["OO"] = "Ō", ["OU"] = "Ō"
}

-- Order in which the patterns above are applied. The patterns overlap
-- ("oo"/"ou"/"uu"), so the result depends on which one runs first, and pairs()
-- gives no ordering guarantee. Every casing of "oo" runs before "ou", and every
-- casing of "ou" before "uu", so that "oou" -> "ōu" and "kouun" -> "kōun".
local diacriticOrder = {
	"aa", "Aa", "AA",
	"ee", "Ee", "EE",
	"oo", "Oo", "OO",
	"ou", "Ou", "OU",
	"uu", "Uu", "UU"
}

--- Replaces doubled-vowel spellings in a romaji string with macron vowels.
-- @param romanji string of romaji; must be a string (nil raises an error)
-- @return the string with every pattern listed in `diacritics` replaced
local function romanjiToHepburn(romanji)
	for _, target in ipairs(diacriticOrder) do
		-- the assignment keeps only gsub's first result (the new string)
		romanji = romanji:gsub(target, diacritics[target])
	end
	return romanji
end

--- Romanizes a hiragana string and then applies the long-vowel diacritics.
-- Characters found in neither lookup table are copied through unchanged.
-- The lookup tables are deliberately kept local to this function, as in the
-- original design.
-- @param kana string; the caller in this file passes text whose katakana has
--             already been shifted to hiragana
-- @return the romanized string after romanjiToHepburn has been applied
-- TODO: add a flag to disable the diacritics and return the plain romanji
local function kanaToHepburn(kana)
	--TODO split map up into consonant groups and create a jump table based off the unicode value
	-- NOTE(review): づ is kept as "dzu" as in the original table; modified
	-- Hepburn normally writes it "zu" - confirm which spelling is wanted.
	local kanaMap = {
		["あ"] = "a", ["い"] = "i", ["う"] = "u", ["え"] = "e", ["お"] = "o",
		["か"] = "ka", ["き"] = "ki", ["く"] = "ku", ["け"] = "ke", ["こ"] = "ko",
		["さ"] = "sa", ["し"] = "shi", ["す"] = "su", ["せ"] = "se", ["そ"] = "so",
		["た"] = "ta", ["ち"] = "chi", ["つ"] = "tsu", ["て"] = "te", ["と"] = "to",
		["な"] = "na", ["に"] = "ni", ["ぬ"] = "nu", ["ね"] = "ne", ["の"] = "no",
		["は"] = "ha", ["ひ"] = "hi", ["ふ"] = "fu", ["へ"] = "he", ["ほ"] = "ho",
		["ま"] = "ma", ["み"] = "mi", ["む"] = "mu", ["め"] = "me", ["も"] = "mo",
		["や"] = "ya", ["ゆ"] = "yu", ["よ"] = "yo",
		-- the small ゃ ゅ ょ live in smallKanaMap below
		["ら"] = "ra", ["り"] = "ri", ["る"] = "ru", ["れ"] = "re", ["ろ"] = "ro",
		["わ"] = "wa", ["ゐ"] = "wi", ["ゑ"] = "we", ["を"] = "wo", ["ん"] = "n",
		["が"] = "ga", ["ぎ"] = "gi", ["ぐ"] = "gu", ["げ"] = "ge", ["ご"] = "go",
		["ざ"] = "za", ["じ"] = "ji", ["ず"] = "zu", ["ぜ"] = "ze", ["ぞ"] = "zo",
		["だ"] = "da", ["ぢ"] = "ji", ["づ"] = "dzu", ["で"] = "de", ["ど"] = "do",
		["ば"] = "ba", ["び"] = "bi", ["ぶ"] = "bu", ["べ"] = "be", ["ぼ"] = "bo",
		["ぱ"] = "pa", ["ぴ"] = "pi", ["ぷ"] = "pu", ["ぺ"] = "pe", ["ぽ"] = "po",
		["ゔ"] = "vu"
	}
	local smallKanaMap = {
		["ぁ"] = "a", ["ぃ"] = "i", ["ぅ"] = "u", ["ぇ"] = "e", ["ぉ"] = "o",
		["ゕ"] = "ka", ["ゖ"] = "ke",
		["ゃ"] = "ya", ["ゅ"] = "yu", ["ょ"] = "yo"
	}
	-- used for the long-vowel mark "ー", which lengthens the preceding vowel
	local macronVowels = { a = "ā", i = "ī", u = "ū", e = "ē", o = "ō" }

	local romanji = ""

	-- first pass: romanize kana by kana, without long-vowel diacritics
	for codepoint in mw.ustring.gcodepoint(kana) do
		local char = mw.ustring.char(codepoint)
		local romanization = kanaMap[char]
		local smallRomanization = smallKanaMap[char]
		if romanization then
			romanji = romanji .. romanization
		elseif smallRomanization then
			local lastChar = romanji:sub(-1)
			if not smallRomanization:match("^[aiueoy]") or not lastChar:match("[aiueo]") then
				-- ゕ/ゖ never fuse with the previous syllable, and when the
				-- text so far does not end in a vowel there is nothing to
				-- replace: removing a byte there would drop a consonant such
				-- as ん or cut a multi-byte character in half
				romanji = romanji .. smallRomanization
			else
				local lead = romanji:sub(-3, -2)
				if lead == "sh" or lead == "ch" or lead == "ts" or lead == "dz"
					or romanji:sub(-2) == "ji" then
					-- しゅ => shu and じゃ => ja: only the vowel of the small
					-- kana is kept, never "shyu" or "jya"
					romanji = romanji:sub(1, -2) .. smallRomanization:sub(-1)
				else
					-- きゃ => kya, ふぁ => fa: drop the vowel, append the small kana
					romanji = romanji:sub(1, -2) .. smallRomanization
				end
			end
		elseif char == "ー" then
			-- katakana long-vowel mark: コーヒー => kōhī
			local long = macronVowels[romanji:sub(-1)]
			if long then
				romanji = romanji:sub(1, -2) .. long
			else
				romanji = romanji .. char
			end
		else
			-- character was not in either map, append it directly
			romanji = romanji .. char
		end
	end

	-- second pass: replace each "っ" with a doubled consonant
	for i = 1, mw.ustring.len(romanji) do
		if mw.ustring.sub(romanji, i, i) == "っ" then
			local nextChar = mw.ustring.sub(romanji, i + 1, i + 1)
			local doubled
			if nextChar == "" then
				-- a "っ" at the very end is dropped, as before
				doubled = ""
			elseif nextChar == "c" and mw.ustring.sub(romanji, i + 2, i + 2) == "h" then
				-- Hepburn writes っち as "tchi", not "cchi"
				doubled = "t"
			elseif nextChar:match("^[b-df-hj-np-tv-zB-DF-HJ-NP-TV-Z]$") then
				doubled = nextChar
			end
			-- before a vowel, punctuation or a non-ASCII character there is
			-- nothing to double, so the "っ" is left in place
			if doubled then
				romanji = mw.ustring.sub(romanji, 1, i - 1) .. doubled .. mw.ustring.sub(romanji, i + 1)
			end
		end
	end

	-- kana is converted to romanji, now change it to hepburn
	return romanjiToHepburn(romanji)
end

-- Code point bounds of the kana ranges handled below. They are written as
-- constants so they are not recomputed on every call.
local HIRAGANA_FIRST = 0x3041 -- "ぁ"
local HIRAGANA_LAST = 0x3096 -- "ゖ"
local KATAKANA_FIRST = 0x30A1 -- "ァ"
local KATAKANA_LAST = 0x30F6 -- "ヶ"
-- distance between a katakana and the matching hiragana in the unicode table
local KATAKANA_TO_HIRAGANA = KATAKANA_FIRST - HIRAGANA_FIRST

--- Detects kana in a string and rewrites katakana as hiragana in the same pass.
-- Detection has to inspect every code point against these bounds anyway, so
-- the conversion is done at the same time; this lets the hiragana lookup table
-- serve both scripts.
-- @param data string to scan
-- @return a two-element table: [1] true if any hiragana or katakana in the
--         ranges above was seen, [2] the string with that katakana replaced by
--         hiragana and every other character unchanged
local function checkForKanaPresentAndConvert(data)
	local kanaFound = false
	local pieces = {}
	for c in mw.ustring.gcodepoint(data) do
		if c <= 127 then
			-- short circuit for ascii, which is the intended use;
			-- kana support was intended to be a minor feature
		elseif HIRAGANA_FIRST <= c and c <= HIRAGANA_LAST then
			kanaFound = true
		elseif KATAKANA_FIRST <= c and c <= KATAKANA_LAST then
			kanaFound = true
			c = c - KATAKANA_TO_HIRAGANA
		end
		-- collect the pieces and join once at the end instead of rebuilding
		-- the whole string for every character
		pieces[#pieces + 1] = mw.ustring.char(c)
	end
	return {kanaFound, table.concat(pieces)}
end

--- Converts either kana or plain romaji to Hepburn with diacritics.
-- @param data input string; a nil/false value makes the function return nothing
-- @return the converted string
local function toHepburnKana(data)
	-- nothing to convert
	if not data then
		return
	end

	local scan = checkForKanaPresentAndConvert(data)
	local hasKana, hiraganaText = scan[1], scan[2]

	if not hasKana then
		-- no kana at all: the input is treated as romaji, and any other
		-- unicode text simply comes back with only the vowel patterns replaced
		return romanjiToHepburn(data)
	end

	-- kana present: romanize the hiragana-normalised copy of the input
	return kanaToHepburn(hiraganaText)
end

local p = {}
-- Option switches read by p.toHepburn ("romanji", "name"). Nothing in the
-- visible part of this module sets them, so both are currently off.
local flags = {}

--TODO add a performant way to detect if there is kana in a string
--this could be expanded to use bopomofo too

--- Entry point: converts the first argument to Hepburn romanization.
-- @param frame frame object; only frame.args[1] is read
-- @return the converted string, or nil when no first argument was supplied
function p.toHepburn(frame)
	local data = frame.args[1]
	if data == nil then
		-- the kana path already produced nil here; returning early also stops
		-- the romanji-only path from indexing a nil value
		return nil
	end

	local romanji
	if flags["romanji"] then
		-- input is declared to be romaji: only the long vowels are replaced
		romanji = romanjiToHepburn(data)
	else
		romanji = toHepburnKana(data)
	end

	if flags["name"] then
		-- Capitalise each word. The unicode-aware functions are needed because
		-- the byte-based %a/%w classes do not match macron vowels and would
		-- split "tōkyō" into several words.
		romanji = mw.ustring.gsub(romanji, "(%a)([%w_']*)", function(first, rest)
			return mw.ustring.upper(first) .. mw.ustring.lower(rest)
		end)
	end

	return romanji
end

-- Helper for trying the module from the Lua console on the module page:
-- simply hands back the second argument of the frame it is given.
p.toHepburnTEST = function(frame)
	local secondArgument = frame.args[2]
	return secondArgument
end

return p