-- Module:Make emoji zwj table

--[[

This module creates an associative table of emoji code points that may follow a zero-width joiner character (U+200D).

The module reads a copy of the Unicode Emoji ZWJ Sequences for UTS (typically emoji-zwj-sequences.txt found in https://unicode.org/Public/emoji/VV.V/ where VV.V is the Unicode version number). The copy of the Unicode data file is held inside html comments in the module's /doc page. From that file, the module extracts each hexadecimal code point that immediately follows a 200D (zero-width joiner) code point. The module saves each unique code point, transformed as necessary, to build a new version of emoji_t for use in Module:Citation/CS1/Configuration.

The module takes one positional parameter:

Parameter 1 (frame.args[1]) is the url that matches the Unicode data file. Alas, Lua modules cannot read external data files, so the url is merely used to document where the data may be found.

Use of this module is documented on its /doc page

]]

require('strict');

-- Known names for the code points that may follow U+200D in emoji ZWJ sequences.
-- main() looks names up here by hex key; a code point that is not listed gets an empty name in the generated output.
local emoji_names_t = {															-- keys are hex values from U+xxxx code points
	['2194'] = 'left right arrow',
	['2195'] = 'up down arrow',
	['2620'] = 'skull and crossbones',
	['2640'] = 'female sign',
	['2642'] = 'male sign',
	['2695'] = 'staff of aesculapius',
	['2696'] = 'scales',
	['26A7'] = 'male with stroke and male and female sign',
	['2708'] = 'airplane',
	['2744'] = 'snowflake',
	['2764'] = 'heavy black heart',
	['27A1'] = 'black rightwards arrow',
	['2B1B'] = 'black large square',
	['1F308'] = 'rainbow',
	['1F32B'] = 'fog',
	['1F33E'] = 'ear of rice',
	['1F373'] = 'cooking',
	['1F37C'] = 'baby bottle',
	['1F384'] = 'christmas tree',
	['1F393'] = 'graduation cap',
	['1F3A4'] = 'microphone',
	['1F3A8'] = 'artist palette',
	['1F3EB'] = 'school',
	['1F3ED'] = 'factory',
	['1F466'] = 'boy',
	['1F467'] = 'girl',
	['1F468'] = 'man',
	['1F469'] = 'woman',
	['1F48B'] = 'kiss mark',
	['1F4A5'] = 'collision symbol',
	['1F4A8'] = 'dash symbol',
	['1F4AB'] = 'dizzy symbol',
	['1F4BB'] = 'personal computer',
	['1F4BC'] = 'brief case',
	['1F525'] = 'fire',
	['1F527'] = 'wrench',
	['1F52C'] = 'microscope',
	['1F5E8'] = 'left speech bubble',
	['1F680'] = 'rocket',
	['1F692'] = 'fire engine',
	['1F7E9'] = 'large green square',
	['1F7EB'] = 'large brown square',
	['1F91D'] = 'handshake',
	['1F9AF'] = 'probing cane',
	['1F9B0'] = 'emoji component red hair',
	['1F9B1'] = 'emoji component curly hair',
	['1F9B2'] = 'emoji component bald',
	['1F9B3'] = 'emoji component white hair',
	['1F9BA'] = 'safety vest',
	['1F9BC'] = 'motorized wheelchair',
	['1F9BD'] = 'manual wheelchair',
	['1F9D1'] = 'adult',
	['1F9D2'] = 'child',
	['1FA79'] = 'adhesive bandage',
	['1FAF2'] = 'leftwards hand',
	}

----< M A I N >--

--[[
Builds wikitext that presents two ready-to-copy Lua tables generated from the Unicode emoji ZWJ sequences data held
in this module's ~/doc page:
	emoji_t - decimal keys, one for each unique code point that immediately follows a 200D code point in the data
	emoji_names_t - the same code points as hex-string keys, each assigned its name from this module's current
		emoji_names_t (empty string when no name is known yet)

frame - the frame object; frame.args[1] is the url of the Unicode data file and is only echoed into the output
	to document where the data came from

returns the assembled wikitext after it has been passed through frame:preprocess()

raises an error when the content of the ~/doc page cannot be read

NOTE(review): several output strings (headings, the 'table created by' line, the lines that open and close each
table listing) look as if wiki markup was stripped from them; the original comments speak of closing a tag.  They
are reproduced here exactly as found - TODO confirm against the live module before deploying.
]]

local function main (frame)
	local this_wiki = table.concat ({':', mw.language.getContentLanguage():getCode(), ':'});
	local title_obj = mw.title.getCurrentTitle();

	local doc_title_obj;
	if title_obj.prefixedText:match ('/doc$') then								-- viewing the ~/doc page standalone
		doc_title_obj = title_obj;
	else																		-- viewing the module page
		doc_title_obj = mw.title.new (table.concat ({title_obj.prefixedText, '/doc'}));
	end

	local content = doc_title_obj and doc_title_obj:getContent();				-- nil when the title is invalid or the page does not exist
	if not content then
		error ('Make emoji zwj table: cannot read the Unicode data file from the ~/doc page', 0);
	end

	local tabs_15 = string.rep ('\t', 15);										-- for six-digit keys
	local tabs_16 = string.rep ('\t', 16);										-- for keys that have fewer than six digits
	local file_date = content:match ('# *Date: *(%d%d%d%d%-%d%d%-%d%d)') or 'unknown';	-- file date of the Unicode source
	local file_version = content:match ('# *Version: *([%d%.]+)') or 'unknown';	-- version of the Unicode source
	local source_url = frame.args[1] or 'unknown';								-- table.concat() rejects nil so fall back to a placeholder

	local seen_t = {};															-- hex code points already collected; keys are the hex text
	local code_points_t = {};													-- sequence of unique code points, each as {hex = <text>, dec = <number>}

	for code_point in content:gmatch ('200D (%x+)') do							-- find each code point that follows a zwj
		if not seen_t[code_point] then											-- keep only the first occurrence
			seen_t[code_point] = true;
			table.insert (code_points_t, {hex = code_point, dec = tonumber (code_point, 16)});
		end
	end

	-- sort once here so that both generated tables are listed in ascending code point order
	table.sort (code_points_t, function (a, b) return a.dec < b.dec; end);

	local out_t = {																-- final output goes here; begins with the emoji_t prefix lines
		'==emoji_t ==',
		'use this table to overwrite same-named table in Module:Citation/CS1/Configuration/sandbox',
		' -- list of emoji that use a zwj character (U+200D) to combine with another emoji',
		table.concat ({'-- from: ', source_url, '; version: ', file_version, '; ', file_date}),
		table.concat ({'-- table created by: ', this_wiki, title_obj.nsText, ':', title_obj.baseText, ''}),
		table.concat ({'local emoji_t = {', tabs_16, '-- indexes are decimal forms of the hex values in U+xxxx'}),
		};
	local new_emoji_names_t = {};												-- lines used to update emoji_names_t in this module

	for _, code_point_t in ipairs (code_points_t) do
		local name = emoji_names_t[code_point_t.hex] or '';						-- empty string when we have no name for this code point

		table.insert (out_t, table.concat ({									-- build the emoji_t line for this code point
			'\t[',																-- open key markup
			code_point_t.dec,													-- code point in decimal
			'] = true,',														-- close key and assign it the value 'true'
			(100000 <= code_point_t.dec) and tabs_15 or tabs_16,				-- tabs between the k/v pair and its comment
			'-- U+',															-- start the comment; prefix for the hex code point
			code_point_t.hex,
			' &#x',																-- hex html entity prefix
			code_point_t.hex,
			'; ',																-- finish the html entity
			name,
			}));

		table.insert (new_emoji_names_t, table.concat ({						-- build the emoji_names_t line for this code point
			'\t[\'',															-- open key markup
			code_point_t.hex,													-- code point in hex
			'\'] = \'',															-- close key, open quote mark
			name,
			'\',',																-- closing quote mark and terminal comma
			}));
	end

	table.insert (out_t, '\t} ');												-- close the emoji_t listing

	table.insert (out_t, '==emoji_names_t ==');
	table.insert (out_t, table.concat ({'use this table to overwrite same-named table in ', this_wiki, title_obj.nsText, ':', title_obj.baseText, '; add missing names.'}));
	table.insert (out_t, table.concat ({'\n local emoji_names_t = {', tabs_15, '-- keys are hex values from U+xxxx code points'}));
	for _, v in ipairs (new_emoji_names_t) do
		table.insert (out_t, v);
	end
	table.insert (out_t, '\t} ');												-- close the emoji_names_t listing

	return frame:preprocess (table.concat (out_t, '\n'));						-- make a big string and done
end

----< E X P O R T S >

-- The module's export table: main is the only function made available to callers.
local exports_t = {};
exports_t.main = main;

return exports_t;