-- Module:Sandbox/Trappist the monk/Emoji short name data make

--[[

this module reads html extracted from https://www.unicode.org/cldr/charts/xx/annotations/americas.html and creates a series of data tables suitable for Module:Emoji

1. open https://www.unicode.org/cldr/charts/xx/annotations/americas.html where xx is the version number 2. note the version number and date at the top of the page 3. view page source 4. scroll down to the html table that holds the annotations 5. select and copy the whole html text of that table to the clipboard 6. paste into this module's doc page inside the comment markup 7. update the date (first positional parameter) and version number (second positional parameter) in this module's invocation 8. save 9. copy the rendered table(s) from the module documentation and paste them over the existing table(s) in Module:Emoji/data

]]

require ('strict');

--[[--< R E N D E R _ O U T P U T >

render the locale tables that this module creates. ]]

local function render_output (frame, out_t)
	-- frame: the frame object; only its preprocess() method is used here
	-- out_t: sequence of strings, one per locale table, as made by table_string_make()
	-- returns: the preprocessed wikitext that displays all of the tables as highlighted Lua source
	--
	-- NOTE(review): the opening / closing markup was not recoverable from the stored copy of this function
	-- (only a single-space string survived and out_t was never used); <syntaxhighlight lang="lua"> is
	-- inferred from the 'preprocess for syntax highlighting' comment -- confirm against a clean copy
	local temp_t = {};
	table.insert (temp_t, '<syntaxhighlight lang="lua">\n');					-- to open the rendering
	table.insert (temp_t, table.concat (out_t, '\n\n'));						-- the locale tables, separated by a blank line
	table.insert (temp_t, '\n</syntaxhighlight>');								-- to close the rendering
	return frame:preprocess (table.concat (temp_t));							-- make a big string, preprocess for syntax highlighting and done
end

--[[--< T A B S >--

return the number of tabs needed to position the line comment at column 80

length is the number of characters counted from the left margin (tabs count as 4). The length of this table opening: local en_emoji_hex_from_name_t = { is 34, so length in the function call is 34. That makes the white space to fill 80 - 34 = 46 columns, and 46 / 4 is 11 with a remainder of 2, so for this, return 12.

for text that is 80 or more characters long, return 1.

]]

local function tabs (length)													-- local function to calculate number of tabs needed between end of entry and column 80 comment
	-- length: number of columns already used on the line (tabs count as 4)
	-- returns: number of tab characters to emit; never less than 1
	local white_space = 80 - length;											-- comments begin at column 80
	local tab_count = math.ceil (white_space / 4);								-- whole tabs plus one more when there is a remainder
	return math.max (1, tab_count);												-- minimum of 1 so that long entries still get a separator before the comment
end

--[[--< T A B L E _ S T R I N G _ M A K E >

make a big string from a locale data table, its name, and the source file's version and timestamp

]]

local function table_string_make (locale_t, table_name, timestamp, version)
	-- locale_t: sequence of 'pretty' entry strings for one locale; NOTE: sorted and extended in place
	-- table_name: name used in the emitted 'local <name> = {' declaration
	-- timestamp, version: strings written into the trailing comment on the declaration line
	-- returns: the whole table as one newline-separated string
	table.sort (locale_t);														-- ascending sort
	table.insert (locale_t, '\t}');												-- close the table
	table.insert (locale_t, 1, table.concat ({									-- insert this at the start of the output sequence
		'local ',																-- declaration
		table_name,																-- name of the table
		' = {',																	-- rest of the opening stuff
		string.rep ('\t', tabs (10 + string.len (table_name))),					-- tabs to position the version/timestamp comment; 10 is the length of 'local ' plus ' = {'
		'-- v.',																-- version prefix
		version,																-- the version
		'; ',																	-- separator
		timestamp,																-- and the timestamp
		}));

	return table.concat (locale_t, '\n');										-- make a big string and done
end

--[[--< P R E T T I F Y >--

make a 'pretty table entry' from the emoji name (the key), its hex value (the value), and a comment showing the emoji

]]

local function prettify (emoji_name, hex, emoji)
	-- emoji_name: the key; already lower-cased and escaped by the caller
	-- hex: the emoji's hexadecimal code point string (the value)
	-- emoji: the character itself, shown in the trailing comment
	-- returns: one line of the form <tab>['emoji_name'] = 'hex',<tabs>-- emoji
	return table.concat ({
		'\t[\'',																-- indent one tab space; open index
		emoji_name,																-- add emoji name as index
		'\'] = \'',																-- close index; add assignment operator; open name
		hex,																	-- add emoji hex value
		'\',',																	-- close name
		string.rep ('\t', tabs (14 + mw.ustring.len (emoji_name) + string.len (hex))),	-- add enough tabs to get to column 80; 14 is the fixed text: tab (4) + [' (2) + '] = ' (6) + ', (2)
		'-- ',																	-- start a comment
		emoji,																	-- and add the emoji
		});
end

----< M A I N >--

--[[
entry point; exported as 'main' in the table returned at the end of this module.

what the visible code intends:
	1. build the title <frame title>/doc and read that page's content
	2. take the timestamp from frame.args[1] and the version from frame.args[2]
	3. for each row matched in the content, pick out the emoji character, its lower-cased hex value, and the
	   name cell; split the name cell into per-locale names; lower-case each name, replace whitespace with
	   underscores, and escape apostrophes / curly quotes
	4. append a prettify() entry to the table for each locale tag (en, en_001, en_AU, en_CA, en_GB, en_IN);
	   any other tag raises 'unhandled locale: <tag>'
	5. turn each locale table into a string with table_string_make() and hand the list to render_output()

NOTE(review): as stored here this function is not runnable Lua and must be restored from a clean copy:
	- physical lines have been joined, so the first '--' on each line comments out every statement after it
	- empty call parentheses appear to be missing (frame:getTitle, title_object_t:getContent, :lower)
	- the markup inside the row pattern, the name pattern and the mw.text.split separator appears to have
	  been stripped, and the 'for td in row:gmatch (' loop header is truncated
	- locale:match ((.-)) has lost its string delimiters
the missing pieces cannot be determined from this copy, so the code below is left exactly as found.
--]]
local function main (frame) local page_title = frame:getTitle .. '/doc'; local title_object_t = mw.title.new (page_title);							-- get the title object for the doc page invoking this module

local content = title_object_t:getContent;								-- get the content of that page local timestamp = frame.args[1];											-- get the timestamp local version = frame.args[2];												-- get the version

local en_data_t = {};														-- generic en locale data local en_001_data_t = {};													-- en-001 locale data local en_AU_data_t = {};													-- en-AU locale data local en_CA_data_t = {};													-- en-CA locale data local en_GB_data_t = {};													-- en-GB locale data local en_IN_data_t = {};													-- en-IN locale data

for row in content:gmatch (' .- ') do								-- get each row from the html table local cells_t = {};														-- holds the first three cells (Char, Hex, English) for td in row:gmatch (' will be empty local emoji = cells_t[1]:match ("name='([^']+)");					-- the character for use in comment local hex = cells_t[2]:match ('>([%x ]+)<'):lower;				-- the character's hexadecimal value(s); down cased local name; local en_names_t = mw.text.split (cells_t[3], ' ');				-- split the name cell on the tag which is used to visually separate locales for _, locale in ipairs (en_names_t) do				name = locale:match ('%*(.-)'):lower;					-- emoji name; down cased name = name:gsub ('%s+', '_');									-- replace whitespace with underscore name = name:gsub ("'", "\\'");									-- escape ' (U+0027 typewriter apostrophe) TODO: necessary? name = mw.ustring.gsub (name, '[“”‘’]', {					['“'] = '\"',												-- replace “” (U+201C & U+201D) with ' (U+0022 typewriter quote) TODO: necessary?					['”'] = '\"',					['‘'] = "\\'",												-- replace ‘’ (U+2018 & U+2019) with ' (U+0027 typewriter apostrophe) TODO: necessary?					['’'] = "\\'",					}); local locales_list = locale:match ((.-)) or 'en';		-- get the locales list for this emoji name if present; 'en' else local locales_list_t = mw.text.split (locales_list, '%s*,%s*');	-- split the list on comma-space pairs

for _, locale in ipairs (locales_list_t) do						-- for each locale tag in the cell, add an entry in the appropriate locale table if 'en' == locale then										-- generic English table.insert (en_data_t, prettify (name, hex, emoji)); elseif 'en_001' == locale then								-- English in the 'World' region table.insert (en_001_data_t, prettify (name, hex, emoji)); elseif 'en_AU' == locale then								-- Australian English table.insert (en_AU_data_t, prettify (name, hex, emoji)); elseif 'en_CA' == locale then								-- Canadian English table.insert (en_CA_data_t, prettify (name, hex, emoji)); elseif 'en_GB' == locale then								-- United Kingdom English table.insert (en_GB_data_t, prettify (name, hex, emoji)); elseif 'en_IN' == locale then								-- Indian English table.insert (en_IN_data_t, prettify (name, hex, emoji)); else error ('unhandled locale: ' .. locale);					-- error trap in case newer versions of the source have other locales end end end end end

local out_t = {};															-- raw output data go here

for _, locale in ipairs ({'en', 'en_001', 'en_AU', 'en_CA', 'en_GB', 'en_IN'}) do		if 'en' == locale then													-- generic English table.insert (out_t, table_string_make (en_data_t, 'en_emoji_hex_from_name_t', timestamp, version)) elseif 'en_001' == locale then											-- English in the 'World' region table.insert (out_t, table_string_make (en_001_data_t, 'en_001_emoji_hex_from_name_t', timestamp, version)) elseif 'en_AU' == locale then											-- Australian English table.insert (out_t, table_string_make (en_AU_data_t, 'en_AU_emoji_hex_from_name_t', timestamp, version)) elseif 'en_CA' == locale then											-- Canadian English table.insert (out_t, table_string_make (en_CA_data_t, 'en_CA_emoji_hex_from_name_t', timestamp, version)) elseif 'en_GB' == locale then											-- United Kingdom English table.insert (out_t, table_string_make (en_GB_data_t, 'en_GB_emoji_hex_from_name_t', timestamp, version)) elseif 'en_IN' == locale then											-- Indian English table.insert (out_t, table_string_make (en_IN_data_t, 'en_IN_emoji_hex_from_name_t', timestamp, version)) end end return render_output (frame, out_t) end

----< E X P O R T S >

local exports_t = {};															-- everything this module makes available to callers
exports_t.main = main;															-- the only entry point
return exports_t;