-- Module:Sandbox/Trappist the monk/Emoji data make

--[[

this module reads html of https://unicode.org/Public/emoji/latest/emoji-test.txt and creates a data table suitable for Module:Emoji

	1. open https://unicode.org/Public/emoji/latest/emoji-test.txt
	2. view page source
	3. select and copy the whole html text to clipboard
	4. paste into this module's doc page inside the comment markup
	5. save
	6. copy the rendered table from the module documentation and paste it over the existing table in Module:Emoji/data

Is ~/annotations/americas.html the best source?  What about:
	https://www.unicode.org/emoji/charts/full-emoji-list.html (takes a very long time to load) – currently v15.1 but:
		the html source (view source) loads relatively quickly but:
		that source is much much much 'longer than the maximum of 2,048 kilobytes'
	https://unicode.org/Public/emoji/15.1/emoji-sequences.txt; simple text is good but doesn't provide names for each code
	https://unicode.org/Public/emoji/15.1/emoji-test.txt; simple text is good; appears to provide names; there are
		duplicates qualified with FE0F as the last subcode; what to do about them?  names appear to be the same
		so drop the duplicates?  date and version can be read from the source

]]

require ('strict');

--[[--------------------------< R E N D E R _ O U T P U T >----------------------------------------------------

render the base table that this module creates.

	<frame> – frame object; only its preprocess() method is used here
	<out_t> – sequence of prettified table-entry strings; modified in place: a closing line is appended and a
		header line is inserted at index 1
	<version>, <timestamp> – strings written into the header comment; when either is nil, '?' is written in its
		place so that table.concat() does not fail on a hole in the header sequence

returns whatever frame:preprocess() returns for all of the lines in <out_t> joined with newline characters

NOTE(review): the leading space in the opening string and the trailing space in the closing string look like
places where wrapping markup may once have been; confirm against the live module before relying on them.

]]

local function render_output (frame, out_t, timestamp, version)
	table.insert (out_t, '\t} ');												-- to close the table

	local header = table.concat ({
		' local emoji_hex_from_name_t = {',										-- opening stuff
		string.rep ('\t', 13),													-- tabs to position the version/timestamp comment
		'-- v.',																-- version prefix
		version or '?',															-- the version
		'; ',																	-- separator
		timestamp or '?',														-- and the timestamp
		});
	table.insert (out_t, 1, header);											-- insert this at the start of the output sequence

	return frame:preprocess (table.concat (out_t, '\n'));						-- make a big string and done
end

--[[--------------------------< M A I N >----------------------------------------------------------------------

entry point.  Reads the content of the page named by frame:getTitle() with '/doc' appended, extracts the
'# Date:' timestamp, the '# Version:' number, and every line of the form

	<hex code points> ; <anything> # <emoji> E<n>.<n> <name>

and returns the rendered table text produced by render_output().

Each extracted line becomes one entry keyed by its lowercased hex string with a trailing ' FE0F' removed, so lines
that differ only in that trailing code point overwrite one another (the last one read is kept).

Raises an error when the doc page cannot be read or when the date or version cannot be found in it.  Lines that
match the hex pattern but carry no 'E<n>.<n> <name>' part, or no hex digits at all, are skipped.

]]

local function main (frame)
	local page_title = frame:getTitle() .. '/doc';
	local title_object_t = mw.title.new (page_title);							-- get the title object for the doc page invoking this module
	if not title_object_t then
		error ('unable to create a title object for: ' .. page_title);
	end

	local content = title_object_t:getContent();								-- get the content of that page
	if not content then
		error ('unable to read content of: ' .. page_title);
	end

	local timestamp = content:match ('# Date: (%d%d%d%d%-%d%d%-%d%d, %d%d:%d%d:%d%d) GMT');	-- get parts of the timestamp
	if not timestamp then
		error ('unable to find a \'# Date:\' line in: ' .. page_title);
	end
	timestamp = timestamp:gsub (',%s+', 'T');									-- 'YYYY-MM-DD, hh:mm:ss' -> 'YYYY-MM-DDThh:mm:ss'

	local version = content:match ('# Version: (%d+%.%d+)');
	if not version then
		error ('unable to find a \'# Version:\' line in: ' .. page_title);
	end

	local quote_map_t = {														-- replacements for typographic quote marks in emoji names
		['“'] = '\"',															-- replace “” (U+201C & U+201D)
		['”'] = '\"',
		['‘'] = "\\'",															-- replace ‘’ (U+2018 & U+2019) with escaped ' (U+0027 typewriter apostrophe)
		['’'] = "\\'",
		};

	local data_t = {};															-- raw data extracted from source html goes here indexed by emoji hex value(s)

	for line in content:gmatch ('([%x ]+;[^\n\r]+)[\n\r]+') do
		local hex = line:match ('[%x ]+');										-- one or more hexadecimal strings separated by space characters
		hex = mw.text.trim (hex);												-- remove extraneous whitespace
		hex = hex:gsub (' +FE0F$', '');											-- remove trailing u+FE0F
		hex = hex:lower();														-- down case

		local emoji = line:match ('# +([^ ]+) ');								-- get the emoji for use in the entry's comment
		local name = line:match ('E%d+%.%d+ (.+)');								-- get emoji name

		if name and '' ~= hex then												-- skip lines that are not emoji data lines
			name = name:gsub ("'", "\\'");										-- escape ' (U+0027 typewriter apostrophe)
			name = mw.ustring.gsub (name, '[“”‘’]', quote_map_t);
			name = name:gsub (' +', '_');										-- replace whitespace with single underscore
			name = name:lower();												-- down case

			data_t[hex] = {name, emoji};										-- add to the base data list
		end
	end

	local function tabs (hex, info_t)											-- local function to calculate number of tabs needed between end of entry and column 80 comment
		local length = 14 + mw.ustring.len (info_t[1]) + string.len (hex);		-- length of table entry; ustring.len because there are some multibyte characters
		local white_space = 80 - length;										-- comments begin at column 80
		return math.max (1, math.ceil (white_space / 4));						-- tabs are 4 columns wide; round up; minimum of 1 (for long entries)
	end

	local out_t = {};															-- prettified list goes here

	for hex, info_t in pairs (data_t) do										-- spin through data_t and make a prettified list
		table.insert (out_t, table.concat ({
			'\t[\'',															-- indent one tab space; open index
			info_t[1],															-- add emoji name as index
			'\'] = \'',															-- close index; add assignment operator; open name
			hex,																-- add emoji hex value
			'\',',																-- close name
			string.rep ('\t', tabs (hex, info_t)),								-- add enough tabs to get to column 80
			'-- ',																-- start a comment
			info_t[2] or '',													-- and add the emoji (empty when none was found on the line)
			}));
	end

	table.sort (out_t);															-- ascending sort

	return render_output (frame, out_t, timestamp, version);					-- make a big string and done
end

----< E X P O R T S >

return {																		-- functions that may be called from an {{#invoke:}}
	main = main,
	}