-- Module:Sandbox/DePiep/uchar

-- todo split consist Char and Args
-- todo cwith double dotcircle 230/239, 233, 234
-- thought: option "special notes", listing: "whitespace, control, combining, NaC, .."

-- Each statement is on its own line: when the whole header shares one line
-- with the leading "--" comment, none of the requires below are executed.
require( 'strict' )

-- Export table; returned at the end of the module.
local p = {}

-- Dependencies.
local getArgs      = require( 'Module:Arguments' ).getArgs
local uChar_data   = mw.loadData( 'Module:Sandbox/DePiep/uchar/data' )
local uData        = require( 'Module:Unicode data' )
local uData_helper = require( 'Module:Sandbox/DePiep/uchar-helper' )
local uBaseConvert = require( 'Module:BaseConvert' )
local yesno        = require( 'Module:Yesno' )
local str          = require( 'Module:String' )
local plaintext    = require( 'Module:Plain text' )
--- local tabletools  = require('Module:TableTools')

-- Module-level state shared by the functions below.
local ERRstatus = '' -- last error text, written by convertHexInToHex0x and read by p.main
local tUchar    = {} -- table produced by getArgsAndProps; assigned in p.main

-- Shared constants.
local DOTTED_CIRCLE      = '◌' -- U+25CC
-- U+00A0, spelled as its two UTF-8 bytes so the literal cannot be mistaken
-- for (or silently replaced by) an ordinary U+0020 space.
local NBSP               = '\194\160'
local LEFT_TO_RIGHT_MARK = '&lrm;' -- U+200E LEFT-TO-RIGHT MARK (&lrm;)
local DEFAULT_IMAGE_SIZE = '21px'
local WS_BLUE            = 'lightblue'

-- Test helper: wraps s (as wikitext) in a <big> element that carries a
-- WS_BLUE background, nested inside <span id="testH">.
-- Returns the rendered markup as a string.
local function testH( s )
	local outer = mw.html.create( 'span' )
	outer:attr( 'id', 'testH' )

	-- tag() returns the new child node, so the styling lands on <big>
	local big = outer:tag( 'big' )
	big:css( 'background', WS_BLUE )
	big:wikitext( s )
	--:newline

	return tostring( outer )
end

-- Renders tChar.uChar inside <span id="testH"> at font-size 150%.
-- When tChar.uIsWhitespace is exactly true, a WS_BLUE background is added.
-- Returns the rendered markup as a string.
local function addStyles( tChar )
	local span = mw.html.create( 'span' )
		:attr( 'id', 'testH' )
		:css( 'font-size', '150%' )
		:wikitext( tChar.uChar )

	if tChar.uIsWhitespace == true then
		span:css( 'background', WS_BLUE )
	end
	--:newline

	return tostring( span )
end

-- Exported test entry point: passes the first positional argument to testH.
function p.testH( frame )
	local args = getArgs( frame )
	local text = args[1]
	return testH( text )
end

-- Exported test entry point: builds <div id="testdiv"> with width 100%,
-- the text 'Some text' and a child <hr>, and returns it as a string.
function p.testFromDoc( frame )
	local div = mw.html.create( 'div' )
	div:attr( 'id', 'testdiv' )
	div:css( 'width', '100%' )
	div:wikitext( 'Some text' )
	div:tag( 'hr' )
	return tostring( div )
end

-- FORMATTERS ===== ===== ===== ===== ===== ===== ===== =====

-- Stub for wrapping s in a <div>/<span>.
-- Only validates divspan: anything other than 'div' or 'span' yields nil.
-- Otherwise returns s unchanged plus an (empty) report string.
-- arg and val are accepted but not used yet.
local function inTag( s, arg, val, divspan )
	local report = ''

	if divspan ~= 'div' and divspan ~= 'span' then
		return nil -- ERR
	end

	return s, report
end

-- nil-safe wrapper around mw.text.decode: nil in, nil out.
local function decodeString( s )
	if s ~= nil then
		return mw.text.decode( s )
	end
	return nil
end

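-- Format string in ' [NOTE(review): the rest of this definition is missing from this copy of the module; only its header comment and the closing "end" survive - restore it from the page history or remove this stub]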

-- Use mono font-family ( from: Template:Mono )
-- Current behaviour: nil is treated as '', every run of whitespace is
-- collapsed to a single space, and one trailing space is appended.
-- NOTE(review): no font markup is emitted here although the header mentions
-- Template:Mono - presumably a wrapping tag was lost; confirm.
local function inMono( s )
	local text = s
	if text == nil then
		text = ''
	end

	-- the assignment keeps only gsub's first result (the string)
	local collapsed = string.gsub( text, '%s+', ' ' )
	return collapsed .. ' '
end

-- Formats s for the small-caps name slot.
-- Returns '' for nil or empty input, otherwise s padded with one space on
-- each side.
-- NOTE(review): the empty-string literals in the guard were missing (the
-- comparison and the return had no operand, a syntax error) and are restored
-- here. No small-caps markup is emitted; presumably a
-- <span class="smallcaps"> wrapper was lost along with them - confirm.
local function inSmallcaps( s )
	if ( s == nil ) or ( s == '' ) then
		return ''
	end
	-- Smallcaps/styles.css: span.smallcaps {font-variant: small-caps;}
	return ' ' .. s .. ' '
end

-- Builds an external link to fileformat.info.
-- With uHexBare0x given: link to the character page, labelled with
--   uHexFormat, e.g.
--   https://www.fileformat.info/info/unicode/char/00ad/index.htm
--   (the hex value goes into the URL lowercased; no 0x prefix).
-- With uHexBare0x nil: link to the General Category list for sGenCat, e.g.
--   https://www.fileformat.info/info/unicode/category/Nd/list.htm
local function xlLinkFileFormat( uHexBare0x, uHexFormat, sGenCat )
	if uHexBare0x == nil then
		-- GenCat list
		local target = sGenCat .. '/list.htm ff.info ' .. sGenCat
		return '[https://www.fileformat.info/info/unicode/category/' .. target .. ']'
	end

	-- Character data page
	local slug = string.lower( uHexBare0x )
	return '[https://www.fileformat.info/info/unicode/char/' .. slug .. '/index.htm ff.info ' .. uHexFormat .. ']'
end

-- UHEX HANDLERS & FORMATTERS - - - - - - - - -

-- Formats a '0x...' hex string into the normal form "U+00A9":
-- the '0x' prefix and leading zeros are dropped, then the digits are
-- left-padded with zeros to at least four places.
-- When uLink is not nil a todo marker is appended (linking not implemented).
local function formatUhex( uHex0x, uLink )
	-- each assignment keeps only gsub's first result (the string)
	local digits = string.gsub( uHex0x, '^0x', '' )
	digits = string.gsub( digits, '^0*', '' )

	-- string.rep yields '' for a non-positive count, so long values pass through
	local formatted = 'U+' .. string.rep( '0', 4 - #digits ) .. digits

	if uLink == nil then
		return formatted
	end
	return formatted .. '_[todo: fmt Uhex_link_U+]'
end

-- Formats a General Category code as inMono(code) .. '=' .. first field of
-- its entry in uChar_data.tGenCat. Returns '' when the code has no entry.
-- fmt is accepted but not used.
local function formatGenCat( sGenCat, fmt )
	local entry = uChar_data.tGenCat[sGenCat]
	if entry ~= nil then
		return inMono( sGenCat ) .. '=' .. entry[1]
	end
	return ''
end

-- Formats table ( array ) using concat
-- replace space by nbsp ( keep untrimmed sp )
-- in monospace font-family
--
-- Returns '<?>' for nil input, otherwise the '; '-joined list wrapped in
-- '<' ... '>' after passing through inMono.
local function formatTablelist( t ) -- unused?
	if t == nil then
		return '<?>'
	end

	local joined = table.concat( t, '; ' )

	-- Use the named constant so the replacement is unambiguously U+00A0, as
	-- the header documents, rather than a literal that looks like a space.
	joined = string.gsub( joined, '%s+', NBSP )

	-- Previously gsub's second result (the match count, always truthy) was
	-- passed on implicitly as decode's second argument; `true` keeps that
	-- behaviour but makes it explicit.
	joined = mw.text.decode( joined, true )

	return '<' .. inMono( joined ) .. '>'
end

-- Chooses the prefix shown before a (possibly combining) character.
--   is_combining : true when the character is a combining character
--   cWith        : raw |cwith= value (may be nil)
-- Returns prefix, report:
--   cwith absent or yes-like -> DOTTED_CIRCLE if is_combining == true, else ''
--   cwith no-like            -> '' (explicitly suppressed)
--   anything else            -> the cleaned cwith text itself
-- report is a debug trace of the decision.
-- todo need 4-way logic for cwith
local function formatCombiningChar( is_combining, cWith )
	local cleaned = decodeString( cWith )
	local report = 'is_combi: ' .. tostring( is_combining ) .. '; cwith: ' .. tostring( cleaned )

	-- strip wikicode; but save NBSP
	-- todo improve, test
	if cleaned ~= nil then
		cleaned = string.gsub( cleaned, NBSP, 'NBSP' )
		cleaned = plaintext._main( cleaned, false )
		cleaned = string.gsub( cleaned, 'NBSP', NBSP )
	end

	local choice = yesno( cleaned ) -- y/n/nil (3-way logic; 'foo' == nil)

	if cleaned == nil or choice == true then
		-- default: per is_combining
		report = report .. '_dflt non-combi = none'
		if is_combining == true then
			return DOTTED_CIRCLE, report .. '_dflt'
		end
		return '', report
	end

	if choice == false then
		-- explicitly false, so suppress
		return '', report .. '_false, suppress'
	end

	-- use character provided by cwith
	return cleaned, report .. '_cleanchar: ' .. tostring( cleaned )
end

-- READ & PROCESS ==== ====== ===== ===== ===== ===== ===== =====

-- Normalises a code point given in any hex notation ('U+00A9', 'u+a9',
-- '0x00A9', 'a9', with optional whitespace).
-- Returns four values on success:
--   uHex0x     '0x' .. uppercase digits as supplied (e.g. '0x00A9')
--   uHexNum    numeric value
--   uHexBare0x uppercase digits without prefix (e.g. '00A9')
--   uHexFormat 'U+XXXX' form from formatUhex
-- On failure returns nil and leaves the reason in ERRstatus.
local function convertHexInToHex0x( uHexAnyform )
	if ( uHexAnyform == nil ) or ( uHexAnyform == '' ) then
		ERRstatus = 'ERR convertHexInToHex0x: no uHex input'
		return nil
	end

	local uHexBare0x = decodeString( uHexAnyform )
	uHexBare0x = string.gsub( uHexBare0x, '%s', '' )
	-- prefixes are accepted in either case
	uHexBare0x = string.gsub( uHexBare0x, '^[Uu]%+', '' )
	uHexBare0x = string.gsub( uHexBare0x, '^0[xX]', '' )
	uHexBare0x = string.upper( uHexBare0x )
	local uHex0x = '0x' .. uHexBare0x

	-- number check: only plain hex digits qualify, so signs, fractions and
	-- exponent notation are rejected instead of being parsed by tonumber
	local uHexNum
	if string.find( uHexBare0x, '^%x+$' ) then
		uHexNum = tonumber( uHexBare0x, 16 )
	end

	if uHexNum == nil then
		-- report the offending input (uHexNum itself is always nil here)
		ERRstatus = 'ERR convertHexInToHex0x: uHex is not hex: >' .. tostring( uHexAnyform ) .. '<'
		return nil
	elseif ( uHexNum < 0 ) or ( uHexNum > 0x10FFFF ) then
		ERRstatus = 'ERR convertHexInToHex0x: uHex out of U+ range: ' .. uHex0x
		return nil
	end

	local uHexFormat = formatUhex( uHex0x )

	return uHex0x, uHexNum, uHexBare0x, uHexFormat
end

-- Converts a hex value to decimal via Module:BaseConvert (from 16 to 10).
-- nil in, nil out.
local function convertHexToDec( uHex0x )
	if uHex0x ~= nil then
		-- parentheses keep the result to a single value
		return ( uBaseConvert.convert( { n = uHex0x, base = 10, from = 16 } ) )
	end
	return nil
end

-- Converts a decimal value (number or numeric string) to hex via
-- Module:BaseConvert (from 10 to 16).
-- Returns nil for nil input and for input that is not a number, so the
-- caller reports a missing code point instead of handing n = nil to
-- BaseConvert.
-- todo: edge cases (negative, fractional input)
local function convertDecToHex( uDec )
	if uDec == nil then
		return nil
	end

	local nDec = tonumber( uDec, 10 )
	if nDec == nil then
		return nil
	end

	return uBaseConvert.convert( { n = nDec, base = 16, from = 10 } )
end

-- GET DATA ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ==== ====

-- Returns the result of uData.lookup_block for the numeric code point.
-- (Previously the lookup result was discarded and the placeholder 'blck'
-- was returned.)
local function getBlock( uHexNum )
	return uData.lookup_block( uHexNum )
end

-- Returns '<plane index>: <plane name>' for the numeric code point.
-- The index is the code point divided by 0x10000, rounded down; the name is
-- read from uChar_data.tPlanes.
local function getPlane( uHexNum )
	local planeIndex = math.floor( uHexNum / 0x10000 )
	local planeName = uChar_data.tPlanes[planeIndex]
	return planeIndex .. ': ' .. planeName
end

-- Canonical Combining Class (CCC) of a code point, looked up through the
-- -helper module; '' when the lookup yields nothing.
-- todo: 239 (230), 233, 234 = between spacing chars.
local function getCombiningClass( uHex0x ) -- CCC
	local class = uData_helper.lookup_combiningclass( uHex0x ) -- new -helper function
	if not class then
		return ''
	end
	return class
end

-- Returns the named character references for a code point, looked up by
-- decimal value in Module:Numcr2namecr, each formatted through inMono and
-- joined with ' &#x20;'. Returns nil when there is no entry.
-- demo: [168]='&amp;uml;, &amp;die;, &amp;Dot;, &amp;DoubleDot;'
-- fmt is accepted but not used.
--
-- A leftover debug assignment (uDec=169) used to overwrite the parameter,
-- so every character reported the entities of U+00A9; it is removed.
local function getNamedEntities( uDec, fmt )
	local tNamedEntitiesData = mw.loadData( 'Module:Numcr2namecr' )

	local nDec = tonumber( uDec )
	if nDec == nil then
		return nil
	end

	local sNameList = tNamedEntitiesData[nDec]
	if sNameList == nil then
		return nil
	end
	sNameList = decodeString( sNameList ) -- has literal '&amp;' in source

	-- each hit is the text between '&' and the following ';'
	local patstring = '%f[^&][^%;]+%f[%;]'
	local tNames = {}

	-- same upper bound as before: at most 21 lookups
	for hitIndex = 1, 21 do
		local hitWord = str._match( sNameList, patstring, 1, hitIndex, false, '' )
		hitWord = mw.text.trim( hitWord )
		if hitWord == '' then
			-- no more hits in the string
			break
		end
		tNames[#tNames + 1] = inMono( '&amp;' .. hitWord .. ';' )
	end

	return table.concat( tNames, ' &#x20;' ) -- double spaced
end

-- Returns the aliases of a code point grouped by alias type, read from
-- Module:Unicode data/aliases; nil when the code point has no entry.
-- The result always has the five keys abbreviation, alternate, control,
-- correction and figment, each an array (possibly empty) of alias names.
-- demo 0x002118 = weier
local function getAliases( uHex )
	local allAliases = mw.loadData( 'Module:Unicode data/aliases' )

	local entries = allAliases[uHex]
	if entries == nil then
		return nil
	end

	-- for 2-deep 5-subtable ( Aliases )
	local grouped = {
		abbreviation = {},
		alternate    = {},
		control      = {},
		correction   = {},
		figment      = {},
	}

	-- each entry is read as { type, name }; type selects the sub-table
	for _, pair in ipairs( entries ) do
		if type( pair ) == 'table' then
			local bucket = grouped[pair[1]]
			table.insert( bucket, pair[2] )
		end
	end

	return grouped
end

-- Maps a script code to its name using the `aliases` table of
-- Module:Unicode data/scripts.
-- Returns nil for nil input and '_unk' when the code has no alias.
local function getScriptName( sScriptISO )
	local scriptData = mw.loadData( 'Module:Unicode data/scripts' )

	if sScriptISO == nil then
		return nil
	end

	return scriptData.aliases[sScriptISO] or '_unk'
end

-- Formats the alias table from getAliases into one report string:
--   ' ALIASES: ' followed by ' <type>: <name>; <name>' for every alias type
--   that has at least one name.
-- Returns nil for nil input. fmt is accepted but not used ( fmt = report ).
--
-- Alias types are emitted in sorted order: iterating with pairs() gives no
-- defined order, which made the report differ between otherwise identical
-- calls.
local function formatAlias5( t5Alias, fmt )
	if t5Alias == nil then
		return nil
	end

	local kinds = {}
	for kind in pairs( t5Alias ) do
		kinds[#kinds + 1] = kind
	end
	table.sort( kinds, function( a, b )
		return tostring( a ) < tostring( b )
	end )

	local parts = { ' ALIASES: ' }
	for _, kind in ipairs( kinds ) do
		local names = t5Alias[kind]
		if #names > 0 then
			parts[#parts + 1] = ' ' .. kind .. ': ' .. table.concat( names, '; ' )
		end
	end

	return table.concat( parts )
end

-- 1. PARSE INCOMING ARGS
-- 2. READ PROPERTIES
--
-- origArgs: argument table (as produced by getArgs in p.main).
-- Returns a table with the normalised arguments and the looked-up
-- properties of the code point. When no valid code point can be derived,
-- the table holds only 'rprtOrigIDs' (uHex0x is nil) and the reason is in
-- ERRstatus.
--
-- The code point may be given as hex (|1= or |hex=), |dec= or |char=;
-- when several are given, hex wins over char, which wins over dec.
local function getArgsAndProps( origArgs )
	local tNewArgs = {}

	-- PART 1 READ & NORMALISE ORIG ARGUMENTS
	-- HEX DEC CHAR
	local inHex  = ( origArgs[1] or origArgs['hex'] ) or nil -- todo: split for check?
	local inDec  = origArgs['dec'] or nil
	local inChar = decodeString( origArgs['char'] ) or nil

	-- Count and list the supplied inputs explicitly; taking the length of a
	-- table that may contain holes gives an unspecified result.
	local nInputs = 0
	local rprtValues = ''
	local function noteInput( v )
		if v ~= nil then
			nInputs = nInputs + 1
			rprtValues = rprtValues .. ' ' .. tostring( v ) .. ';;'
		end
	end
	noteInput( inHex )
	noteInput( inDec )
	noteInput( inChar )

	local rprt = 'R-t0:0' .. ' R-t2:' .. nInputs .. rprtValues

	-- the base input; stays nil when nothing was supplied, so the converter
	-- reports "no uHex input"
	local uHexIn = nil
	if inDec ~= nil then
		uHexIn = convertDecToHex( inDec )
		rprt = rprt .. ' dec;'
	end
	if inChar ~= nil then
		uHexIn = convertDecToHex( mw.ustring.codepoint( inChar ) )
		rprt = rprt .. ' char;'
	end
	if inHex ~= nil then
		uHexIn = inHex
		rprt = rprt .. ' hex;'
	end

	-- REPORT todo: what if >1 input?: err msg, prio, conflictcheck
	-- 2023-02-04: removed "\|" "invalid escape sequence" ???
	tNewArgs['rprtOrigIDs'] = ' |ID in: #t4=' .. nInputs .. ':>' .. rprt .. tostring( uHexIn ) .. '<| '

	-- returns: uHex0x, uHexNum, uHexBare0x, uHexFormat
	tNewArgs['uHex0x'], tNewArgs['uHexNum'], tNewArgs['uHexBare0x'], tNewArgs['uHexFormat']
		= convertHexInToHex0x( uHexIn )
	if tNewArgs['uHex0x'] == nil then
		-- ERROR -- shortcut to error #1: no uHex (valid 0x) input
		return tNewArgs
	end

	-- local shortcut only
	local uHex0x = tNewArgs['uHex0x']
	local uHexNum = tNewArgs['uHexNum']

	-- DEC
	tNewArgs['uDec'] = convertHexToDec( uHex0x )

	-- OTHER ORIG ARGS
	tNewArgs['uNameLink'] = origArgs['link'] or origArgs['nlink'] -- old nlink = depr paramname
	tNewArgs['format']    = origArgs['format'] or ''
	tNewArgs['cwith']     = decodeString( origArgs['cwith'] )

	tNewArgs['uSize']     = origArgs['size']
	tNewArgs['uImage']    = origArgs['image']

	tNewArgs['html']      = origArgs['html'] -- depr?
	tNewArgs['ulink']     = origArgs['ulink'] -- old ulink = depr?

	-- test notice
	tNewArgs['test']      = origArgs['test'] or ''

	-- PART 2 READ & USE PROPERTIES == == == == == == == == == == == == == ==
	-- ASSIGNED, GenCat, Control, Char
	tNewArgs['uIsAssigned'] = uData.is_assigned( uHexNum )

	if tNewArgs['uIsAssigned'] == true then
		tNewArgs['uGenCat'] = uData.lookup_category( uHexNum )
		-- Build the reference from the bare digits: with uHex0x the text was
		-- '&#x0x...;', which is not a well-formed numeric character reference.
		tNewArgs['uChar'] = mw.text.decode( '&#x' .. tNewArgs['uHexBare0x'] .. ';' )
	else
		tNewArgs['uGenCat'] = 'Xx' -- todo not assigned == ?
		tNewArgs['uChar'] = 'ERR_not_assg' -- ERROR
	end
	tNewArgs['uBlock'] = uData.lookup_block( uHexNum )
	tNewArgs['uPlane'] = getPlane( uHexNum )

	-- CHAR replacement + control flag (one test instead of two identical ones)
	local isControl = ( tNewArgs['uGenCat'] == 'Cc' ) -- assuming this is 1:1
	if isControl then
		tNewArgs['uChar'] = '&#xFFFD;' -- '?' placeholder
	end
	tNewArgs['uIsControl'] = isControl

	--NAME, ALIASES
	tNewArgs['uName']   = uData.lookup_name( uHexNum )
	tNewArgs['Aliases'] = getAliases( uHexNum ) -- table5

	--PROPS Script, Latin, WS
	tNewArgs['uIsLatin']      = uData.is_Latin( tostring( tNewArgs['uChar'] ) )
	tNewArgs['uScript']       = uData.lookup_script( uHexNum )
	tNewArgs['uScriptName']   = getScriptName( tNewArgs['uScript'] )
	tNewArgs['uIsWhitespace'] = uData.is_whitespace( uHexNum )

	--PROPS rtl
	tNewArgs['uIsRtl'] = uData.is_rtl( tostring( tNewArgs['uChar'] ) )

	--PROPS2 COMBINING PREFIX Combining/cwith/dottedcircle, CCC
	tNewArgs['uIsCombining'] = uData.is_combining( uHexNum ) or false
	-- The class is read once, for every character: the former conditional
	-- lookup was always followed by this unconditional one.
	tNewArgs['uCombiningClass'] = getCombiningClass( uHexNum )
	tNewArgs['uCharPrefix'], tNewArgs['uCwithReport']
		= formatCombiningChar( tNewArgs['uIsCombining'], tNewArgs['cwith'] )

	-- CHAR SUFFIX; rtl
	if tNewArgs['uIsRtl'] == true then
		tNewArgs['uCharSuffix'] = LEFT_TO_RIGHT_MARK
	else
		tNewArgs['uCharSuffix'] = ''
	end

	--PROPS3: NamedEntities (reuses the decimal value computed above)
	tNewArgs['NamedEntities'] = getNamedEntities( tNewArgs['uDec'] )

	return tNewArgs
end

-- Entry point for calls from other modules; not implemented yet.
-- Always returns the placeholder text.
function p._main( args )
	local placeholder = '_todo _main'
	return placeholder
end

-- Exported entry point (#invoke): reads the arguments, collects the
-- properties of the requested code point into the module-level tUchar and
-- returns a one-line debug report about it.
-- When no valid code point is found, an error line with ERRstatus is
-- returned instead.
function p.main( frame )
	local origArgs = getArgs( frame, { trim = false, removeBlanks = false } )

	tUchar = getArgsAndProps( origArgs )
	local u = tUchar -- same table, shorter name

	if u.uHex0x == nil then
		return ' >' .. ( origArgs[1] or '?' ) .. '< ERR hexIn ' .. ERRstatus .. ' ' .. ( u.rprtOrigIDs or 'unk1' )
	end

	-- REPORT RPRT
	local out = formatUhex( u.uHex0x )

	--string together & css format
	u.uChar = u.uCharPrefix .. u.uChar .. u.uCharSuffix -- cwith, rtl,
	local styled = addStyles( u )

	-- NOTE(review): with |image= set only a space is appended; presumably
	-- the image markup is still to be added here - confirm.
	if u.uImage == nil then
		out = out .. ' ' .. styled .. ' '
	else
		out = out .. ' '
	end

	out = out .. inSmallcaps( u.uName )

	out = out .. ' [testing: ' .. u.test .. ']' .. ( u.rprtOrigIDs or '?' )
		.. '&rarr; ' .. u.uHex0x .. ' [' .. u.uDec .. 'dec]' .. '; ('
		.. xlLinkFileFormat( u.uHexBare0x, u.uHexFormat ) .. ') '
		.. 'GC: ' .. formatGenCat( u.uGenCat )
		.. ' (' .. xlLinkFileFormat( nil, nil, u.uGenCat ) .. ')'
		.. ' ASSIG: ' .. tostring( u.uIsAssigned ) .. '; '
		.. 'WS: ' .. tostring( u.uIsWhitespace )
		.. ' BLK: ' .. u.uBlock .. '; PLANE: ' .. u.uPlane .. '; '
		.. ' SC: ' .. u.uScript .. '=' .. u.uScriptName
		.. '; RTLsuffix:' .. tostring( u.uIsRtl ) .. '; '

	out = out .. ' COMBI PREFIX: >' .. u.uCharPrefix .. '<; ' .. u.uCwithReport
		.. '; CCC class:' .. ( u.uCombiningClass or '-' )

	if u.NamedEntities ~= nil then
		out = out .. ' NAMED ENTITIES: ' .. u.NamedEntities
	end

	if u.Aliases ~= nil then
		out = out .. formatAlias5( u.Aliases, 'report' )
	end

	return out
end

-- Exported test entry point: returns the result of
-- mw.ustring.codepoint( char, 1, 2 ) for the |char= argument.
function p.test( frame )
	local input = frame.args['char']
	return mw.ustring.codepoint( input, 1, 2 )
end

-- Exported test entry point: returns getScriptName for the first
-- positional argument.
function p.testScriptName( frame )
	local scriptCode = frame.args[1]
	return getScriptName( scriptCode )
end

return p