-- Module:WikitextParser

-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes, User:Aidan9382, et al.
-- License: CC-BY-SA-4.0

-- Module table that the parser functions are attached to
local WikitextParser = {}

-- Helper function to escape a string for use in Lua patterns
-- Every pattern magic character ( ^ $ ( ) . [ ] * + - ? % ) is prefixed with '%'
-- so the result matches the input literally when embedded in a larger pattern.
-- @param str Required. String to escape.
-- @return The escaped string, as a single value.
local function escapeString( str )
	-- The outer parentheses truncate gsub's two results (string, count) to the
	-- string alone, so the count cannot leak into a caller's argument list
	return ( str:gsub( '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end

-- Get the lead section from the given wikitext
-- The lead section is everything that precedes the first section title.
-- @param wikitext Required. Wikitext to parse.
-- @return Wikitext of the lead section. May be empty if the lead section is empty.
function WikitextParser.getLead( wikitext )
	-- Prefix a newline so a section title on the very first line is matched too,
	-- then drop everything from the first title onwards
	local lead = ( '\n' .. wikitext ):gsub( '\n==.*', '' )
	lead = mw.text.trim( lead )
	return lead
end

-- Get the sections from the given wikitext
-- This method doesn't get the lead section, use getLead for that
-- The content of a section ends at the next section title of any level.
-- If a title appears more than once, the content of its first instance is used.
-- @param wikitext Required. Wikitext to parse.
-- @return Map from section title (trimmed of surrounding whitespace) to section content
function WikitextParser.getSections( wikitext )
	local sections = {}
	-- The leading newline lets the first line be a title; the trailing '\n==' is a
	-- sentinel so the last section is terminated like every other one
	wikitext = '\n' .. wikitext .. '\n=='
	for rawTitle in wikitext:gmatch( '\n==+ *([^=]+) *==+' ) do
		-- rawTitle still carries the whitespace before the closing '=' signs (the
		-- greedy [^=]+ swallows it), so look the section up with the raw text but
		-- store it under the trimmed title
		local section = wikitext:match( '\n==+ *' .. escapeString( rawTitle ) .. ' *==+(.-)\n==' )
		if section then
			sections[ mw.text.trim( rawTitle ) ] = mw.text.trim( section )
		end
	end
	return sections
end

-- Get a section from the given wikitext (including any subsections)
-- When the title occurs more than once, only the first occurrence is returned
-- @param wikitext Required. Wikitext to parse.
-- @param title Required. Title of the section
-- @return Wikitext of the section, or nothing if it isn't found. May be empty if the section is empty or contains only subsections.
function WikitextParser.getSection( wikitext, title )
	local titlePattern = escapeString( mw.text.trim( title ) )
	local padded = '\n' .. wikitext .. '\n'

	-- level: the run of '=' opening the title; body: everything after the title line
	local level, body = padded:match( '\n(==+) *' .. titlePattern .. ' *==.-\n(.*)' )
	if not body then
		return
	end

	-- remove later sections at this level or higher
	local laterSections = '\n==' .. ( '=?' ):rep( #level - 2 ) .. '[^=].*'
	body = body:gsub( laterSections, '' )
	body = mw.text.trim( body )
	return body
end

-- Get the content of a section tag from the given wikitext.
-- We can't use getTags because both opening and closing tags are self-closing tags.
-- The begin and end markers accept the same spelling: optional spaces around
-- '=', optional single or double quotes, and optional padding around the name.
-- @param wikitext Required. Wikitext to parse.
-- @param name Required. Name of the tag
-- @return Content of the tag, or nil if it isn't found. May be empty if the section tag is empty.
function WikitextParser.getSectionTag( wikitext, name )
	name = escapeString( mw.text.trim( name ) )
	-- Attribute part shared by both markers, so they can't drift apart again
	local attribute = ' *= *["\']? *' .. name .. ' *["\']? */>'
	local content = wikitext:match( '< *section +begin' .. attribute .. '(.-)< *section +end' .. attribute )
	if content then
		return mw.text.trim( content )
	end
end

-- Get the lists from the given wikitext.
-- A list is a run of consecutive lines that each start with '*' or '#'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of lists, each one as the wikitext of its lines joined by newlines.
function WikitextParser.getLists( wikitext )
	local lists = {}
	local current -- lines of the list being collected, nil while outside a list

	-- Scan line by line rather than with a single pattern: a pattern has to
	-- consume the character that ends a list, which hides a list that begins
	-- right after a single blank line
	for line in ( wikitext .. '\n' ):gmatch( '(.-)\n' ) do
		if line:match( '^[*#]' ) then
			current = current or {}
			current[ #current + 1 ] = line
		elseif current then
			lists[ #lists + 1 ] = table.concat( current, '\n' )
			current = nil
		end
	end

	-- Flush a list that runs to the end of the text
	if current then
		lists[ #lists + 1 ] = table.concat( current, '\n' )
	end
	return lists
end

-- Get the paragraphs from the given wikitext.
-- Lists, file and category links, tables, block templates and section titles
-- are stripped first; what is left is split on blank lines.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of paragraphs.
function WikitextParser.getParagraphs( wikitext )
	-- Non-paragraph content to strip, as { pattern, replacement } pairs.
	-- The rules are applied in exactly this order.
	local rules = {
		{ '\n[*#][^\n]*', '' }, -- remove lists
		{ '\n%[%b[]%]\n', '' }, -- remove files and categories
		{ '\n%b{} *\n', '\n%0\n' }, -- add spacing between tables and block templates
		{ '\n%b{} *\n', '\n' }, -- remove tables and block templates
		{ '\n==+[^=]+==+ *\n', '\n' }, -- remove section titles
	}

	local text = '\n' .. wikitext .. '\n'
	for _, rule in ipairs( rules ) do
		text = text:gsub( rule[1], rule[2] )
	end
	text = mw.text.trim( text )

	-- Split on blank lines and keep only chunks with visible content
	local paragraphs = {}
	for chunk in mw.text.gsplit( text, '\n\n+' ) do
		if mw.text.trim( chunk ) ~= '' then
			paragraphs[ #paragraphs + 1 ] = chunk
		end
	end
	return paragraphs
end

-- Get the templates from the given wikitext.
-- Parser functions (double-brace blocks whose name starts with '#', like #if) are skipped.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of templates.
function WikitextParser.getTemplates( wikitext )
	local templates = {}
	for template in wikitext:gmatch( '{%b{}}' ) do
		-- Inspect the matched block itself, not the whole wikitext, so each
		-- parser function is skipped individually wherever it appears
		if not template:match( '^{{%s*#' ) then
			table.insert( templates, template )
		end
	end
	return templates
end

-- Get the requested template from the given wikitext.
-- If the template appears more than once, only the first instance will be returned
-- Names are compared after trimming and upper-casing the first letter.
-- @param wikitext Required. Wikitext to parse.
-- @param name Required. Name of the template to get
-- @return Wikitext of the template, or nil if it wasn't found
function WikitextParser.getTemplate( wikitext, name )
	-- getContentLanguage is a function: it must be called to obtain the language object
	local lang = mw.language.getContentLanguage()
	local trimmedName = mw.text.trim( name )
	local wanted = lang:ucfirst( trimmedName )

	for _, template in ipairs( WikitextParser.getTemplates( wikitext ) ) do
		local templateName = template:match( '^{{ *([^}|\n]+)' )
		-- templateName is nil when the braces are followed directly by '|', '}' or a newline
		if templateName then
			-- Trim so "{{Foo |bar}}" still matches "Foo"
			templateName = mw.text.trim( templateName )
			if lang:ucfirst( templateName ) == wanted then
				return template
			end
		end
	end
end

-- Get the parameters from the given template.
-- Unnamed parameters are keyed by their 1-based position (a number); named
-- parameters are keyed by their trimmed name (a string). Values are trimmed.
-- @param template Required. Template wikitext to parse.
-- @return Map from parameter name to parameter value
function WikitextParser.getParameters( template )
	local parameters = {}
	local params = template:match( '{{[^|}]-|(.*)}}' )
	if not params then
		return parameters
	end

	-- Temporarily mask pipes and equals signs inside subtemplates and links so
	-- they aren't mistaken for parameter separators. A function replacement is
	-- inserted literally by gsub, so no '%' escaping is needed.
	local masks = { ['|'] = '@@:@@', ['='] = '@@_@@' }
	local function mask( fragment )
		return ( fragment:gsub( '[|=]', masks ) )
	end
	local function unmask( text )
		return ( text:gsub( '@@_@@', '=' ):gsub( '@@:@@', '|' ) )
	end
	params = params:gsub( '{%b{}}', mask ) -- subtemplates
	params = params:gsub( '%[%b[]%]', mask ) -- links like [[Target|label]]

	local count = 0 -- number of unnamed parameters seen so far
	for param in mw.text.gsplit( params, '|', true ) do
		local parts = mw.text.split( param, '=', true )
		local name = mw.text.trim( parts[1] )
		local value
		if #parts == 1 then
			-- No '=' at all: an unnamed parameter
			count = count + 1
			value = name
			name = count
		else
			-- Everything after the first '=' belongs to the value
			value = mw.text.trim( table.concat( parts, '=', 2 ) )
			name = unmask( name )
		end
		parameters[ name ] = unmask( value )
	end
	return parameters
end

-- Get the tags from the given wikitext.
-- Every opening tag found yields one entry, so a tag nested inside another
-- is returned both as part of its parent and on its own.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tags.
function WikitextParser.getTags( wikitext )
	local tags = {}

	-- '()' captures the position of each opening tag; '[^/]' skips closing tags
	for tagStart, tagOpen in wikitext:gmatch( '()(<[^/].->)' ) do
		local tagName = tagOpen:match( '< ?(.-)[ >]' )
		-- The name is embedded in patterns below, so escape magic characters like '-'
		local namePattern = escapeString( tagName )
		-- The trailing '()' returns the position just past the closing tag
		local closePattern = '</ ?' .. namePattern .. ' ?>()'
		local tag

		-- Self-closing syntax ("/>"), or a void element written without it (br, hr)
		if tagOpen:match( '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tag = tagOpen

		-- Tags that may contain others of the same name (div, span): walk the
		-- nested openings and closings until the depth returns to zero
		elseif tagName == 'div' or tagName == 'span' then
			local openPattern = '()< ?' .. namePattern .. '[ >]'
			local position = tagStart + #tagOpen - 1
			local depth = 1
			local tagEnd
			while depth > 0 do
				tagEnd = wikitext:match( closePattern, position )
				if not tagEnd then
					break -- unclosed tag
				end
				tagEnd = tagEnd - 1 -- index of the last character of the closing tag
				-- Next opening tag of the same name, or just past this closing tag if none
				position = wikitext:match( openPattern, position + 1 ) or ( tagEnd + 1 )
				if position > tagEnd then
					depth = depth - 1 -- the closing tag came first: one level closed
				else
					depth = depth + 1 -- another opening came first: one level deeper
				end
			end
			-- An unclosed tag extends to the end of the text
			tag = wikitext:sub( tagStart, tagEnd or #wikitext )

		-- Any other tag is assumed not to contain others of the same name, so it
		-- ends at the first matching closing tag
		else
			local tagEnd = wikitext:match( closePattern, tagStart )
			if tagEnd then
				tag = wikitext:sub( tagStart, tagEnd - 1 )
			end
			-- With no closing tag the match probably wasn't a tag at all; skip it
		end

		if tag then
			table.insert( tags, tag )
		end
	end
	return tags
end

-- Get the gallery tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of gallery tags, in the order getTags returns them.
function WikitextParser.getGalleries( wikitext )
	local galleries = {}
	-- ipairs walks the sequence in index order; pairs gives no ordering guarantee
	for _, tag in ipairs( WikitextParser.getTags( wikitext ) ) do
		if tag:match( '< ?(.-)[ >]' ) == 'gallery' then
			table.insert( galleries, tag )
		end
	end
	return galleries
end

-- Get the ref tags from the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of ref tags, in the order getTags returns them.
function WikitextParser.getReferences( wikitext )
	local references = {}
	-- ipairs walks the sequence in index order; pairs gives no ordering guarantee
	for _, tag in ipairs( WikitextParser.getTags( wikitext ) ) do
		if tag:match( '< ?(.-)[ >]' ) == 'ref' then
			table.insert( references, tag )
		end
	end
	return references
end

-- Get the tables from the given wikitext.
-- A table is a balanced brace block that starts a line with '{|'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tables.
function WikitextParser.getTables( wikitext )
	local tables = {}
	-- Prefix a newline so a table on the very first line is matched too
	for candidate in ( '\n' .. wikitext ):gmatch( '\n%b{}' ) do
		if candidate:sub( 1, 3 ) == '\n{|' then
			-- trimming drops the leading newline that was part of the match
			tables[ #tables + 1 ] = mw.text.trim( candidate )
		end
	end
	return tables
end

-- Get the id from the given table wikitext
-- Only the first line of the table (the one starting with '{|') is inspected.
-- @param t Required. Wikitext of the table to parse.
-- @return Id of the table or nil if not found
function WikitextParser.getTableId( t )
	-- '{|', anything on the same line, then id = value with optional quotes
	local idPattern = '^{|[^\n]-id *= *["\']?([^"\'\n]+)["\']?[^\n]*\n'
	local id = string.match( t, idPattern )
	return id
end

-- Get a table by id from the given wikitext
-- Tables are checked in the order getTables returns them; the first match wins.
-- @param wikitext Required. Wikitext to parse.
-- @param id Required. Id of the table
-- @return Wikitext of the table or nothing if not found
function WikitextParser.getTableById( wikitext, id )
	local candidates = WikitextParser.getTables( wikitext )
	for index = 1, #candidates do
		local candidate = candidates[ index ]
		if WikitextParser.getTableId( candidate ) == id then
			return candidate
		end
	end
end

-- Get the data from the given table wikitext
-- @param tableWikitext Required. Wikitext of the table to parse.
-- @return Table data: a sequence of rows, each row a sequence of trimmed cell wikitext
-- @todo Test and make more robust
function WikitextParser.getTableData( tableWikitext )
	-- Cleanup applied to the whole table, as { pattern, replacement } pairs.
	-- The rules are applied in exactly this order.
	local tableRules = {
		{ '^{|.-\n', '' }, -- remove the header
		{ '\n|}$', '' }, -- remove the footer
		{ '^|%+.-\n', '' }, -- remove any caption
		{ '|%-.-\n', '|-\n' }, -- remove any row attributes
		{ '^|%-\n', '' }, -- remove any leading empty row
		{ '\n|%-$', '' }, -- remove any trailing empty row
	}
	-- Rewrites applied to each row so that every cell separator becomes '\n|'.
	-- Also applied in exactly this order.
	local rowRules = {
		{ '||', '\n|' },
		{ '!!', '\n|' },
		{ '\n!', '\n|' },
		{ '^!', '\n|' },
		{ '^\n|', '' }, -- drop the separator in front of the first cell
	}

	local body = mw.text.trim( tableWikitext )
	for _, rule in ipairs( tableRules ) do
		body = string.gsub( body, rule[1], rule[2] )
	end

	local tableData = {}
	for row in mw.text.gsplit( body, '|-', true ) do
		for _, rule in ipairs( rowRules ) do
			row = string.gsub( row, rule[1], rule[2] )
		end
		local rowData = {}
		for cell in mw.text.gsplit( row, '\n|' ) do
			rowData[ #rowData + 1 ] = mw.text.trim( cell )
		end
		tableData[ #tableData + 1 ] = rowData
	end
	return tableData
end

-- Get the internal links from the given wikitext (includes category and file links).
-- An internal link is a balanced bracket block wrapped in a second pair of brackets.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of internal links.
function WikitextParser.getLinks( wikitext )
	local found = {}
	for internalLink in wikitext:gmatch( '%[%b[]%]' ) do
		found[ #found + 1 ] = internalLink
	end
	return found
end

-- Get the file links from the given wikitext.
-- A link counts as a file link when the text before the first colon of its
-- target resolves, through mw.site.namespaces, to the 'File' namespace.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of file links.
function WikitextParser.getFiles( wikitext )
	local files = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Capture up to the FIRST colon and never past '|' or ']', so a colon in
		-- a caption or option isn't mistaken for the namespace separator
		local namespace = link:match( '^%[%[ *([^:|%]]-) *:' )
		local info = namespace and mw.site.namespaces[ namespace ]
		if info and info.canonicalName == 'File' then
			table.insert( files, link )
		end
	end
	return files
end

-- Get the category links from the given wikitext.
-- A link counts as a category link when the text before the first colon of its
-- target resolves, through mw.site.namespaces, to the 'Category' namespace.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of category links.
function WikitextParser.getCategories( wikitext )
	local categories = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Capture up to the FIRST colon and never past '|' or ']', so a colon in
		-- a sort key isn't mistaken for the namespace separator
		local namespace = link:match( '^%[%[ *([^:|%]]-) *:' )
		local info = namespace and mw.site.namespaces[ namespace ]
		if info and info.canonicalName == 'Category' then
			table.insert( categories, link )
		end
	end
	return categories
end

-- Get the external links from the given wikitext.
-- A bracketed block counts as an external link when its target starts with
-- '//', 'http://' or 'https://'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of external links.
function WikitextParser.getExternalLinks( wikitext )
	local externalLinks = {}
	for candidate in wikitext:gmatch( '%b[]' ) do
		local isExternal = candidate:match( '^%[//' ) or candidate:match( '^%[https?://' )
		if isExternal then
			externalLinks[ #externalLinks + 1 ] = candidate
		end
	end
	return externalLinks
end

-- Return the module table holding the parser functions
return WikitextParser