User:Gdr/taxoconvert.py


#!/usr/bin/python
# -*- encoding:utf-8 -*-
#
# taxoconvert.py -- convert multi-template taxoboxes to single template

import codecs
import getopt
import os
import pickle
import re
import sys
import tempfile

import wikipedia

# Module-wide settings.  main() overrides checks/edit/debug from the
# command line; the other functions only read them.
site = wikipedia.Site('en')  # the wiki every Page in this script belongs to
checks = True                # show a diff and ask before saving (see check())
edit = False                 # open the page text in $EDITOR before parsing
debug = False                # make record()/parse_nomial() log what they see

class Error(Exception):
    """A taxobox that could not be converted.

    The human-readable reason is kept in the ``text`` attribute and is
    also what ``str()`` returns.
    """

    def __init__(self, text):
        # Initialise the base class as well so that ``args`` is populated.
        Exception.__init__(self, text)
        self.text = text

    def __str__(self):
        return self.text

class NoError(Error):
    """An Error that callers catch separately and do not report.

    parse() raises it when a page contains no taxobox at all; it is
    caught ahead of the generic Error handler in convertmany().
    """

def edittext(s):
    """Let the user edit the text `s` in an external editor.

    `s` is written (as UTF-8) to a temporary file, the program named by
    the EDITOR environment variable (default ``vi``) is run on that file,
    and the file's contents afterwards are returned.  The temporary file
    is removed before returning.
    """
    # mkstemp creates the file atomically, unlike the race-prone mktemp.
    fd, fn = tempfile.mkstemp()
    os.close(fd)
    try:
        f = codecs.open(fn, 'w', 'utf-8')
        try:
            f.write(s)
        finally:
            f.close()
        # EDITOR may carry its own arguments, so it goes through the shell.
        os.system('%s "%s"' % (os.getenv('EDITOR', 'vi'), fn))
        f = codecs.open(fn, 'r', 'utf-8')
        try:
            return f.read()
        finally:
            f.close()
    finally:
        os.remove(fn)

def canonize(s):
    """Return `s` lower-cased with every non-alphanumeric character removed.

    Used to compare strings while ignoring case, spacing, punctuation and
    wiki markup, e.g. ``canonize("''See text.''") == 'seetext'``.
    """
    return ''.join([c for c in s if c.isalnum()]).lower()

def check(text, newtext):
    """Ask the user to approve replacing `text` with `newtext`.

    When the module-level ``checks`` flag is off, `newtext` is returned
    without asking.  Otherwise the diff is shown and the user is prompted
    until they answer:

    * ``y`` -- accept: return `newtext`;
    * ``e`` -- edit `newtext` in the external editor, then ask again;
    * ``q`` -- quit: raise IOError;
    * anything else -- reject: return None.
    """
    if not checks:
        return newtext
    while True:
        wikipedia.showDiff(text, newtext)
        answer = wikipedia.input(u'OK? [yNeq]')
        if answer == 'q':
            raise IOError
        elif answer == 'y':
            return newtext
        elif answer == 'e':
            newtext = edittext(newtext)
        else:
            return None

def record(params, key, value):
    """Store one taxobox parameter in `params`.

    `params` is the dictionary built by parse(); its ``'sequence'`` entry
    lists the keys in the order they were recorded.  A false `value`
    (None or an empty string) is ignored.

    Raises Error if `key` has already been recorded.
    """
    if debug:
        wikipedia.output(u"%s = %s" % (key, value))
    if key in params:
        raise Error(u"Duplicate key %s" % key)
    # NOTE(review): the nesting of the assignment under ``if value`` is
    # inferred from a copy whose indentation was lost -- confirm.
    if value:
        params['sequence'].append(key)
        params[key] = value

# parse_nomial(suffix, n, lines, params) -- try to recognise one
# {{taxobox section binomial ...}} or {{taxobox section trinomial ...}}
# template starting at lines[n].
#
# Each pattern below is matched against lines[n] (one "botany" variant is
# also tried against lines[n] + lines[n+1]).  On a match the name is passed
# to record() under the matched word (binomial/trinomial) with `suffix`
# appended, and the authority, where the template carries one, under that
# same key plus '_authority'.  The `found` flag stops later patterns from
# recording a second time.  The final pattern records the range_map keys
# when the line also mentions "range" or "distribution".
#
# Returns the tuple (n, orig_n != n): the index after whatever was
# consumed, and whether anything was consumed at all.
#
# NOTE(review): this copy of the function looks damaged -- statements are
# run together on single lines and the last (image) pattern starts with an
# unmatched r'\)'.  Restore it from a pristine copy before relying on it.
def parse_nomial(suffix, n, lines, params): if debug: wikipedia.output(u"parse_nomial: suffix = '%s', lines[n] = %s" % (suffix, lines[n])) orig_n = n   found = False

# Name only (simple/botany/parens variants with no author or date).
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                r'(?:[ _]+(?:simple|botany|parens))? *\| *'                 r'color *= *[a-z]+ *\| *'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m:       record(params, m.group(1) + suffix, "%s" % m.group(2)) n += 1 found = True

# Name, author and date: authority is recorded as "author, date".
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                r'(?:[ _]+part)? *\| *'                 r'(?:color *= *[a-z]+ *\| *)?'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                 r'author *= *([^\}]*[^\} ]) *\| *'                 r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) record(params, '%s%s_authority' % (m.group(1), suffix),              '%s, %s' % (m.group(3), m.group(4))) n += 1 found = True

# Name and author with an empty date: authority is the author alone.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial) *\| *'                r'color *= *[a-z]+ *\| *'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                 r'author *= *([^\}]*[^\} ]) *\| *'                 r'date *= *}}$', lines[n]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3)) n += 1 found = True

# Empty author and empty date: only the name is recorded.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                r'(?:[ _]+(?:parens|botany|simple))? *\| *'                 r'color *= *[a-z]+ *\| *'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                 r'author *= *\| *'                 r'date *= *}}$', lines[n]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) n += 1 found = True

# "parens" variant: authority is recorded as "(author, date)".
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                r'[ _]+parens(?:[ _]+part)? *\| *'                 r'(?:color *= *[a-z]+ *\| *)?'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                 r'author *= *([^\}]*[^\} ]) *\| *'                 r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) record(params, '%s%s_authority' % (m.group(1), suffix),              '(%s, %s)' % (m.group(3), m.group(4))) n += 1 found = True

# "botany" variant: author (possibly empty) and no date.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                r'[ _]+botany *\| *'                 r'color *= *[a-z]+ *\| *'                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                 r'author *= *([^\}]*[^\} ]|) *}}$', lines[n]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3)) n += 1 found = True

# Same "botany" pattern, tried on two joined lines; consumes both.
if n + 1 < len(lines): m = re.match(r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'                    r'[ _]+botany *\| *'                     r'color *= *[a-z]+ *\| *'                     r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'                     r'author *= *([^\}]*[^\} ]|) *}}$', lines[n] + lines[n+1]) if m and not found: record(params, m.group(1) + suffix, "%s" % m.group(2)) record(params, '%s%s_authority' % (m.group(1), suffix), m.group(3)) n += 2 found = True

# Optional range/distribution map image following the name.
# NOTE(review): the pattern below begins with an unmatched r'\)' -- part of
# it appears to be missing.
m = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'                r'\) *' r'(?:\| *([0-9]+px))?(?:\|[^\]]*)?\]\] *\| *' r'caption *= *([^\}]*[^\} ]|) *}}$', lines[n])   if m and re.search(r'(?i)(?:range|distribution)', lines[n]):        record(params, 'range_map%s' % suffix, m.group(1))        record(params, 'range_map%s_width' % suffix, m.group(2))        record(params, 'range_map%s_caption' % suffix, m.group(3))        n += 1

return (n, orig_n != n)

# parse(text, linkname) walks the multi-template taxobox found in `text`
# one template at a time, passing every recognised parameter to record().
# The result is the `params` dictionary; its 'sequence' entry lists the
# recorded keys in order.
#
# Raises Error when the taxobox cannot be understood (duplicate begin/end
# templates, a missing or unexpected template, an unsupported entry) and
# NoError when `text` contains no taxobox at all; in the latter case
# `linkname` is also marked in the global `done` dictionary.
#
# NOTE(review): this copy of the function looks damaged.  Statements are
# run together on single lines, several patterns are empty apart from
# their flags (r'(?i)', r'(?i)$'), the image patterns contain an unmatched
# r'\)', and messages such as "Expected : %s" have a gap where text seems
# to be missing.  Restore it from a pristine copy before relying on it.
def parse(text, linkname): """parse(text, linkname) -- parse multi-template taxobox from 'text' and   return it as a dictionary suitable for constructing a taxobox    template."""

# Start with an empty result and strip trailing spaces, tabs and carriage
# returns from every line of the page text.
params = {'sequence': []} text = re.sub(r'(?m)[ \t\r]+$', '', text)

# Refuse pages with more than one {{taxobox begin}} or {{taxobox end}},
# locate the taxobox, and split it into `lines`.
if 1 < len(re.findall(r'(?i){{taxobox[ _]+begin *\|', text)): raise Error(u"Two occurrences of {{taxobox begin}}.") if 1 < len(re.findall(r'(?i){{taxobox[ _]+end *}}', text)): raise Error(u"Two occurrences of {{taxobox end}}.") m = re.search(r'(?is){{taxobox[ _]+begin.*{{taxobox[ _]+end *}}', text) if not m:       global done done[linkname] = True raise NoError(u"Can't find taxobox.") lines = re.split(r'(?: *(?:(?= *(?:{{|<))|\n) *)+', m.group(0)) n = 0

# {{taxobox begin}}: color and name, accepted in either parameter order.
m1 = re.match(r'(?i){{taxobox[ _]+begin *\| *color *= *([a-z]+) *\| *'                 'name *= *(.*[^ ]) *}}[ \t]*(?: *)?$', lines[n]) m2 = re.match(r'(?i){{taxobox[ _]+begin *\| *name *= *(.*[^ ]) *\| *'                 'color *= *([a-z]+) *}}[ \t]*(?: *)?$', lines[n]) if m1: record(params, 'color', m1.group(1)) record(params, 'name', m1.group(2)) n += 1 elif m2: record(params, 'color', m2.group(2)) record(params, 'name', m2.group(1)) n += 1 else: raise Error(u"Can't find {{taxobox begin}}: %s" % lines[n])

# Optional conservation status template, kept verbatim inside {{...}}.
m = re.match(r'(?i){{(?:template:)?(status[^\}]+)}}', lines[n]) if m:       record(params, 'status', '{{%s}}' % m.group(1)) n += 1

# Optional "Fossil range:" / "Fossil record:" line.
m = re.match(r'(?i)(?: *)?fossil +(?:range|record): +([^<\n]*[^<\n ]) *'                r'(?: )?', lines[n]) if m:       record(params, 'fossil_range', m.group(1)) n += 1

if re.match(r'(?i)', lines[n]): n += 1

# Image template: image name, optional width in px, and caption.
image_re = (r'(?i){{taxobox[ _]+image *\| *image *= *'               r'\) *' r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *\| *' r'caption *= *([^\}]*[^\} ]|) *}}$')

# First image: on one line (m1), spread over two lines (m2), or without a
# caption (m3).
m1 = re.match(image_re, lines[n]) m2 = re.match(image_re, lines[n] + lines[n+1]) m3 = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'                 r'\) *' r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *}}$', lines[n])   if m1:        record(params, 'image', m1.group(1))        record(params, 'image_width', m1.group(2))        record(params, 'image_caption', m1.group(3))        n += 1    elif m2:        record(params, 'image', m2.group(1))        record(params, 'image_width', m2.group(2))        record(params, 'image_caption', m2.group(3))        n += 2    elif m3:        record(params, 'image', m3.group(1))        record(params, 'image_width', m3.group(2))        n += 1

# Optional second image.
m = re.match(image_re, lines[n]) if m:       record(params, 'image2', m.group(1)) record(params, 'image2_width', m.group(2)) record(params, 'image2_caption', m.group(3)) n += 1

# Skip image templates that are empty or whose image was removed.
if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *\| *caption *= *}}$',               lines[n]): n += 1 if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *(?:|\|.*)}}$',               lines[n]): n += 1 if re.match(r'(?i){{taxobox[ _]+image.*(?:Image with unknown copyright status removed|Unsourced image removed)', lines[n]): n += 1 if re.match(r'(?i)', lines[n]): n += 1 if re.match(r'(?is)', lines[n] + lines[n+1]): n += 2

# {{taxobox begin placement}} is mandatory: raise Error if it is missing.
m = re.match(r'(?i){{taxobox[ _]+begin[ _]+placement *\| *'                r'color *= *[a-z]+ *}}$', lines[n]) if not m:       raise Error(u"Can't find {{taxobox begin placement}}: %s"                          % lines[n]) n += 1

# One {{taxobox <rank> entry}} per iteration; the taxon is recorded under
# <rank> and any authority that follows it under '<rank>_authority'.
while n < len(lines): m0 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'                     r'taxon *= *([^\}]*[^\} ]) *'                      r' (.*)}}$', lines[n] + lines[n+1]) if m0: record(params, m0.group(1), m0.group(2)) record(params, m0.group(1) + '_authority', m0.group(3)) n += 2 continue

m1 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'                     r'taxon *= *([^\}]*[^\} ]) *}}(?:)?$', lines[n]) if not m1: break record(params, m1.group(1), m1.group(2)) n += 1 m2 = re.match(r'(?i){{taxobox[ _]+authority *\| *'                     r'author *= *([^\}]*[^\} ]) *\| *'                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m2: record(params, m1.group(1) + '_authority',                  '%s, %s' % (m2.group(1), m2.group(2))) n += 1 continue m3 = re.match(r'(?i){{taxobox[ _]+authority[ _]+parens *\| *'                     r'author *= *([^\}]*[^\} ]) *\| *'                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m3: record(params, m1.group(1) + '_authority',                  '(%s, %s)' % (m3.group(1), m3.group(2))) n += 1 continue m4 = re.match(r'(?i){{taxobox[ _]+authority[ _]+(?:new|botany)? *\| *'                     r'author(?:ity)? *= *([^\}]*[^\} ]) *}}$', lines[n]) if m4: record(params, m1.group(1) + '_authority', m4.group(1)) n += 1 continue m5 = re.match(r'(?i) *(.*[^ ]) *(?:)?', lines[n]) if m5: record(params, m1.group(1) + '_authority', m5.group(1)) n += 1 continue

m = re.match(r'(?i)$', lines[n]) if not m:       raise Error(u"Expected : %s"                          % lines[n]) n += 1

# Up to four binomial/trinomial sections, recorded with the key suffixes
# '', '2', '3' and '4'; stop at the first one that is not found.
n, found = parse_nomial('', n, lines, params) if found: n, found = parse_nomial('2', n, lines, params) if found: n, found = parse_nomial('3', n, lines, params) if found: n, found = parse_nomial('4', n, lines, params)

# Optional type species section.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+type[ _]+species *\| *'                r'color *= *[a-z]+ *\| *'                 r'species *= *([^\}]*[^\} ]) *\| *'                 r'comment *= *([^\}]*[^\} ]|) *}}$', lines[n]) if m:       record(params, 'type_species', "%s" % m.group(1)) record(params, 'type_species_authority', m.group(2)) n += 1

if re.match(r'(?i)', lines[n]): n += 1

# Synonym entries (simple, botany, or with author and date) are collected
# and recorded as a single 'synonyms' value.
m = re.match(r'(?i)$', lines[n]) if m:       n += 1 syn = [] while 1: m1 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+simple'                         r' *\| *binomial_name *= *([^\}]*[^\} ]) *}}$',                          lines[n]) m2 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+botany'                         r' *\| *binomial_name *= *([^\}]*[^\} ]) *\| *'                          r'author *= *([^\}]*[^\} ]) *}}$',                          lines[n]) m3 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry *\| *'                        r'binomial_name *= *([^\|\}]*[^\|\} ]) *\| *'                         r'author *= *([^\}]*[^\} ]) *\| *'                         r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n]) if m1: syn.append("%s" % m1.group(1)) elif m2: syn.append("%s %s "                          % (m2.group(1), m2.group(2))) elif m3: syn.append("%s %s, %s "                          % (m3.group(1), m3.group(2), m3.group(3))) else: break n += 1 record(params, 'synonyms', ' '.join(syn)) m = re.match(r'(?i)$', lines[n]) if not m:           raise Error(u"Expected  but found: %s"                        % lines[n]) n += 1

# The binomial/trinomial section may also come after the synonyms.
if not params.has_key('binomial') and not params.has_key('trinomial'): n, found = parse_nomial('', n, lines, params)

# Optional diversity section.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+diversity *\| *'                r'color *= *[a-z]+ *\| *'                 r'link *= *([^\}]*[^\} ]) *\| *'                 r'diversity *= *([^\}]*[^\} ]) *}}$', lines[n]) if m:       record(params, 'diversity', m.group(2)) record(params, 'diversity_link', m.group(1)) n += 1

# Subdivision section (color and plural_taxon in either order): every line
# up to the next {{taxobox ...}} template becomes the 'subdivision' value.
m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'                r'color *= *[a-z]+ *\| *'                 r'plural_taxon *= *([^\}]*[^\} ]) *}}$', lines[n]) if not m:       m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'                     r'plural_taxon *= *([^\}]*[^\} ]) *\| *'                     r'color *= *[a-z]+ *}}$', lines[n]) if m:       record(params, 'subdivision_ranks', m.group(1)) n += 1 m = n       while not re.match(r'(?i){{taxobox', lines[n]): n += 1 record(params, 'subdivision', '\n' + '\n'.join(lines[m:n]))

if re.match(r'(?i)', lines[n]): n += 1 if n + 1 < len(lines) and re.match(r'(?i)', lines[n] + lines[n+1]): n += 2

# Anything other than {{taxobox end}} here is reported as unrecognized.
m = re.match(r'(?i){{taxobox[ _]+end *}}$', lines[n]) if not m:       raise Error(u"Unrecognized line: %s" % lines[n])

# Some other checks if params.has_key('norank'): raise Error(u"Can't handle {{taxobox norank entry}}, sorry.") if params.has_key('unranked'): raise Error(u"Can't handle {{taxobox unranked entry}}, sorry.")

# Fix some simple mistakes. if (params.has_key('genus') and params.has_key('name')       and params['genus'] == "%s" % params['name']): params['name'] = "%s" % params['name'] if (params.has_key('binomial') and params.has_key('name')       and params['binomial'] == "%s" % params['name']): params['name'] = "%s" % params['name'] if (params.has_key('trinomial') and params.has_key('name')       and params['trinomial'] == "%s" % params['name']): params['name'] = "%s" % params['name'] if (params.has_key('image_caption')       and canonize(params['image_caption'])        in (canonize(params.get('name', '')), canonize(params.get('binomial', '')), canonize(params.get('trinomial', '')), canonize(params.get('genus', '')) + 'sp', canonize(params.get('name', ) + params.get('binomial', )), )):       del params['image_caption'] if params.has_key('binomial_authority'): params['binomial_authority'] = re.sub(r',,', ',',                                             params['binomial_authority']) if params.has_key('trinomial_authority'): params['trinomial_authority'] = re.sub(r',,', ',',                                             params['trinomial_authority']) if params.has_key('genus') and re.match(r".*$", params['genus']): params['genus'] = params['genus'][3:-3] if params.has_key('name'): m = re.match(r" *(.*[^ ]) * $", params['name']) if m:           params['name'] = m.group(1) if params.has_key('subdivision_ranks'): m = re.match(r" *(.*[^ ]) * $", params['subdivision_ranks']) if m:           params['subdivision_ranks'] = m.group(1) if params.has_key('genus') and re.match(r"(''')?[^']+\1$", params['genus']): params['genus'] = "%s" % params['genus'] if params.has_key('species') and re.match(r"(''')?[^']+\1$", params['species']): params['species'] = "%s" % params['species'] if params.has_key('subspecies') and re.match(r"(''')?[^']+\1$", params['subspecies']): params['subspecies'] = "%s" % params['subspecies'] if params.has_key('species') and params.has_key('binomial') and 
re.match(r"[^']+$", params['species']): params['species'] = "%s" % params['species'] if params.has_key('subspecies') and params.has_key('trinomial') and re.match(r"[^']+$", params['subspecies']): params['subspecies'] = "%s" % params['subspecies'] if params.has_key('subdivision') and canonize(params['subdivision']) == 'seetext': params['subdivision'] = '\nSee text.' if (params.has_key('binomial') and params.has_key('species')       and re.match("[^']*$", params['species'])): m = re.match(r"'*([A-Z])[a-z-]* ([a-z-]*)'*", params['binomial']) if m:           params['species'] = "%s. %s" % (m.group(1), m.group(2)) if (params.has_key('trinomial') and params.has_key('subspecies')       and re.match(".*$", params['subspecies'])): m = re.match(r"'*([A-Z])[a-z-]* ([a-z])[a-z-]* ([a-z][a-z-]*)'*", params['trinomial']) if m:           params['subspecies'] = "%s. %s. %s" % (m.group(1), m.group(2), m.group(3))

return params

def convert(pl):
    """Convert the multi-template taxobox on page `pl` to a single {{Taxobox}}.

    The page text is fetched (and, when the module-level ``edit`` flag is
    set, first passed through the external editor), parsed with parse(),
    and everything from ``{{taxobox begin`` to ``{{taxobox end}}`` is
    replaced by one ``{{Taxobox ...}}`` call listing the recorded
    parameters in their original order.  The user confirms the change via
    check(); if accepted, the page is saved and, when the save returns an
    empty `data` field, the page title is marked in the global ``done``
    dictionary.

    Exceptions raised by parse() (Error, NoError) propagate to the caller.
    """
    global done
    original = pl.get()
    text = original
    if edit:
        text = edittext(text)
    title = pl.title()
    params = parse(text, title)

    # Keys deleted from params after being recorded are still listed in
    # 'sequence', so skip any key that is no longer present.
    fields = ''.join(['| %s = %s\n' % (key, params[key])
                      for key in params['sequence'] if key in params])
    taxobox = '{{Taxobox\n' + fields + '}}'

    # A callable replacement is inserted literally; a plain string would
    # have backslashes in the page-derived text treated as regex escapes.
    newtext = re.sub(r'(?is){{taxobox[ _]+begin *\|.*{{taxobox[ _]+end *}}',
                     lambda match: taxobox, text)

    newtext = check(original, newtext)
    if newtext:
        status, reason, data = pl.put(newtext, u'nomialbot — converted multi-template taxobox to {{Taxobox}}')
        if data == '':
            done[title] = True

def convertmany():
    """Convert every page in the global ``linknames`` list, from index ``n`` on.

    Pages already marked in ``done`` are skipped.  Page texts are
    pre-fetched in batches of up to 50 not-yet-done pages.  Pages outside
    namespace 0 are marked done without being converted.  Locked pages,
    redirects and conversion errors are reported and skipped; redirects
    are also marked done.  The global ``n`` is advanced after every page,
    which is what main() saves as the resume position.
    """
    global site, n, linknames, done
    pages = [wikipedia.Page(site, name) for name in linknames]
    fetched = set()
    while n < len(linknames):
        name = linknames[n]
        try:
            if not done.get(name):
                if name not in fetched:
                    tofetch = [p for p in pages[n:]
                               if not done.get(p.title())][:50]
                    wikipedia.getall(site, tofetch)
                    fetched.update([p.title() for p in tofetch])
                wikipedia.output("Trying %s" % name)
                pl = pages[n]
                if pl.namespace() != 0:
                    done[pl.title()] = True
                    wikipedia.output(u"%s not in main namespace" % pl.title())
                else:
                    convert(pl)
        except wikipedia.LockedPage:
            wikipedia.output("%s is locked" % name)
        except wikipedia.IsRedirectPage:
            wikipedia.output("%s is redirect" % name)
            done[name] = True
        except NoError:
            # Nothing to convert on this page; parse() already marked it.
            pass
        except Error:
            # sys.exc_info() keeps this valid on every Python version.
            err = sys.exc_info()[1]
            wikipedia.output(u'***' + err.text)
        n += 1

def main():
    """Command-line entry point.

    Options:
      -n, --no-checks     save without showing a diff and asking
      -e, --edit          edit each page text in $EDITOR before parsing
      -d, --debug         log every recorded parameter
      -o N, --offset=N    start at index N of the page list
      -r P, --reload=P    rebuild the page list from the pages that
                          reference page P

    Remaining arguments are page titles to convert individually; with
    none, the whole saved page list is processed.  Progress (position,
    page list, done map) is loaded from ``taxoconvert.db`` and written
    back on exit, even when the run is interrupted.
    """
    global checks, edit, debug
    global n, linknames, done
    offset = None
    reload_from = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'r:dneo:',
                                   ['reload=', 'debug', 'no-checks',
                                    'edit', 'offset='])
        for o, a in opts:
            if o in ('-n', '--no-checks'):
                checks = False
            elif o in ('-o', '--offset'):
                offset = int(a)
            elif o in ('-e', '--edit'):
                edit = True
            elif o in ('-d', '--debug'):
                debug = True
            elif o in ('-r', '--reload'):
                reload_from = a
            else:
                print("Bad option: %s" % o)
                return
    except getopt.GetoptError:
        print("Bad command line")
        return

    done = {}
    try:
        # NOTE: unpickling can execute arbitrary code, so taxoconvert.db
        # must only ever be a file written by this script.
        f = open('taxoconvert.db', 'rb')
        try:
            n, linknames, done = pickle.load(f)
        finally:
            f.close()
        if reload_from:
            # Deliberately fall into the rebuild path below.
            raise IOError
    except IOError:
        if not reload_from:
            # No saved state and no page to build the list from.
            print("Can't read taxoconvert.db; use --reload=PAGE to build the page list")
            return
        tb = wikipedia.Page(site, reload_from)
        linknames = [p.title() for p in tb.getReferences()]
        print("%d pages found" % len(linknames))
        n = 0

    try:
        if offset is not None:
            n = offset
        if args:
            for title in args:
                convert(wikipedia.Page(site, title))
        else:
            convertmany()
    finally:
        # Write to a side file first so a failed dump cannot leave a
        # truncated database behind.
        f = open('taxoconvert.db.new', 'wb')
        try:
            pickle.dump((n, linknames, done), f)
        finally:
            f.close()
        os.rename('taxoconvert.db.new', 'taxoconvert.db')

if __name__ == '__main__':
    try:
        main()
    finally:
        # Always tell the wikipedia framework that this bot has finished.
        wikipedia.stopme()