User:Gdr/yearbot.py


#!/usr/bin/python
#
#             YEARBOT.PY -- POPULATE BIRTHS/DEATHS IN YEAR
#
#                           Gdr, 2005-05-14
#                           Minor updates: User:Docu, 2006-12-17
#
#
# INTRODUCTION
#
# This script assists with the population of the "Births" and "Deaths"
# sections of an article about a year in the English wikipedia, using
# the articles in that year's births and deaths categories (for example
# "1543 births" and "1543 deaths").
#
#
# USAGE
#
# See User:Gdr/Yearbot
# requires User:Gdr/history.py
#
#
# DATA STRUCTURES
#
# An entry is a dictionary with these fields:
#
# article   Name of article.
# bdate     Date of birth, as a pair like ('April 17', '0417').
# byear     Birth year, as string like '1543'
# ddate     Date of death, as a pair like ('September 23', '0923').
# dyear     Death year, as string like '1602'
# exclude   1 if article is to be excluded from the page.
# intro     Introductory paragraph of article, if any is found.
# pagelink  wikipedia.Page object referring to article.
# post      String placed after the article link.
# pre       String placed before the article link.
# sort      Sort key, if any.
# desc      Description extracted from article (used as text for 'post'
#           if entry is new).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import getopt
import re
import sys

import catlib
import history
import wikipedia

class Year:
    """Robot-assisted updating of the births/deaths lists of a year page.

    An instance works on a single year article.  For each topic in
    ``topic_names`` it merges the entries already present in the
    matching section of the page with the members of the category
    "<year> <topic>", extracts dates and a short description from each
    article, lets the operator review the entries interactively and
    finally writes the sections back to the page.
    """

    site = wikipedia.Site('en')

    # List of regexp search-and-replace patterns that should be applied
    # to all descriptions.  Each item is a (search, replacement) pair.
    # __init__ replaces this with a per-instance list.
    patterns = []

    # The year we are working on, its Page, and the original text.
    year = None
    year_pl = None
    year_orig = None
    year_text = None

    # Category members with one of these titles are skipped.
    ignore = {
        'Special:Categories': 1,
    }

    # Matches a regexp pattern.
    pattern_re = r'/((?:[^\\]|\\.)*)/((?:[^\\]|\\.)*)/$'

    # File to store patterns.
    pattern_file = 'yearbot-patterns'

    # Matches a year in the range for which the script operates.
    year_re = r'1[0-9][0-9][0-9]'

    # Matches a trailing birth date.
    trail_born_re = re.compile(r'^(.*\S)\s*\(b(?:\.|orn)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a trailing death date.
    trail_died_re = re.compile(r'^(.*\S)\s*\(d(?:\.|ied)\s*\[?\[?('
                               + year_re + r')\]?\]?\)$')

    # Matches a month name.
    month_re = (r'January|February|March|April|May|June|'
                r'July|August|September|October|November|December')

    # Matches a date.  Groups: (month, day) for "April 17" or
    # (day, month) for "17 April"; see make_date.
    date_re = (r'\[?\[?(?:(' + month_re + r')\s+([0-9]+)|([0-9]+)\s*('
               + month_re + r'))\]?\]?')

    # Matches an entry starting with a date.  The en dash is spelled as
    # an escape so that this source file stays pure ASCII (a literal
    # non-ASCII character needs an encoding declaration in Python 2) and
    # so that the pattern is a unicode pattern containing U+2013.
    entry_date_re = re.compile(r'^\s*' + date_re
                               + r'\s*(?:-|' + u'\u2013' + r'|&mdash;)?'
                               + r'\s*(.*)$')

    # Matches an entry: captures pre, article, linktext, post.
    entry_re = re.compile(
        r'([^\[]*)\[\[([^\]|]+)(?:\|([^|\]]*))?\]\](.*)')

    # Matches the introductory paragraph of an article, once filled in
    # with birth year and death year.
    # NOTE(review): the ''' anchors around [^']+ were reconstructed; the
    # copy this was restored from had them stripped, which left the
    # birth-date group always empty.  Confirm against the original.
    intro1_re = (r"^.*'''[^']+'''(.*?)\[?\[?%s\]?\]?(.*?)\[?\[?%s\]?\]?"
                 r"\W*(.*)$")
    intro2_re = r"^.*'''[^']+'''[^\(]*\([^\)]+\)(.*)$"

    # Matches description.
    # NOTE(review): the wiki-link alternative \[\[[^\]]+\]\] was
    # reconstructed from a corrupted copy -- confirm.
    desc_re = (r'\s+(?:(?:the|an?)\s+)?'
               r'(([^,.!?\[]|\[\[[^\]]+\]\])+)[,.!?]')
    desc1_re = re.compile(r'\)\s*was' + desc_re)
    desc2_re = re.compile(r'\),' + desc_re)
    desc3_re = re.compile(r'\s+was' + desc_re)
    desc4_re = re.compile(r',' + desc_re)

    # Matches wiki-link: piped ([[target|text]]) and plain ([[target]]).
    # NOTE(review): link1_re was reconstructed from a corrupted copy.
    link1_re = re.compile(r'\[\[[^|\]]+\|([^|\]]+)\]\]')
    link2_re = re.compile(r'\[\[([^|\]]+)\]\]')

    # Approximate date?
    approx_re = re.compile(r'\bc(?:\.|a\.|irca)')

    # Matches the article's birth/death category link; captures the year
    # and the optional sort key.
    # NOTE(review): reconstructed -- the original patterns were corrupted
    # beyond recovery.  The category naming ("<year> births") is taken
    # from get_entries; confirm the exact original expressions.
    born_cat_re = re.compile(
        r'\[\[[Cc]ategory:\s*([0-9]+) births(?:\|([^|\]]+))?\]\]')
    died_cat_re = re.compile(
        r'\[\[[Cc]ategory:\s*([0-9]+) deaths(?:\|([^|\]]+))?\]\]')

    # Pattern for a template that carries birth year, death year and
    # sort key as groups 1, 2 and 3.
    # NOTE(review): the original pattern was lost (it survived only as an
    # empty string, which made parse_article fail on group(1)).  While
    # this is None the template lookup is skipped and the categories are
    # used instead; restore the real pattern here to re-enable it.
    template_re = None

    help_text = u"""
   h - Help
   l - List entries
   v - Preview changes to the page
   s - Save changes to the page
   q - Quit
   /<from>/<to>/ - Edit all entries and save pattern in file
   <n>p - Print entry <n>
   <n>i - Print introductory paragraph for entry <n>
   <n>t - Print whole article text for entry <n>
   <n>x - Exclude entry <n> (or include if already excluded)
   <n>d:<desc> - Update description for entry <n>
   <n>d<m> - Cut description for entry <n> to <m> words
   <n>P:<pre> - Update prefix text for entry <n>
   <n>/<from>/<to>/ - Edit entry <n> using regexp search-and-replace
"""

    comment = "yearbot - robot-assisted updating of births and deaths"
    topic_names = ['births', 'deaths']

    def __init__(self, year):
        """Set up the bot for ``year`` and load the saved patterns.

        A year that does not match ``year_re`` only produces a warning.
        A missing or unreadable pattern file is not an error: the bot
        simply starts with no patterns.
        """
        if not re.match(r'^' + self.year_re + r'$', year):
            print("%s doesn't look like a year" % year)
        self.year = year
        self.year_pl = wikipedia.Page(self.site, self.year)
        self.patterns = []
        try:
            f = open(self.pattern_file)
        except IOError:
            # No pattern file yet (e.g. first run).
            return
        try:
            for line in f:
                m = re.match(self.pattern_re, line)
                if m:
                    self.patterns.append(m.groups())
        finally:
            f.close()

    def save_patterns(self):
        """Write all patterns to ``pattern_file``, one /from/to/ per line."""
        try:
            f = open(self.pattern_file, 'w')
        except IOError:
            print("Couldn't write %s" % self.pattern_file)
            return
        try:
            for p in self.patterns:
                f.write(u'/%s/%s/\n' % (p[0], p[1]))
        finally:
            f.close()

    def apply_patterns(self):
        """Apply every pattern to each entry's 'post' (or else 'desc')."""
        for entries in self.topic_entries.values():
            for e in entries:
                for search, replacement in self.patterns:
                    if 'post' in e:
                        e['post'] = re.sub(search, replacement, e['post'])
                    elif 'desc' in e:
                        e['desc'] = re.sub(search, replacement, e['desc'])

    def unwikify(self, text):
        """Return ``text`` with wiki-links replaced by their visible text."""
        text = self.link1_re.sub(r'\1', text)
        text = self.link2_re.sub(r'\1', text)
        return text

    def make_date(self, m):
        """Turn a ``date_re`` match into a pair like ('April 17', '0417')."""
        month = m.group(1) or m.group(4)
        day = m.group(2) or m.group(3)
        return ('%s %s' % (month, day),
                '%02d%02d' % (history.months[month], int(day)))

    def _find_section(self, what, text):
        """Search ``text`` for the ==What== section; group 1 is its list."""
        return re.search(r'==\s*' + what.capitalize()
                         + r'\s*==\n((?:\s*\n|\*.*\n)*)', text)

    def parse_entries(self, what):
        """Parse the ``what`` section of the year page into entry dicts.

        Returns a list of entries; lines that contain no article link are
        reported and skipped.  Returns [] if the section is missing.
        """
        m = self._find_section(what, self.year_pl.get())
        if not m:
            print("No ==%s==" % what.capitalize())
            return []
        entries = []
        for line_orig in re.split(r'\s*\n\s*', m.group(1)):
            entry = {}
            line = re.sub(r'^\*\s*', '', line_orig)

            # Leading date, stored under a key that depends on the topic.
            m = self.entry_date_re.match(line)
            if m:
                date = self.make_date(m)
                if what == 'births':
                    entry['bdate'] = date
                elif what == 'deaths':
                    entry['ddate'] = date
                else:
                    entry['?date'] = date
                line = m.group(5)

            # Trailing "(born YYYY)" / "(died YYYY)".
            m = self.trail_born_re.match(line)
            if m:
                entry['byear'] = m.group(2)
                line = m.group(1)
            m = self.trail_died_re.match(line)
            if m:
                entry['dyear'] = m.group(2)
                line = m.group(1)

            m = self.entry_re.match(line)
            if m:
                entry['pre'] = m.group(1)
                entry['article'] = m.group(2)
                if m.group(3):
                    entry['linktext'] = m.group(3)
                entry['post'] = m.group(4)
                entries.append(entry)
            elif not re.match(r'^\s*$', line_orig):
                wikipedia.output(u"Couldn't parse %s" % line_orig)
        return entries

    def check_entry(self, entry, key, what, value):
        """Store ``value`` under ``key``, reporting any value it replaces.

        Does nothing when ``value`` is None.
        """
        if value is None:
            return
        if key in entry and entry[key] != value:
            wikipedia.output(u"%s '%s' fails to match '%s'; "
                             u"discarding the former."
                             % (what, entry[key], value))
        entry[key] = value

    def parse_article(self, entry, what, entries=None):
        """Fill ``entry`` with data extracted from its article text.

        Sets years and sort key (from the template or the categories),
        'intro', dates found in the intro, 'exclude' for approximate
        dates of the current topic, and 'desc'.  Redirects and missing
        pages are left untouched.  ``entries`` is accepted for
        compatibility and not used.
        """
        intro = None
        try:
            text = entry['pagelink'].get()
        except wikipedia.IsRedirectPage:
            return
        except wikipedia.NoPage:
            return

        # Look for template.
        m = None
        if self.template_re is not None:
            m = re.search(self.template_re, text)
        if m:
            self.check_entry(entry, 'byear', 'birth year', m.group(1))
            self.check_entry(entry, 'dyear', 'death year', m.group(2))
            self.check_entry(entry, 'sortkey', 'sort key', m.group(3))
        else:
            # Get birth year from category, if possible.
            m = self.born_cat_re.search(text)
            if m:
                self.check_entry(entry, 'byear', 'birth year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:births"
                                 % entry['article'])

            # Get death year from category, if possible.
            m = self.died_cat_re.search(text)
            if m:
                self.check_entry(entry, 'dyear', 'death year', m.group(1))
                self.check_entry(entry, 'sortkey', 'sort key', m.group(2))
            else:
                wikipedia.output(u"%s has no Category:deaths"
                                 % entry['article'])

        # Find introductory paragraph.
        m = re.search(self.intro1_re % (entry.get('byear') or self.year_re,
                                        entry.get('dyear') or self.year_re),
                      text, re.M)
        if m:
            entry['intro'] = m.group(0)
            intro = m.group(3)

            # Birth date available in intro?
            mm = re.search(self.date_re, m.group(1))
            if mm:
                self.check_entry(entry, 'bdate', 'birth date',
                                 self.make_date(mm))

            # Birth date approximate?
            if self.approx_re.search(m.group(1)) and what == 'births':
                entry['exclude'] = True

            # Death date available in intro?
            mm = re.search(self.date_re, m.group(2))
            if mm:
                self.check_entry(entry, 'ddate', 'death date',
                                 self.make_date(mm))

            # Death date approximate?
            if self.approx_re.search(m.group(2)) and what == 'deaths':
                entry['exclude'] = True
        else:
            m = re.search(self.intro2_re, text, re.M)
            if m:
                entry['intro'] = m.group(0)
                intro = m.group(1)
            else:
                # Use first line instead.
                entry['intro'] = text.split('\n')[0]

        # Brief description available?  Prefer the text after the dates,
        # then fall back to searching the whole introduction.
        mm = None
        if intro:
            mm = (self.desc3_re.match(intro)
                  or self.desc4_re.match(intro))
        mm = (mm or self.desc1_re.search(entry['intro'])
              or self.desc2_re.search(entry['intro'])
              or self.desc3_re.search(entry['intro'])
              or self.desc4_re.search(entry['intro']))
        if mm:
            entry['desc'] = self.unwikify(mm.group(1))

    def get_entries(self, what):
        """Return the list of entries for topic ``what``.

        Combines the entries of the page section with the members of the
        category "<year> <what>", follows redirects (merging an entry
        with its target's entry when both are present) and parses every
        article.
        """
        # Get entries from the section of the year page.
        article_entry = {}
        for entry in self.parse_entries(what):
            article_entry[entry['article']] = entry

        # Get lists of births and deaths articles for this year.
        cl = catlib.Category(self.site, '%s %s' % (self.year, what))
        for a in cl.articles():
            name = a.title()
            if name not in self.ignore and name not in article_entry:
                article_entry[name] = {'article': name}

        # Get them all.
        for e in article_entry.values():
            e['pagelink'] = wikipedia.Page(self.site, e['article'])
        wikipedia.getall(self.site,
                         [e['pagelink'] for e in article_entry.values()])

        # Merge redirects.  Iterate over a snapshot because the mapping
        # is modified inside the loop.
        for e in list(article_entry.values()):
            try:
                e['pagelink'].get()
            except wikipedia.IsRedirectPage as arg:
                pl = wikipedia.Page(self.site, arg.args[0])
                redir = pl.title()
                wikipedia.output("%s redirects to %s"
                                 % (e['article'], redir))
                if redir in article_entry:
                    e['pagelink'] = article_entry[redir]['pagelink']
                    del article_entry[redir]
                else:
                    e['pagelink'] = pl
                del article_entry[e['article']]
                article_entry[redir] = e
                e['article'] = redir
            except wikipedia.NoPage:
                continue

        # Parse articles.
        for e in article_entry.values():
            self.parse_article(e, what)
        # A real list: callers sort it in place and slice it.
        return list(article_entry.values())

    def guess_sortkey(self, article):
        """Guess a sort key: 'John Smith' becomes 'Smith, John'."""
        words = article.split(' ')
        if len(words) > 1:
            return words[-1] + u', ' + u' '.join(words[:-1])
        return article

    def _sort_key(self, entry, what):
        """Return the sort key of ``entry`` within topic ``what``."""
        date_key = {'births': 'bdate', 'deaths': 'ddate'}.get(what)
        if date_key is not None and date_key in entry:
            # Second item of a date pair is the 'MMDD' string.
            return entry[date_key][1]
        return entry.get('sortkey') or self.guess_sortkey(entry['article'])

    def sort_entries(self, entries, what):
        """Set each entry's 'sort' field and sort ``entries`` in place."""
        for e in entries:
            e['sort'] = self._sort_key(e, what)
        entries.sort(key=lambda e: e['sort'])

    def format_entry(self, entry, what):
        """Return the wiki text of one list line for ``entry``.

        Excluded entries start with '- ' instead of '* '.
        NOTE(review): the three article-link formats were reconstructed
        (the markup had been stripped, leaving a format string that
        raised TypeError); the date and year formats are kept as found
        and may originally have been wiki-linked as well -- confirm.
        """
        if entry.get('exclude'):
            t = u'- '
        else:
            t = u'* '
        if what == 'births' and 'bdate' in entry:
            t = t + u'%s - ' % entry['bdate'][0]
        elif what == 'deaths' and 'ddate' in entry:
            t = t + u'%s - ' % entry['ddate'][0]
        t = t + (entry.get('pre') or u'')
        if 'linktext' in entry:
            t = t + u'[[%s|%s]]' % (entry['article'], entry['linktext'])
        elif entry['article'][-1] == ')':
            # Pipe trick: hides the parenthesised disambiguator.
            t = t + u'[[%s|]]' % entry['article']
        else:
            t = t + u'[[%s]]' % entry['article']
        if 'post' in entry:
            t = t + entry['post']
        elif 'desc' in entry:
            t = t + u', ' + entry['desc']
        if what == 'births' and 'dyear' in entry:
            t = t + u' (died %s)' % entry['dyear']
        elif what == 'deaths' and 'byear' in entry:
            t = t + u' (born %s)' % entry['byear']
        return t

    def write_entries(self, entries, what):
        """Return the page text with the ``what`` section list replaced.

        Works on ``self.year_text`` (fetched from the page on first
        use).  If the section cannot be found the text is returned
        unchanged, so that the caller neither loses the edits made for
        other sections nor ends up saving an empty page.
        """
        if not self.year_text:
            self.year_text = self.year_pl.get()
        text = self.year_text
        m = self._find_section(what, text)
        if not m:
            print("No ==%s==" % what.capitalize())
            return text
        lines = [self.format_entry(e, what)
                 for e in entries if not e.get('exclude')]
        return (text[:m.start(1)]
                + u'\n'.join(lines)
                + u'\n\n'
                + text[m.end(1):])

    def show_entries(self, title, entries, what):
        """Sort ``entries`` and print them as a numbered list."""
        wikipedia.output(u'--- %s ---' % title)
        self.sort_entries(entries, what)
        n = 0
        for e in entries:
            n = n + 1
            wikipedia.output(u"%d%s" % (n, self.format_entry(e, what)))

    def _echo(self, n, entry, what):
        """Print entry number ``n`` as it would appear in the list."""
        wikipedia.output(u"%d%s" % (n, self.format_entry(entry, what)))

    def _entry_at(self, entries, n):
        """Return entry number ``n`` (1-based), or None after complaining."""
        if n < 1 or len(entries) < n:
            wikipedia.output(u"No entry %d (must be 1-%d)"
                             % (n, len(entries)))
            return None
        return entries[n - 1]

    def _current_post(self, entry):
        """Return the text shown after the link: 'post', else ', desc'."""
        if entry.get('post'):
            return entry['post']
        if 'desc' in entry:
            return u', ' + entry['desc']
        return ''

    def interface(self, title, entries, what):
        """Run the command loop for one batch of entries.

        Returns True when the operator asks to save ('s' or 'w') and
        False on quit ('q').  See ``help_text`` for the commands.
        """
        self.show_entries(title, entries, what)
        while True:
            inp = wikipedia.input(u"-- What now? [hlqs0-9pdtx]")
            m1 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*([0-9]+)$', inp)
            m2 = re.match(r'^\s*([0-9]+)\s*([A-Za-z])\s*(:.*)?$', inp)
            m3 = re.match(r'^\s*([0-9]+)\s*' + self.pattern_re, inp)
            m4 = re.match(r'^\s*' + self.pattern_re, inp)
            if inp == 'l':
                self.show_entries(title, entries, what)
            elif inp == 'q':
                return False
            elif inp == 's' or inp == 'w':
                return True
            elif inp == 'h':
                wikipedia.output(self.help_text)
            elif m1:
                # <n><op><m>
                n = int(m1.group(1))
                op = m1.group(2)
                n2 = int(m1.group(3))
                e = self._entry_at(entries, n)
                if e is None:
                    pass
                elif op == 'd':
                    # The text starts with ", ", so n2 + 1 pieces keep
                    # n2 words.
                    desc = self._current_post(e)
                    e['post'] = ' '.join(desc.split(' ')[:n2 + 1])
                    self._echo(n, e, what)
                else:
                    wikipedia.output(u"Not understood: %s" % inp)
            elif m2:
                # <n><op> or <n><op>:<text>
                n = int(m2.group(1))
                op = m2.group(2)
                arg = m2.group(3)
                e = self._entry_at(entries, n)
                if e is None:
                    pass
                elif op == 'p':
                    for k, v in e.items():
                        wikipedia.output(u' %s: %s' % (k, v))
                elif op == 'd':
                    if arg and 2 <= len(arg):
                        e['post'] = u', ' + arg[1:]
                    else:
                        e['post'] = ''
                    self._echo(n, e, what)
                elif op == 'P':
                    # No ":text" given means an empty prefix.
                    e['pre'] = (arg or u':')[1:]
                    self._echo(n, e, what)
                elif op == 't':
                    try:
                        wikipedia.output(e['pagelink'].get())
                    except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                        wikipedia.output(u"No page %s"
                                         % e['pagelink'].title())
                elif op == 'i':
                    wikipedia.output(e.get('intro', u'No intro'))
                elif op == 'x':
                    e['exclude'] = not e.get('exclude')
                    self._echo(n, e, what)
                else:
                    wikipedia.output(u"Not understood: %s" % inp)
            elif m3:
                # <n>/<from>/<to>/
                n = int(m3.group(1))
                e = self._entry_at(entries, n)
                if e is not None:
                    try:
                        e['post'] = re.sub(m3.group(2), m3.group(3),
                                           self._current_post(e))
                    except re.error as err:
                        wikipedia.output(u"Bad pattern: %s" % err)
                    else:
                        self._echo(n, e, what)
            elif m4:
                # /<from>/<to>/ -- check the pattern with a dry run
                # before it is stored, so that a typo cannot end up in
                # the pattern file and break later runs.
                try:
                    re.sub(m4.group(1), m4.group(2), u'')
                except re.error as err:
                    wikipedia.output(u"Bad pattern: %s" % err)
                else:
                    self.patterns.append((m4.group(1), m4.group(2)))
                    self.save_patterns()
                    self.apply_patterns()
            else:
                wikipedia.output(u"Not understood: %s" % inp)

    def run(self):
        """Collect, review and save the entries of every topic.

        Entries are reviewed in batches of 20.  Quitting from a batch
        abandons the run; after the last batch the diff is shown and the
        page is saved if the operator answers 'y', otherwise the review
        starts over.
        """
        self.topic_entries = {}
        for what in self.topic_names:
            self.topic_entries[what] = self.get_entries(what)
            self.sort_entries(self.topic_entries[what], what)
        self.apply_patterns()
        while True:
            for what in self.topic_names:
                entries = self.topic_entries[what]
                for i in range((len(entries) + 19) // 20):
                    efrom = i * 20
                    eto = min(len(entries), (i + 1) * 20)
                    batch = entries[efrom:eto]
                    title = u'%s (%d-%d)' % (what.capitalize(),
                                             efrom + 1, eto)
                    if not self.interface(title, batch, what):
                        return
                self.sort_entries(entries, what)
                self.year_text = self.write_entries(entries, what)
            wikipedia.showDiff(self.year_pl.get(), self.year_text)
            if wikipedia.input(u"OK? [yN]") == 'y':
                self.year_pl.put(self.year_text, self.comment)
                return

# Script entry point: yearbot.py <year>
if __name__ == '__main__':
    wikipedia.username = 'yearbot'
    try:
        if len(sys.argv) < 2:
            # sys.exit raises SystemExit with the message (string
            # exceptions are not valid); the finally clause still runs.
            sys.exit("No year specified")
        Year(sys.argv[1]).run()
    finally:
        wikipedia.stopme()