User:Jitse's bot/goim.py


#!/usr/bin/python
# encoding: utf-8


# Written by Jitse Niesen, jitse@maths.leeds.ac.uk
# Released in public domain.
#
# This is a bot which updates various reports of interest to
# WikiProject Mathematics on the English Wikipedia. It runs daily
# under the account User:Jitse's bot.
#
# This code is a mess and I know it is a mess, but I don't care very
# much. The bot only writes to three report pages:
#  * Wikipedia:WikiProject Mathematics/Current activity
#  * Wikipedia:Pages needing attention/Mathematics/Lists
#  * Wikipedia:WikiProject Mathematics/Count
# (updateScript additionally uploads this source file to
# User:Jitse's bot/goim.py.)
# Any malfunctioning will thus be contained within those pages
# and within the maths WikiProject, which will gladly clean up any
# mess it creates ;) . That being said, if somebody is willing to
# rewrite the code, go ahead!

import datetime
import os
import pickle
import re
import sys
import time
from pprint import PrettyPrinter

# Relative file names used further down (goim.py, goim.out) resolve against
# this directory, and pywikibot is imported from the path appended here, so
# both statements must run before the pywikibot import.
os.chdir('/data/project/jitse-bot/goim')
sys.path.append('/shared/pywikipedia/rewrite')
import pywikibot
#from pywikibot.catlib import Category
#from pywikibot.page import Page

site = pywikibot.getSite()

noload = False                         # do not update data
nowrite = False                        # do not write results
maxDays = 60                           # do not include news older than this
subject = 'daily update from wikilabs'

data = None                            # global variable containing all data
tempdata = {}                          # data that does not need to be saved

class GoimData:
    """All data is stored in a global variable of this type.

    Every attribute is a fresh GoimDiff instance (a tracked article list),
    except ``vfd`` and ``vfdres`` which are plain dictionaries.
    """

    def __init__(self):
        # Each tracker must be its own GoimDiff *instance*; assigning the
        # class itself would make every attribute share one object.
        self.all = GoimDiff()          # all maths articles
        self.cat = GoimDiff()          # maths categories
        self.clar = GoimDiff()         # articles needing clarification
        self.clean = GoimDiff()        # articles needing clean-up
        self.con = GoimDiff()          # articles needing context
        self.disp = GoimDiff()         # accuracy disputes
        self.exp = GoimDiff()          # articles needing expert attention
        self.fac = GoimDiff()          # featured article candidates
        self.far = GoimDiff()          # featured article review
        self.farc = GoimDiff()         # featured article removal candidates
        self.gac = GoimDiff()          # good article candidates
        self.gar = GoimDiff()          # good article review
        self.imp = GoimDiff()          # needing importance to be explained
        self.macr = GoimDiff()         # maths a-class rating candidate
        self.merge = GoimDiff()        # articles to be merged
        self.misc = GoimDiff()         # articles in misc clean-up cats
        self.morefs = GoimDiff()       # articles needing additional refs
        self.norelref = GoimDiff()     # articles lacking reliable refs
        self.orig = GoimDiff()         # original research
        self.orph = GoimDiff()         # orphaned articles
        self.pr = GoimDiff()           # articles on peer review
        self.prop = GoimDiff()         # proposed deletion
        self.req = GoimDiff()          # requested articles
        self.rewr = GoimDiff()         # articles needing a rewrite
        self.rqIm = GoimDiff()         # requested images
        self.split = GoimDiff()        # articles to be split
        self.spr = GoimDiff()          # articles on scientific peer review
        self.tech = GoimDiff()         # too technical
        self.tone = GoimDiff()         # inappropriate tone
        self.unref = GoimDiff()        # articles lacking sources
        self.unsrc = GoimDiff()        # articles with unsourced statements
        self.vfy = GoimDiff()          # articles needing verification
        self.vfd = {}                  # listed on VfD
        self.vfdres = {}               # VfD results

class GoimDiff:
    """Lists of articles plus last seven diffs."""

    def __init__(self):
        self.cur = None                # current list
        self.curtime = None            # timestamp of 'cur'
        self.added = []                # list of lists of added files
        self.rmvd = []                 # list of lists of removed files
        self.times = []                # list of timestamps

    def update(self, new):
        """Replace the current list by ``new`` and record the difference.

        ``new`` is sorted in place and stripped of duplicates.  Returns a
        tuple ``(number added, number removed)``; on the very first update
        (no current list yet) it returns ``(len(new), 0)`` and records no
        diff.  At most seven diffs are kept.
        """
        # Instances may lack attributes (the original code guarded against
        # AttributeError here), so do not assume __init__ has run.
        old = getattr(self, 'cur', None)
        new.sort()
        # Adjacent comparison is enough to deduplicate a sorted list.
        new = [x for i, x in enumerate(new) if i == 0 or x != new[i - 1]]

        if old is None:
            self.cur = new
            self.curtime = datetime.datetime.utcnow()
            return (len(new), 0)

        # Set lookups keep the diff linear; entries are strings or tuples.
        oldSet = set(old)
        newSet = set(new)
        added = [x for x in new if x not in oldSet]
        rmvd = [x for x in old if x not in newSet]
        if added or rmvd:
            if not (hasattr(self, 'added') and hasattr(self, 'rmvd')
                    and hasattr(self, 'times')):
                self.added = []
                self.rmvd = []
                self.times = []
            self.added = [added] + self.added[:6]
            self.rmvd = [rmvd] + self.rmvd[:6]
            self.times = [self.curtime] + self.times[:6]
            # NOTE(review): indentation was lost in the source.  These two
            # assignments are placed inside the "something changed" branch
            # because makeList reports curtime as the date of the newest
            # diff -- confirm against the deployed script.
            self.cur = new
            self.curtime = datetime.datetime.utcnow()
        return (len(added), len(rmvd))

    def makeList(self, rmvd = False, num = 7, total = True, prefix = None):
        """Make a wiki bullet list of the articles added, one line per diff.

        rmvd: if True, list articles removed instead.
        num: maximum number of diffs to include.
        total: append a line with the total number, unless rmvd is True.
        prefix: if not None, prefix to be added in front of article name.

        Diffs older than the module-level ``maxDays`` are skipped.  Added
        articles that are no longer in the current list are struck through.
        """
        res = ''
        diffs = self.rmvd if rmvd else self.added

        for i in range(min(len(diffs), num)):
            if not diffs[i]:
                continue
            # The newest diff is dated by curtime, older ones by times[i-1].
            t = self.curtime if i == 0 else self.times[i-1]
            if (datetime.datetime.now()-t).days > maxDays:
                continue
            # Shift back 18 hours before formatting the day.
            t = (t-datetime.timedelta(0.75)).strftime('%d %b')
            t = t.lstrip('0')
            x = []
            for a in diffs[i]:
                atxt = prefix + a if prefix else a
                # NOTE(review): the link and strike-through markup was
                # stripped from the source; '[[...]]' and '<s>...</s>' are
                # reconstructed -- confirm against the deployed script.
                if a in self.cur or rmvd:
                    x.append('[[' + atxt + ']]')
                else:
                    x.append('<s>[[' + atxt + ']]</s>')
            res += "* %s: %s.\n" % (t, ', '.join(x))

        if not res:
            res = '* No (recent) news.\n'

        if not rmvd and total:
            if not self.cur:
                res += "* Total: None.\n"
            else:
                res += "* Total: %d.\n" % (len(self.cur),)

        return res

def list2wiki0(list):
    """Format ``list`` as wiki bullet lines followed by a total line."""
    # NOTE(review): wiki markup was stripped from this source; the item
    # format may originally have wrapped %s in link brackets -- confirm.
    l = [ '* %s\n' % a for a in list ]
    l.append('* Total: {:d}\n'.format(len(list)))
    return ''.join(l)

def list2wiki(list):
    """Format a list of articles for the 'Lists' page.

    Returns '* None\\n' for an empty or missing list.  Otherwise the list is
    sorted in place, deduplicated and formatted by list2wiki0; lists with
    more than eight entries get an extra wrapper around them.
    """
    if not list:
        return '* None\n'
    list.sort()
    list = removeDuplicates(list)
    if len(list) <= 8:
        res = list2wiki0(list)
    else:
        # NOTE(review): the wrapper now consists of bare newlines; markup
        # around long lists appears to have been stripped from this source
        # -- restore it from the deployed script.
        res = '\n' + list2wiki0(list) + '\n'
    return res

def studyCatTree(root, cats, catsRec, ignore, arts):
    """Traverses the category subtree starting at _root_ and looks for
    articles from the list _arts_. Returns a dictionary D such that:
    * for all c in cats, D[c] is a list of all articles from _arts_
      found in the category c;
    * for all c in catsRec, D[c] is a list of all articles from _arts_
      found in the category c or one of its subcategories;
    * D['misc'] is a list of tuples (article, category) containing all
      articles from _arts_ found elsewhere in the subtree.
    Categories in _ignore_ and their subcategories are ignored."""
    # This function is not used

    todo = [root]
    done = set()
    D = {}
    D['misc'] = []
    while todo:
        catname = todo.pop(0)
        if catname in ignore or catname in done:
            continue
        # Remember visited categories; without this a cycle in the category
        # graph would keep the loop running forever.
        done.add(catname)
        c = pywikibot.Category(site, catname)
        if catname in catsRec:
            a = c.articles(True)
        else:
            subcats = c.subcategories()
            titles = [ x.title(withNamespace=False) for x in subcats ]
            todo.extend([ x for x in titles
                          if not x in done and x != catname ])
            a = c.articles(False)
        # Keep pages in namespaces 0 and 1 only, and only those in _arts_.
        a = [ x.title(withNamespace=False) for x in a
              if ((x.namespace() == 0 or x.namespace() == 1)
                  and x.title(withNamespace=False) in arts) ]
        if catname in cats or catname in catsRec:
            D[catname] = a
        else:
            D['misc'].extend([(x,catname) for x in a])
    D['misc'].sort()
    return D


def removeDuplicates(list):
    """Removes duplicates from a list, keeping the first occurrence of each
    entry and the original order.  Callers in this file pass sorted lists.
    Entries must be hashable (strings or tuples in this file)."""
    seen = set()
    res = []
    for x in list:
        if x not in seen:
            seen.add(x)
            res.append(x)
    return res

def getLinks(txt):
    """Only returns links to pages in main namespace in this Wikipedia.

    Collects the targets of [[target]] / [[target|label]] links and of
    {{arttalk|target}} templates whose target contains no colon, drops
    targets starting with '#', capitalises the first letter and returns the
    sorted list without duplicates.
    """
    # The group names of the original patterns were lost in this source;
    # with a single group, findall returns the same strings either way.
    links = re.findall(r'\[\[([^:\]\|]*)(?:\|[^\]\|]*)?\]\]', txt)
    links += re.findall(r'\{\{arttalk\|([^:\}]*)\}\}', txt)
    # Both patterns can match an empty target ("[[|x]]"); skip those so
    # that link[0] cannot raise IndexError.
    links = [ link[0].upper() + link[1:] for link in links
              if link and not link.startswith('#') ]
    return sorted(set(links))

def removeLines(txt, s1, s2):
    """Remove those lines from 'txt' which start with 's1' and end
    with 's2'."""
    lines = txt.split('\n')
    lines = [ l for l in lines
              if not (l.startswith(s1) and l.endswith(s2)) ]
    return '\n'.join(lines)

def getSection(txt, title):
    """Get a section (of arbitrary level) with the given title.

    Returns the lines between the heading and the next heading of the same
    level (or the end of the text).  Prints a warning and returns None if
    no such heading exists.
    """
    lines = txt.split('\n')
    start = None
    level = None
    for i, line in enumerate(lines):
        # k = number of leading '=' characters, i.e. the heading level.
        k = 0
        while len(line) > k and line[k] == '=':
            k = k+1
        # Compare against None: the heading may sit on line 0, and index 0
        # must not be mistaken for "not found yet".
        if start is not None and k == level:
            return '\n'.join(lines[start+1:i])
        if k >= 1:
            x = line[k:].strip()
            if x.startswith(title) and x[len(title):].strip() == k * '=':
                start = i
                level = k
    if start is not None:
        return '\n'.join(lines[start+1:])
    print('WARNING: getSection could not find "%s"\n' % (title,))
    return None

def getPage(title):
    """Get a page from Wikipedia.

    Returns whatever ``Page.get()`` returns for ``title`` on the
    module-level ``site``; exceptions raised by pywikibot propagate.
    """
    page = pywikibot.Page(site, title)
    return page.get()

def initProgressReport():
    """Record the start time in ``tempdata`` and print it."""
    now = datetime.datetime.now()
    tempdata['starttime'] = now
    tempdata['lasttime'] = now
    print("Info: initProgressReport time is %s"
          % now.time().strftime('%H:%M:%S'))

def formatTD(td):
    """Format a timedelta as 'M:SS', or 'HhMM:SS' from one hour upwards.

    Only ``td.seconds`` is used, so whole days are ignored.
    """
    # Floor division keeps the components integral regardless of how the
    # interpreter defines '/' on integers.
    s = td.seconds % 60
    m = (td.seconds // 60) % 60
    h = td.seconds // 3600
    if h == 0:
        return "%d:%02d" % (m,s)
    else:
        return "%dh%02d:%02d" % (h,m,s)

def progressReport():
    """Print the time since the previous report and since the start.

    Reads and updates ``tempdata['lasttime']``; initProgressReport must
    have been called first.
    """
    now = datetime.datetime.now()
    passed = now - tempdata['lasttime']
    passedcum = now - tempdata['starttime']
    tempdata['lasttime'] = now
    print("Info: time %s (cum %s)"
          % (formatTD(passed), formatTD(passedcum)))

def replaceStanza(page, title, txt):
    """Replace the text between the two ``title`` markers in ``page``.

    page: wikitext of the whole page.
    title: name of the stanza.
    txt: replacement text; if it ends in a newline, a newline is also put
         in front so that it starts on its own line.

    Returns the new page text, or ``page`` unchanged (after printing a
    warning) when fewer than two markers are found.
    """
    # NOTE(review): the marker literal was stripped from this source (it
    # read '' % (title,), which cannot work).  The variable name 'cmt'
    # points to an HTML comment containing the title; since its exact
    # spacing is unknown, any whitespace around the title is accepted.
    # Confirm the marker format against the target pages.
    cmt = re.compile(r'<!--\s*%s\s*-->' % re.escape(title))
    first = cmt.search(page)
    second = cmt.search(page, first.end()) if first else None
    if second is None:
        print("Warning: %s not found" % ('<!-- %s -->' % (title,)))
        return page
    if txt.endswith('\n'):
        txt = '\n' + txt
    return page[:first.end()] + txt + page[second.start():]

def summarizeFeatured0(diff, txt):
    """Turn the 'added' diffs of ``diff`` into (timestamp, text) pairs.

    diff: a GoimDiff-like object (uses added, times, curtime and cur).
    txt: function mapping an article name to the news text.

    Articles that are no longer in ``diff.cur`` get their text struck
    through.
    """
    res = []
    for i in range(len(diff.added)):
        # The newest diff is dated by curtime, older ones by times[i-1].
        t = diff.curtime if i == 0 else diff.times[i-1]
        for a in diff.added[i]:
            if a in diff.cur:
                res += [(t, txt(a))]
            else:
                # NOTE(review): the strike-through tags were stripped from
                # this source; '<s>' / '</s>' are reconstructed.
                res += [(t, '<s>' + txt(a) + '</s>')]
    return res


def summarizeFeatured():
    """Build the 'featured' stanza: recent FAC, FAR, FARC, A-class, peer
    review, GAC and GAR news from the global ``data``, newest first.

    Items older than ``maxDays`` are dropped.  Beyond the first seven
    items, struck-through entries are only listed if they are less than 18
    hours older than the seventh item.
    """
    # NOTE(review): the wikilinks in these format strings were stripped
    # from the source, leaving one %s for two arguments.  The link targets
    # below are reconstructed from the page names used by the harvest*
    # functions and must be checked against the deployed script.
    sources = [
        (data.fac,
         '[[%s]] is candidate to become a Featured Article '
         + '([[Wikipedia:Featured article candidates/%s|discussion]])'),
        (data.far,
         'The Featured Article status of [[%s]] is under review '
         + '([[Wikipedia:Featured article review/%s|discussion]])'),
        (data.farc,
         '[[%s]] is nominated to have its Featured Article status removed '
         + '([[Wikipedia:Featured article review/%s|discussion]])'),
        (data.macr,
         'A [[Wikipedia:WikiProject Mathematics/A-class rating/%s|discussion]] '
         + 'has been started on whether [[%s]] should be graded as A-class quality'),
        (data.pr,
         '[[%s]] is undergoing Peer Review '
         + '([[Wikipedia:Peer review/%s|discussion]])'),
        (data.gac,
         '[[%s]] is candidate to become a Good Article '
         + '(see [[Talk:%s]] for link to discussion)'),
        (data.gar,
         'The Good Article status of [[%s]] is under review '
         + '([[Wikipedia:Good article reassessment/%s|discussion]])'),
    ]
    lst = []
    for diff, fmt in sources:
        # Bind fmt as a default so each lambda keeps its own format string.
        lst.extend(summarizeFeatured0(diff, lambda x, fmt=fmt: fmt % (x, x)))
    lst.sort(key = lambda x: x[0], reverse = True) # sort on dates

    res = ''
    cutoff = None
    for i in range(len(lst)):
        when, text = lst[i]
        # No news older than 'maxDays' days
        if (datetime.datetime.now()-when).days > maxDays:
            break
        if i == 6:
            cutoff = when
        # Don't list struck through items more 18h older than the 7th item
        if (i <= 6 or cutoff-when < datetime.timedelta(0.75)
            or not text.startswith('<s>')):
            t = (when-datetime.timedelta(0.75)).strftime('%d %b').lstrip('0')
            res += ("* %s: %s.\n" % (t, text))
    if not res:
        res = '* No (recent) news.\n'
    return res

def summarizeVfd(vfd, vfdres):
    """Build the deletion-discussion stanza, newest day first.

    vfd: dictionary mapping a date to the list of discussion subpage names
         of that day.
    vfdres: dictionary mapping a discussion name to its result (or None);
            discussions present in it are reported as closed.

    Only discussions about maths articles (``data.all.cur``) are listed,
    including renominations named 'Article (... nomination)'.
    """
    known = set(data.all.cur)
    res = ''
    for d in sorted(vfd, reverse=True):
        l1 = [ a for a in vfd[d] if a in known ]
        l2 = [ a for a in vfd[d]
               if (a.endswith('nomination)') and a.rfind('(') != -1
                   and a[:a.rfind('(')-1] in known) ]
        if l1 or l2:
            l1.extend(l2)
            xs = []
            for a in l1:
                if a in l2:
                    # Strip the trailing ' (2nd nomination)' or similar.
                    title = a[:a.rfind('(')-1]
                else:
                    title = a
                afdpage = 'Wikipedia:Articles for deletion/%s' % a
                if a in vfdres:
                    suffix = ' closed'
                    if vfdres[a]:
                        suffix += ', result: ' + vfdres[a]
                else:
                    suffix = ''
                # NOTE(review): the wikilinks were stripped from this
                # source, leaving two %s for three arguments; the link
                # markup is reconstructed -- confirm.
                xs.append('[[%s]] ([[%s|discussion]]%s)'
                          % (title, afdpage, suffix))
            x = ', '.join(xs)
            t = d.strftime('%d %b').lstrip('0')
            res += ("* %s: %s.\n" % (t,x))
    if not res:
        res = 'None.\n'
    return res

def writeGoim():
    """Regenerate every stanza of the 'Current activity' page.

    Unless ``nowrite`` is set, the page is saved and a copy is written to
    goim.out; otherwise the new text is only printed.
    """
    page = pywikibot.Page(site, 'Wikipedia:WikiProject Mathematics/Current activity')
    txt = page.get()

    txt = replaceStanza(txt, 'requested', data.req.makeList())
    txt = replaceStanza(txt, 'reqImages', data.rqIm.makeList())
    txt = replaceStanza(txt, 'onVfd', summarizeVfd(data.vfd, data.vfdres))
    txt = replaceStanza(txt, 'onCfD',
                        data.cfd.makeList(prefix = ":Category:", total = False))
    txt = replaceStanza(txt, 'prop', data.prop.makeList(total = False))
    txt = replaceStanza(txt, 'newArticles', data.all.makeList())
    txt = replaceStanza(txt, 'rmvdArticles', data.all.makeList(True))
    txt = replaceStanza(txt, 'context', data.con.makeList())
    txt = replaceStanza(txt, 'cleanup', data.clean.makeList())
    txt = replaceStanza(txt, 'verify', data.vfy.makeList())
    txt = replaceStanza(txt, 'expert', data.exp.makeList())
    txt = replaceStanza(txt, 'technical', data.tech.makeList())
    txt = replaceStanza(txt, 'featured', summarizeFeatured())

    if not nowrite:
        page.put(txt, subject)
        # Binary mode: the text is encoded explicitly; 'with' closes the
        # file even if the write fails.
        with open('goim.out', 'wb') as out:
            out.write(txt.encode('utf-8'))
        print("Info: writeGoim finished at %s"
              % (datetime.datetime.now().time(),))
    else:
        print(txt.encode('utf-8'))


def updatePortal():
    """Write the number of maths articles to the 'Count' page."""
    page = pywikibot.Page(site, 'Wikipedia:WikiProject Mathematics/Count')
    page.put(unicode(len(data.all.cur)), subject, watchArticle=True)
    print("Info: updatePortal finished")
    progressReport()

def updateScript():
    """Upload the local goim.py to the bot's script page if it changed."""
    scriptpage = pywikibot.Page(site, "User:Jitse's bot/goim.py")
    with open('goim.py', 'rb') as src:
        text = src.read().decode('utf-8')
    # NOTE(review): the opening and closing wrapper literals below appear
    # to have lost their markup when this source was extracted (the
    # original comment here was "Split to confuse MW parser").  They are
    # kept exactly as found; restore the tags from the deployed script.
    text = ' \n' + text + ''
    if scriptpage.get() != text:
        scriptpage.put(text, subject)
    print("Info: updateScript finished")
    progressReport()

def updateLists():
    """Regenerate every stanza of the 'Pages needing attention' lists page.

    Unless ``nowrite`` is set the page is saved; otherwise the new text is
    only printed.
    """
    pn = 'Wikipedia:Pages needing attention/Mathematics/Lists'
    page = pywikibot.Page(site, pn)
    txt = page.get()

    # (stanza name, list of articles), in the order the stanzas are updated.
    stanzas = [
        ('verify',        data.vfy.cur),
        ('expert',        data.exp.cur),
        ('context',       data.con.cur),
        ('reqImages',     data.rqIm.cur),
        ('technical',     data.tech.cur),
        ('cleanup',       data.clean.cur),
        ('merge',         data.merge.cur),
        ('disputed',      data.disp.cur),
        ('unreferenced',  data.unref.cur),
        ('norelref',      data.norelref.cur),
        ('morerefs',      data.morefs.cur),
        ('unsourced',     data.unsrc.cur),
        ('split',         data.split.cur),
        ('clarification', data.clar.cur),
        ('rewrite',       data.rewr.cur),
        ('importance',    data.imp.cur),
        ('tone',          data.tone.cur),
        ('original',      data.orig.cur),
        ('orphaned',      data.orph.cur),
    ]
    for name, articles in stanzas:
        txt = replaceStanza(txt, name, list2wiki(articles))

    if data.misc.cur:
        # Entries of data.misc.cur are (article, category) tuples.
        # NOTE(review): wiki markup was stripped from this source; the
        # format below may originally have contained link brackets.
        l = [ '* %s is in Category:%s\n' % a for a in data.misc.cur ]
    else:
        l = '* None\n'
    txt = replaceStanza(txt, 'misc',        ''.join(l))

    if not nowrite:
        # The edit summary varies with the day number modulo 4.
        page.put(txt, subject + ' (phase %d)'
                 % (datetime.datetime.now().toordinal() % 4))
        print("Info: updateLists finished")
        progressReport()
    else:
        print('-' * 70)
        print(txt.encode('utf-8'))

def listedOnVfd():
    """Refresh ``data.vfd`` with the deletion logs of the last ten days.

    ``data.vfd`` maps a date to the list of discussion subpage names
    transcluded on that day's log.  Days already present are reused; only
    missing days are fetched.  Older days are dropped.
    """
    if not 'vfd' in dir(data):
        data.vfd = {}
    # The same three transclusion forms are searched on every log page, so
    # compile them once.  They stay separate to keep the result order.
    patterns = [
        re.compile(r'\{\{Wikipedia:Votes for deletion/([^\}]*)\}\}'),
        re.compile(r'\{\{Wikipedia:Pages for deletion/([^\}]*)\}\}'),
        re.compile(r'\{\{Wikipedia:Articles for deletion/([^\}]*)\}\}'),
    ]
    newvfd = {}
    d = datetime.date.today()
    for i in range(10):
        d = d - datetime.timedelta(1)  # subtract one day
        if d in data.vfd:
            newvfd[d] = data.vfd[d]
        else:
            ds = d.strftime('%Y %B %d')
            ds = ds.replace(' 0', ' ')     # 'June 05' -> 'June 5'
            ttl = 'Wikipedia:Articles for deletion/Log/' + ds
            try:
                txt = getPage(ttl)
            except pywikibot.IsRedirectPage:
                ttl = 'Wikipedia:Votes for deletion/Log/' + ds
                txt = getPage(ttl)
            found = []
            for pattern in patterns:
                found += pattern.findall(txt)
            print("Info: listedOnVfd found %d for %s" % (len(found), ds))
            newvfd[d] = [ x.replace('_', ' ') for x in found ]
    data.vfd = newvfd
    print("Info: listedOnVfd finished")
    progressReport()

def VfdResults():
    """Record the outcome of closed maths-related deletion discussions."""
    # data.vfdres is a dictionary for closed AfDs
    # key = AfD page title (without "Wikipedia:Articles for Deletion/" prefix)
    # value = result (string) or None for parse error

    if not 'vfdres' in dir(data):
        data.vfdres = {}

    allafds = []
    for v in data.vfd.values():
        allafds.extend(v)
    listed = set(allafds)

    # Throw old AfDs away (iterate over a copy of the keys while deleting)
    for afd in list(data.vfdres):
        if afd not in listed:
            del data.vfdres[afd]

    # Find maths-related AfDs
    known = set(data.all.cur)
    res = []
    for afd in allafds:
        if afd in data.vfdres:
            continue
        renomination = (afd.endswith('nomination)') and afd.rfind('(') != -1
                        and afd[:afd.rfind('(')-1] in known)
        if afd in known or renomination:
            res.append(afd)

    Rclosed = re.compile(r'<div class="[^"]*xfd-closed[^"]*"')
    Rresult = re.compile(r"The result was *'''([^']*)")

    for v in res:
        afdpage = pywikibot.Page(site, u'Wikipedia:Articles for deletion/' + v)
        try:
            afdtext = afdpage.get()
        except pywikibot.NoPage:
            print("Warning: VfDresults can't find %s" % afdpage.title())
            continue
        # match(): the closing box is looked for at the very start of the page.
        closed = Rclosed.match(afdtext)
        if closed:
            match = Rresult.search(afdtext)
            if match:
                data.vfdres[v] = match.group(1).lower()
            else:
                # Closed, but the result line could not be parsed.
                data.vfdres[v] = None

def listedOnCfd():
    """Update ``data.cfd`` with the maths categories up for discussion.

    Collects the subcategories of every subcategory of 'Categories for
    discussion' and keeps those that are in ``data.cat.cur``.
    """
    if not 'cfd' in dir(data) or isinstance(data.cfd, dict):
        data.cfd = GoimDiff()
        # Start from an empty list so that the first real update is
        # recorded as a diff.
        data.cfd.update([])
    cats = []
    cfdcat = pywikibot.Category(site, "Category:Categories for discussion")
    for cfdsubcat in cfdcat.subcategories():
        cs = cfdsubcat.subcategories()
        cs = [ c.title(withNamespace=False) for c in cs ]
        cats.extend(cs)
    cats.sort()
    cats = removeDuplicates(cats)
    print("Info: listedOnCfd found {:d} categories".format(len(cats)))
    cats = [ c for c in cats if c in data.cat.cur ]
    res = data.cfd.update(cats)
    print("Info: listedOnCfd added %d, removed %d" % res)
    progressReport()


def harvestFAC():
    """Update ``data.fac`` with the maths articles transcluded on the
    Featured article candidates page."""
    if not 'fac' in dir(data):
        data.fac = GoimDiff()
        data.fac.update([])
    ttl = 'Wikipedia:Featured article candidates'
    txt = getPage(ttl)
    R1 = re.compile(r'\{\{' + ttl + r'/([^\}]*)\}\}')
    lst = R1.findall(txt)
    print("Info: Found %d articles on FAC" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.fac.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestFAC added %d, removed %d" % res)

def harvestFAR():
    """Update ``data.far`` and ``data.farc`` from the Featured article
    review page.

    Transclusions before the '==Featured article removal candidates=='
    heading count as reviews, those after it as removal candidates.
    Raises ValueError if that heading is missing from the page.
    """
    if not 'far' in dir(data):
        data.far = GoimDiff()
        data.far.update([])
    if not 'farc' in dir(data):
        data.farc = GoimDiff()
        data.farc.update([])
    ttl = 'Wikipedia:Featured article review'
    R1 = re.compile(r'\{\{' + ttl + r'/([^\}]*)\}\}')
    txt = getPage(ttl)
    i = txt.index('==Featured article removal candidates==')

    lst = R1.findall(txt[:i])
    print("Info: Found %d articles on FAR" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.far.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestFAR added %d, removed %d to FAR" % res)

    lst = R1.findall(txt[i:])
    print("Info: Found %d articles on FARC" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.farc.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestFAR added %d, removed %d to FARC" % res)

def harvestGAC():
    """Update ``data.gac`` with the maths articles that have a GANentry
    template on the Good article nominations page."""
    if not 'gac' in dir(data):
        data.gac = GoimDiff()
        data.gac.update([])
    ttl = 'Wikipedia:Good article nominations'
    txt = getPage(ttl)
    R1 = re.compile(r'\{\{GANentry\|(?:1=)?([^\|\}]*)[\|\}]')
    lst = R1.findall(txt)
    print("Info: Found %d articles on GAC" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.gac.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestGAC added %d, removed %d" % res)

def harvestGAR():
    """Update ``data.gar`` with the maths articles listed on the two
    VeblenBot category listings for good article review/reassessment."""
    if not 'gar' in dir(data):
        data.gar = GoimDiff()
        data.gar.update([])
    txt = getPage('User:VeblenBot/C/Good articles in need of review')
    R1 = re.compile(r'\{\{CF/Good articles in need of review\|([^\|\}]*)')
    lst = R1.findall(txt)
    txt = getPage('User:VeblenBot/C/Wikipedia good article reassessment')
    R1 = re.compile(r'\{\{CF/Wikipedia good article reassessment\|Good article reassessment/([^/\|]*)')
    lst.extend(R1.findall(txt))
    print("Info: Found %d articles on GAR" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.gar.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestGAR added %d, removed %d" % res)

def harvestPR():
    """Update ``data.pr`` with the maths articles on peer review.

    The Peer review page transcludes VeblenBot listing pages; each of those
    is fetched and scanned for '|Peer review/<article>' entries.
    """
    if not 'pr' in dir(data):
        data.pr = GoimDiff()
        data.pr.update([])
    txt = getPage('Wikipedia:Peer review')
    R1 = re.compile(r'\{\{(User:VeblenBot/C/.*)\}\}')
    R2 = re.compile(r'\|Peer review/([^/\|]*)')
    lst = []
    for subpage in R1.findall(txt):
        lst.extend(R2.findall(getPage(subpage)))
    print("Info: Found %d articles on PR" % len(lst))
    lst2 = sorted(x for x in lst if x in data.all.cur)
    res = data.pr.update(removeDuplicates(lst2))
    if not res == (0,0):
        print("Info: harvestPR added %d, removed %d" % res)

def harvestMACR():
    """Update ``data.macr`` with the articles transcluded (at the start of
    a line) on the maths A-class rating page.

    Unlike the other harvest functions, the result is not filtered against
    ``data.all.cur``.
    """
    if not 'macr' in dir(data):
        data.macr = GoimDiff()
    ttl = 'Wikipedia:WikiProject Mathematics/A-class rating'
    txt = getPage(ttl)
    R1 = re.compile(r'^\{\{' + ttl + r'/([^\}]*)\}\}', re.MULTILINE)
    lst = R1.findall(txt)
    print("Info: Found %d articles on MACR" % len(lst))
    lst.sort()
    res = data.macr.update(removeDuplicates(lst))
    if not res == (0,0):
        print("Info: harvestMACR added %d, removed %d" % res)

def featuredContent():
    """Run every harvest* function through goimTry, then report progress."""
    for harvest in (harvestFAC, harvestFAR, harvestGAC,
                    harvestGAR, harvestPR, harvestMACR):
        goimTry(harvest)
    progressReport()


def _leadingLinks(text):
    """Links from the bulleted lines of ``text``, where each such line is
    cut off after its first ']]'."""
    lines = [ l for l in text.split('\n') if l.lstrip().startswith('*') ]
    lines = [ l[:(l.find(']]')+2)] for l in lines if l.find(']]') != -1 ]
    return getLinks('\n'.join(lines))


def mathArticles():
    """Update ``data.all`` with every article linked from the lists of
    mathematics topics, mathematics articles and mathematicians."""
    if not 'all' in dir(data):
        data.all = GoimDiff()
    links = getLinks(getPage('Lists of mathematics topics'))
    try:
        links += getLinks(getPage(u'Wikipedia:WikiProject Mathematics/List of mathematics articles (0-9)'))
    except pywikibot.IsRedirectPage:
        # Fall back to the title spelled with a dash character (U+2014).
        links += getLinks(getPage(u'Wikipedia:WikiProject Mathematics/List of mathematics articles (0\u20149)'))
    for i in range(ord('A'), ord('Z')+1):
        links += getLinks(getPage('Wikipedia:WikiProject Mathematics/List of mathematics articles ('+chr(i)+')'))

    links += _leadingLinks(getPage('Lists of mathematicians'))

    try:
        for i in range(ord('A'), ord('Z')+1):
            links += _leadingLinks(getPage('List of mathematicians ('+chr(i)+')'))
    except pywikibot.NoPage:
        print('List of mathematicians not yet split.')
    except Exception:
        # Best effort: report the error and continue with what was found.
        sys.excepthook(*sys.exc_info())
        print('\nGoIM: Ignoring above exception.\n')
    links.sort()
    res = data.all.update(removeDuplicates(links))
    print("Info: mathArticles added %d, removed %d" % res)
    progressReport()

def mathCategories():
    """Get all mathematical categories (including mathematicians).

    Reads 'List of mathematics categories' up to the
    '== Mathematics-related categories ==' heading (ValueError if that
    heading is missing) and stores the category names in ``data.cat``.
    """
    if not 'cat' in dir(data):
        data.cat = GoimDiff()
    txt = getPage('List of mathematics categories')
    txt = txt[0 : txt.index('== Mathematics-related categories ==') ]
    cats = []
    prefix = '* [[:Category:'
    for l in txt.split('\n'):
        l = l.lstrip()
        if l.startswith(prefix):
            # The name ends at the label separator, or at the closing
            # bracket if there is no label.
            if '|' in l:
                cats.append(l[len(prefix):l.index('|')])
            else:
                cats.append(l[len(prefix):l.index(']')])
    cats.sort()
    res = data.cat.update(removeDuplicates(cats))
    print("Info: mathCategories added %d, removed %d" % res)
    progressReport()

def articlesFromCategory(catname, recursive=False, subcatprefix=None,
                         subcatword=None, subcatsuffix=None):
    """Generator yielding all articles from a category, plus all articles
    corresponding to talk pages from the category. Arguments:
    * catname: name of category (without the 'Category:' prefix)
    * recursive: if True, then also go to all subcategories, recursively
    * subcatprefix: if set, then also go to all subcategories whose name
                    starts with subcatprefix.
    * subcatword: setting this is equivalent to setting subcatprefix
                  to catname + ' ' + subcatword.
    * subcatsuffix: if set, then also go to all subcategories whose name
                    ends with subcatsuffix.
    Subcategories selected by prefix or suffix are visited one level deep
    only."""
    cat = pywikibot.Category(site, "Category:" + catname)
    for l in cat.articles(recursive):
        # Namespace 0 = articles, 1 = their talk pages; both yield the
        # bare article title.
        if l.namespace() == 0 or l.namespace() == 1:
            yield l.title(withNamespace=False)
    if subcatword:
        subcatprefix = catname + " " + subcatword
    if subcatprefix or subcatsuffix:
        for c in cat.subcategories():
            title = c.title()
            b1 = (subcatprefix
                  and title.startswith("Category:" + subcatprefix))
            # A suffix is compared against the end of the title as is;
            # prepending "Category:" would only ever match a whole title.
            b2 = (subcatsuffix and title.endswith(subcatsuffix))
            if b1 or b2:
                for l in articlesFromCategory(title.replace("Category:", "", 1)):
                    yield l

def reqImages():
    """Update ``data.rqIm`` with the maths articles in the requested-image
    categories."""
    if not 'rqIm' in dir(data):
        data.rqIm = GoimDiff()
    # Build the lookup set once; the categories can be far larger than the
    # list of maths articles.
    known = set(data.all.cur)
    links = [ l for l in articlesFromCategory('Wikipedia requested images',
                                              subcatprefix='Wikipedia requested')
              if l in known ]
    for catname in ('Wikipedia requested photographs of scientists and academics',
                    'Wikipedia requested photographs of people',
                    'Wikipedia requested photographs of physics subjects'):
        links += [ l for l in articlesFromCategory(catname) if l in known ]
    # txt = getPage('Wikipedia:Requested pictures/Science')
    # txt = getSection(txt, 'Mathematics')
    # txt = removeLines(txt, '==', '==')
    # links += getLinks(txt)
    links.sort()
    res = data.rqIm.update(removeDuplicates(links))
    print("Info: reqImages added %d, removed %d" % res)
    progressReport()

def requested():
    """Update ``data.req`` with the links on the requested maths articles
    page, after removing headings and comment lines."""
    if not 'req' in dir(data):
        data.req = GoimDiff()
    txt = getPage('Wikipedia:Requested articles/Mathematics')
    txt = removeLines(txt, '==', '==') # remove headings
    txt = removeLines(txt, "See", "") # remove comments
    txt = removeLines(txt, ":", "") # remove more comments
    # Drop a trailing parenthesised remark (and dashes) from each line.
    reComment = re.compile(r'\([^)]*\)\s*-*\s*$', re.M)
    txt = reComment.sub('', txt)
    links = getLinks(txt)
    res = data.req.update(links)
    print("Info: requested added %d, removed %d" % res)
    progressReport()

def cleanupMisc0(id, catname, recurse=False, subcatprefix=None,
                 subcatword=None, subcatsuffix=None):
    """Put all maths articles from the category catname into the id
    field of data. Arguments:
    * id: name of the attribute of data to update (created if missing)
    * catname: name of category
    * recurse: if True, then also go to all subcategories
    * subcatprefix: if set, then also go to all subcategories whose name
                    starts with subcatprefix.
    * subcatword: setting this is equivalent to setting subcatprefix
                  to catname + ' ' + subcatword.
    * subcatsuffix: if set, then also go to all subcategories whose name
                    ends with subcatsuffix."""
    if not id in dir(data):
        setattr(data, id, GoimDiff())
    # Build the lookup set once; maintenance categories can be far larger
    # than the list of maths articles.
    known = set(data.all.cur)
    links = [ l for l in articlesFromCategory(catname, recurse, subcatprefix,
                                              subcatword, subcatsuffix)
              if l in known ]
    res = getattr(data, id).update(links)
    if not res == (0,0):
        print("Info: cleanupMisc0 added %d, removed %d, to/from %s"
              % (res + (id,)))

def cleanupMiscDay1():
    """Update the context/expert/technical/merge/dispute lists and the
    'misc' list of maths articles found in assorted clean-up categories."""
    cleanupMisc0('con',  'Wikipedia articles needing context', True)
    cleanupMisc0('exp',  'All articles needing expert attention')
    cleanupMisc0('tech', 'Wikipedia articles that are too technical',
                         subcatword='from')
    cleanupMisc0('merge', 'Articles to be merged', subcatword='from')
    cleanupMisc0('disp', 'Accuracy disputes', subcatword='from')

    # Entries are category names; a [name, True] entry is read recursively.
    # NOTE(review): line breaks were lost in this source.  The split between
    # active and commented-out entries assumes one entry per line, i.e. each
    # '#' disables exactly the one entry following it -- confirm against
    # the deployed script.
    l = [ '1911 Britannica articles needing updates',
          # 'Aircraft without proper specifications',
          'Articles containing how-to sections',
          'Articles in need of internal merging',
          # unref = 'Articles lacking sources',
          ['Articles needing original script',True],
          'Articles needing sections',
          'Articles that are too long',
          'Articles that are way too long',
          # merge \subset 'Articles to be merged',
          # split = 'Articles to be split',
          'Articles to check for link ordering',
          'Articles to harmonize',
          'Articles using obsolete parameters',
          'Articles which may be unencyclopedic',
          # orig = 'Articles which may contain original research',
          'Articles with accessibility problems',
          'Articles with confusing statements',
          'Articles with incomplete statements',
          'Articles with peacock terms',
          'Articles with unsourced categories',
          'Articles with unsourced quotes',
          # unsrc = 'Articles with unsourced statements',
          'Articles with weasel words',
          'Articles without infobox',
          'Australia articles needing attention',
          'Biographical Directory of the United States Congress cleanup',
          'Biographies without real biographical information',
          'Biography articles needing attention',
          'Books needing cleanup',
          'CIA World Factbook cleanup',
          # 'Categories requiring diffusion',
          'Category needed',
          'Category needs checking',
          # 'Comics articles needing cleanup',
          # 'Comics needing cleanup',
          'Disambiguation pages in need of cleanup',
          # 'Firefly articles needing attention',
          # 'Guitarist articles needing attention',
          'History of Greece articles needing attention',
          # 'Images for cleanup',
          'India articles needing attention',
          # 'Invalid conservation status',
          # 'Law-related articles lacking sources',
          # merge \subset 'Merge by month',
          # 'Military history articles needing attention',
          'New Zealand cleanup',
          # 'Novel articles needing attention',
          # 'Novel articles with comments',
          # 'Nutrition & Dietetics articles requiring major expansion',
          # orph = 'Orphaned articles'
          # 'Orphaned categories',
          # 'Overpopulated stub categories',
          'Pearle edits needing manual cleanup',
          'Philadelphia articles needing attention',
          # 'Places of local interest needing cleanup',
          # 'Portals needing attention',
          'Rough translations',
          # 'Schools needing cleanup',
          # 'Scouting articles needing attention',
          'Self-contradictory articles',
          # 'Spooks articles with comments',
          # 'Stub categories',
          # 'Tree of Life cleanup',
          # 'U.S. road articles needing work',
          'Uncategorised albums',
          'Uncategorised books',
          'Uncategorised films',
          # 'Very large categories',
          'Virginia articles needing attention',
          # 'WikiProject Comics cleanup',
          'Wikipedia articles containing buzzwords',
          'Wikipedia articles containing sections that are an unencyclopedically presented series of quotes',
          'Wikipedia articles in need of updating',
          # clar = 'Wikipedia articles needing clarification',
          # con = 'Wikipedia articles needing context',
          'Wikipedia articles needing copy edit',
          # vfy = 'Wikipedia articles needing factual verification',
          # rewr = 'Wikipedia articles needing rewrite',
          # tone = 'Wikipedia articles needing style editing',
          'Wikipedia articles needing their fiction made clear',
          'Wikipedia articles requiring OTRS cleanup',
          # tech = 'Wikipedia articles that are too technical',
          'Wikipedia articles using jargon',
          'Wikipedia articles with nonstandard pronunciation',
          'Wikipedia articles with off-topic sections',
          'Wikipedia articles with plot summary needing attention',
          # 'Wikipedia categories in need of attention',
          'Wikipedia cleanup after AFD',
          'Wikipedia external links cleanup',
          'Wikipedia infobox cleanup',
          # con = 'Wikipedia introduction cleanup',
          'Wikipedia laundry list cleanup',
          'Wikipedia list cleanup',
          # 'Wikipedia maintenance categories sorted by month',
          'Wikipedia references cleanup',
          # rqIm = 'Wikipedia requested images',
          'Wikipedia spam cleanup',
          'Wikipedia title cleanup',
          # ---
          'Articles which may be biased',
          'Articles with limited geographic scope',
          'Articles with obsolete information',
          'NPOV disputes',
          'Too Few Viewpoints' ]

    if not 'misc' in dir(data):
        data.misc = GoimDiff()
    known = set(data.all.cur)
    pairs = []
    for x in l:
        if isinstance(x,list):
            catname = x[0]
            recursive = x[1]
        else:
            catname = x
            recursive = False
        try:
            links = articlesFromCategory(catname, recursive)
            # 'art' rather than 'l': the loop variable must not shadow the
            # category list that is being iterated.
            pairs.extend([ (art,catname) for art in links if art in known ])
        except Exception:
            # Best effort: one failing category must not stop the others.
            sys.excepthook(*sys.exc_info())
            print(('\nGoIM: Ignoring above exception in cleanupMiscDay1\n'
                   + '      Category = %s\n') % catname)
    res = data.misc.update(pairs)
    if not res == (0,0):
        print("Info: cleanupMiscDay1 added %d, removed %d, to/from misc" % res)

    print("Info: cleanupMiscDay1 finished")
    progressReport()

def unsourced():
    """Refresh the 'unsrc' list (articles with unsourced statements).

    Delegates to cleanupMisc0 with the key 'unsrc', the category
    'Articles with unsourced statements' and subcatword='from', then
    calls progressReport().
    """
    cleanupMisc0('unsrc', 'Articles with unsourced statements',
                 subcatword='from')
    progressReport()

def cleanupMiscDay2():
    """Run cleanupMisc0 for the second batch of clean-up categories.

    Each job pairs a short key (the names used for the attributes of the
    global ``data``, e.g. ``data.split``) with a Wikipedia category name;
    every call passes subcatword='from'.  The jobs run in the order listed.
    Afterwards a completion line is printed and progressReport() is called.
    """
    jobs = [
        ('split', 'Articles to be split'),
        ('clar', 'Wikipedia articles needing clarification'),
        ('rewr', 'Wikipedia articles needing rewrite'),
        ('imp', 'Articles with topics of unclear notability'),
        ('tone', 'Wikipedia articles needing style editing'),
        ('orig', 'Articles that may contain original research'),
        ('orph', 'Orphaned articles'),
    ]
    for key, category in jobs:
        cleanupMisc0(key, category, subcatword='from')

    print("Info: cleanupMiscDay2 finished")
    progressReport()

def unref():
    """Update data.unref with the maths articles lacking sources.

    Collects the articles of every subcategory of
    'Category:Articles lacking sources' whose title starts with
    'Category:Articles lacking sources from', plus the articles listed
    directly in that category, keeps only those present in data.all.cur,
    and feeds the result to data.unref.update().  The (added, removed)
    pair returned by update() is printed, then progressReport() is called.
    """
    # Data files pickled before this list existed lack the attribute.
    if not hasattr(data, 'unref'):
        data.unref = GoimDiff()
    catname = "Articles lacking sources"
    subcat_prefix = "Category:" + catname + " from"
    links = []
    cat = pywikibot.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        title = c.title()
        if title.startswith(subcat_prefix):
            # articlesFromCategory is called with the bare category name.
            links += articlesFromCategory(title.replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [link for link in links if link in data.all.cur]
    res = data.unref.update(links)
    print("Info: unref added %d, removed %d" % res)
    progressReport()

def norelref():
    """Update data.norelref with the maths articles lacking reliable references.

    Collects the articles of every subcategory of
    'Category:Articles lacking reliable references' whose title starts
    with that name followed by ' from', plus the articles listed directly
    in the category, keeps only those present in data.all.cur, and feeds
    the result to data.norelref.update().  The (added, removed) pair
    returned by update() is printed, then progressReport() is called.
    """
    # Data files pickled before this list existed lack the attribute.
    if not hasattr(data, 'norelref'):
        data.norelref = GoimDiff()
    catname = "Articles lacking reliable references"
    subcat_prefix = "Category:" + catname + " from"
    links = []
    cat = pywikibot.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        title = c.title()
        if title.startswith(subcat_prefix):
            # articlesFromCategory is called with the bare category name.
            links += articlesFromCategory(title.replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [link for link in links if link in data.all.cur]
    res = data.norelref.update(links)
    print("Info: norelref added %d, removed %d" % res)
    progressReport()

def morerefs():
    """Update data.morefs with the maths articles needing additional references.

    Note that the data attribute is spelled ``morefs`` while this function
    is called ``morerefs``.

    Collects the articles of every subcategory of
    'Category:Articles needing additional references' whose title starts
    with that name followed by ' from', plus the articles listed directly
    in the category, keeps only those present in data.all.cur, and feeds
    the result to data.morefs.update().  The (added, removed) pair
    returned by update() is printed, then progressReport() is called.
    """
    # Data files pickled before this list existed lack the attribute.
    if not hasattr(data, 'morefs'):
        data.morefs = GoimDiff()
    catname = "Articles needing additional references"
    subcat_prefix = "Category:" + catname + " from"
    links = []
    cat = pywikibot.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        title = c.title()
        if title.startswith(subcat_prefix):
            # articlesFromCategory is called with the bare category name.
            links += articlesFromCategory(title.replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [link for link in links if link in data.all.cur]
    res = data.morefs.update(links)
    print("Info: morefs added %d, removed %d" % res)
    progressReport()

def vfy():
    """Update data.vfy with the maths articles needing factual verification.

    Collects the articles of every subcategory of
    'Category:Wikipedia articles needing factual verification' whose title
    starts with that name followed by ' from', plus the articles listed
    directly in the category, keeps only those present in data.all.cur,
    and feeds the result to data.vfy.update().  The (added, removed) pair
    returned by update() is printed, then progressReport() is called.
    """
    # Data files pickled before this list existed lack the attribute.
    if not hasattr(data, 'vfy'):
        data.vfy = GoimDiff()
    catname = "Wikipedia articles needing factual verification"
    subcat_prefix = "Category:" + catname + " from"
    links = []
    cat = pywikibot.Category(site, "Category:" + catname)
    for c in cat.subcategories():
        title = c.title()
        if title.startswith(subcat_prefix):
            # articlesFromCategory is called with the bare category name.
            links += articlesFromCategory(title.replace("Category:", "", 1))
    links += articlesFromCategory(catname)
    links = [link for link in links if link in data.all.cur]
    res = data.vfy.update(links)
    print("Info: vfy added %d, removed %d" % res)
    progressReport()

def prop():
    """Refresh the 'prop' list (articles proposed for deletion).

    Delegates to cleanupMisc0 with the key 'prop' and the category
    'All articles proposed for deletion' (no subcatword is passed), then
    calls progressReport().
    """
    cleanupMisc0('prop', 'All articles proposed for deletion')
    progressReport()

def cleanup():
    """Update data.clean with the maths articles needing clean-up.

    Collects the articles of every subcategory of
    'Category:Articles needing cleanup' whose title starts with
    'Category:Articles needing cleanup from', keeps only those present in
    data.all.cur, and feeds the result to data.clean.update().  Unlike the
    sibling functions, no category is read directly: the call for
    'All pages needing cleanup' is disabled.  The (added, removed) pair
    returned by update() is printed, then progressReport() is called.
    """
    # Data files pickled before this list existed lack the attribute.
    if not hasattr(data, 'clean'):
        data.clean = GoimDiff()
    links = []
    cat = pywikibot.Category(site, "Category:Articles needing cleanup")
    for c in cat.subcategories():
        title = c.title()
        if title.startswith("Category:Articles needing cleanup from"):
            # articlesFromCategory is called with the bare category name.
            links += articlesFromCategory(title.replace("Category:", "", 1))
    # links += articlesFromCategory('All pages needing cleanup')
    links = [link for link in links if link in data.all.cur]
    res = data.clean.update(links)
    print("Info: cleanup added %d, removed %d" % res)
    progressReport()

def readData():
    """Read data from the data file, ignoring errors.

    Unpickles the file 'data' in the current working directory into the
    global ``data``.  If that fails for any reason the traceback is
    printed through sys.excepthook and ``data`` is reset to a fresh
    GoimData().
    """
    global data
    try:
        # NOTE: pickle.load executes whatever the file tells it to; this is
        # only acceptable because 'data' is the bot's own file (see writeData).
        # The handle is opened in binary mode and closed deterministically.
        with open('data', 'rb') as f:
            data = pickle.load(f)
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and
        # sys.exit() are not swallowed.
        sys.excepthook(*sys.exc_info())
        print('\nGoIM: Ignoring above exception, starting with no data.\n')
        data = GoimData()

def transitionData():
    """Do whatever is necessary to update the data to the new format.

    Specifically:
     * merge data.stubs into data.all (sorted, duplicates removed) and
       delete data.stubs
     * remove data.cd
     * create data.orph if it does not exist yet
    """
    if hasattr(data, 'stubs'):
        merged = data.all.cur + data.stubs.cur
        # Sorted before removeDuplicates is applied, as in the original code.
        merged.sort()
        merged = removeDuplicates(merged)
        print("Info: Merging data.stubs (%d) into data.all (%d), together (%d)"
              % (len(data.stubs.cur), len(data.all.cur), len(merged)))
        data.all.cur = merged
        del data.stubs
    if hasattr(data, 'cd'):
        print("Info: Removing data.cd")
        del data.cd
    if not hasattr(data, 'orph'):
        data.orph = GoimDiff()

def cleanupData():
    """Remove cruft from data.  Currently a no-op."""
    pass

def writeData():
    """Write data to data file, cycling backups.

    Backups are named 'data~1~' (most recent) up to 'data~7~'.  Existing
    backups are shifted up by one number, the current 'data' file becomes
    'data~1~', and the global ``data`` is then pickled into a new 'data'
    file.  All paths are relative to the current working directory.
    """
    fn = 'data'

    def backup_name(n):
        return fn + '~%d~' % (n,)

    # Go from the highest number down so that each rename moves a file into
    # a slot that has just been vacated (data~6~ -> data~7~ first, ...).
    for n in range(7, 1, -1):
        if os.access(backup_name(n - 1), os.R_OK):
            os.rename(backup_name(n - 1), backup_name(n))
    if os.access(fn, os.R_OK):
        os.rename(fn, backup_name(1))
    # Binary mode is what pickle expects, and the with-block guarantees the
    # file is flushed and closed before the function returns.
    with open(fn, 'wb') as f:
        pickle.dump(data, f)

def dumpData(filename='goim.dat'):
    """Dump data to file.

    Writes a pretty-printed flat list of alternating names and values to
    ``filename``: for each tracked list its 'cur', 'curtime', 'added',
    'rmvd' and 'times' attributes as a dict, followed by data.vfd,
    data.vfdres and data.cfd as they are.  The text is written UTF-8
    encoded.
    """
    def as_dict(x):
        return {'cur': x.cur, 'curtime': x.curtime, 'added': x.added,
                'rmvd': x.rmvd, 'times': x.times}

    # The order determines the layout of the dump file; keep it stable.
    diff_names = [
        'req', 'rqIm', 'all', 'cat', 'con', 'clean', 'exp', 'tech',
        'vfy', 'merge', 'rewr', 'disp', 'unsrc', 'unref', 'norelref',
        'morefs', 'imp', 'split', 'tone', 'clar', 'orig', 'orph', 'prop',
        'misc', 'fac', 'far', 'farc', 'gac', 'gar', 'pr', 'spr', 'macr',
    ]
    lst = []
    for name in diff_names:
        lst.append(name)
        lst.append(as_dict(getattr(data, name)))
    lst.extend(['vfd', data.vfd, 'vfdres', data.vfdres, 'cfd', data.cfd])

    # pformat is an instance method, so a PrettyPrinter must be created.
    txt = PrettyPrinter().pformat(lst)
    with open(filename, 'wb') as f:
        f.write(txt.encode('utf-8'))

def writeRpim():
    """Write the list of all maths articles to the rpim-data directory.

    In /data/project/jitse-bot/public_html/rpim-data this writes
    'rpim_number' (the number of entries in data.all.cur followed by a
    newline) and files 'rpim000', 'rpim001', ... each holding up to 100
    entries, one per line, rendered with Page.title(asUrl=True).  The
    previous working directory is restored afterwards, even on error.
    """
    cwd = os.getcwd()
    os.chdir('/data/project/jitse-bot/public_html/rpim-data')
    try:
        titles = data.all.cur
        with open('rpim_number', 'w') as f:
            f.write(str(len(titles)) + '\n')
        # Ceiling division: the number of 100-entry files needed.
        for i in range((len(titles) + 99) // 100):
            chunk = titles[i * 100:(i + 1) * 100]
            txt = ''.join(
                pywikibot.Page(site, t).title(asUrl=True) + '\n'
                for t in chunk)
            with open('rpim%03d' % (i,), 'w') as f:
                f.write(txt)
    finally:
        # Later steps use paths relative to the original directory, so it
        # must be restored even if writing fails part-way.
        os.chdir(cwd)


# def helpOleg():
#     """Purge and do empty edits to LoMT (A-C) pages, at Oleg's request."""
#     prefix = 'Wikipedia:WikiProject Mathematics/List of mathematics articles'
#     for s in ['A-C', 'D-F', 'G-I', 'J-L', 'M-O', 'P-R', 'S-U', 'V-Z']:
#         p = pywikibot.Page(site, prefix + ' (' + s + ')')
#         site.getUrl(site.purge_address(p.urlname())) # purge
#         p.put(p.get(), '')              # empty edit
#     print "Info: helpOleg finished"
#     progressReport()

def goimTry(fn):
    """Call fn and print and ignore any exceptions raised during the call.

    ``fn`` is called without arguments and its return value is discarded.
    If it raises, the traceback is reported through sys.excepthook and a
    notice is printed, and the function returns normally so that the
    caller can carry on with the next task.
    """
    try:
        fn()
    except Exception:
        # Exception rather than a bare except, so that Ctrl-C and
        # sys.exit() still stop the bot.
        sys.excepthook(*sys.exc_info())
        print('\nGoIM: Ignoring above exception.\n')


# Main program

if __name__ == "__main__":
    # Command-line handling.  pywikibot.handleArgs() returns the arguments
    # left for this script:
    #   -noload        skip the data-gathering tasks
    #   -nowrite       do not save the data file or run the write-side tasks
    #   -dryrun        both of the above
    #   -summary:TEXT  override the default value of `subject`
    for arg in pywikibot.handleArgs():
        if arg == '-noload':
            print('Info: Noload mode enabled')
            noload = True
        elif arg == '-nowrite':
            print('Info: Nowrite mode enabled')
            nowrite = True
        elif arg == '-dryrun':
            print('Info: Dryrun mode enabled')
            noload = True
            nowrite = True
        elif arg.startswith('-summary:'):
            startpos = len('-summary:')
            subject = arg[startpos:]
        else:
            print('FATAL ERROR: Error parsing argument ' + arg)
            sys.exit(1)

    # # For testing purposes:
    # initProgressReport()
    # readData()
    # transitionData()
    # # ... do the test here
    # # Possible post-processing:
    # # writeData()
    # # goimTry(writeGoim)
    # # goimTry(updateLists)
    # pywikibot.stopme()
    # sys.exit(0)

    initProgressReport()
    readData()
    transitionData()
    if not noload:
        # Tasks run every day.
        goimTry(mathArticles)
        goimTry(mathCategories)
        goimTry(requested)
        goimTry(listedOnVfd)
        goimTry(VfdResults)
        goimTry(listedOnCfd)
        goimTry(prop)
        goimTry(featuredContent)
        # The remaining tasks are spread over a four-day cycle; the
        # trailing comments are the original author's timing notes.
        day = datetime.datetime.now().toordinal()
        if day % 4 == 0:
            goimTry(unref)  # 20'
        if day % 4 == 1:
            goimTry(unsourced)  # 16'
            goimTry(norelref)  # 3'
        if day % 4 == 2:
            goimTry(morerefs)  # 7'
            goimTry(reqImages)  # 1'
            goimTry(vfy)  # 30"
            goimTry(cleanupMiscDay1)  # 9'
        if day % 4 == 3:
            goimTry(cleanup)  # 8'
            goimTry(cleanupMiscDay2)  # 11'
    goimTry(cleanupData)
    if not nowrite:
        writeData()

    goimTry(writeGoim)
    goimTry(updateLists)
    if not nowrite:
        goimTry(dumpData)
        goimTry(updatePortal)
        day = datetime.datetime.now().toordinal()
        # if day % 4 == 3:
        #    goimTry(helpOleg)
        goimTry(updateScript)
        goimTry(writeRpim)
    else:
        dumpData('goim.dat.test')

    pywikibot.stopme()