# User:CobraBot/Code
# -*- coding: utf-8 -*-

import io
import itertools
import re
import warnings
from sys import stdout
from time import sleep

import wikipedia
import pagegenerators

from oclc import isbn2oclc

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}

TEMPLATE_PREFIX = u"Template:"
# The site object must be obtained by *calling* getSite(); the bare function
# object has no namespace() method.
SITE = wikipedia.getSite()


def pagesUsingTemplate(templateName):
    """Return a generator over the pages that transclude a template.

    Parameters:
        * templateName - Name of the template without its namespace prefix,
                         e.g. u"Infobox Book".

    The template's full page name is built from the site's own name for
    namespace 10 (compare TEMPLATE_PREFIX), so it follows the wiki's
    configuration instead of a hard-coded English prefix.
    """
    transclusionPageName = unicode(SITE.namespace(10)) + u":" + templateName
    transclusionPage = wikipedia.Page(SITE, transclusionPageName)
    # onlyTemplateInclusion=True: yield pages that transclude the template,
    # not pages that merely link to it.
    gen = pagegenerators.ReferringPageGenerator(
        transclusionPage, onlyTemplateInclusion=True)
    return gen

try:
    _BAILOUT_BASE = StandardError  # Python 2: keep the original hierarchy
except NameError:
    # StandardError no longer exists on newer interpreters; its parent class
    # Exception is the closest equivalent there.
    _BAILOUT_BASE = Exception


class BailOut(_BAILOUT_BASE):
    """Immediately stop processing the current page"""

class OCLCBot(object):
    """Bot that adds an ``|oclc=`` parameter to book infoboxes.

    For every page transcluding BOOK_INFOBOX the bot locates the infobox,
    reads its ``|isbn=`` value, looks the ISBN up with isbn2oclc() and
    inserts the resulting OCLC number right after the ISBN parameter.
    Pages that cannot be handled are recorded in ``skipped.log``.
    """

    # Edit summary message that should be used.
    EDIT_SUMMARY = u'Adding OCLC# to book infobox based on ISBN (CobraBot; PLEASE report any problems)'
    BOOK_INFOBOX = u"Infobox Book"
    # Every dash variant that may separate the groups of an ISBN.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # End of a template parameter value: the closing braces or the next pipe.
    TERMINATOR = re.compile(u"(}})|\\|")
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)
    OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*"
    ISBN_MIN_LEN = 10
    # Number of pages to skip at the start of a run; each term is the page
    # count of one earlier session.
    SKIP_COUNT = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105

    def __init__(self, debug):
        """Constructor.

        Parameters:
            * debug - If True, doesn't do any real changes, but only shows
                      what would have been changed.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        # Text-mode UTF-8 log: titles and reasons are written as unicode and
        # encoded exactly once, so non-ASCII titles cannot break logging.
        self.log = io.open("skipped.log", 'a', encoding='utf-8')

    def run(self, skip=None):
        """Skip the already-processed pages, then treat every remaining one.

        Parameters:
            * skip - Number of leading pages to skip; defaults to SKIP_COUNT.

        The skip log file is closed when the run ends, even on error.
        """
        if skip is None:
            skip = self.SKIP_COUNT
        try:
            # Set the edit summary message
            wikipedia.setAction(self.EDIT_SUMMARY)
            print("Advancing by %s..." % skip)
            stdout.flush()
            # Consume `skip` items; unlike a bare next() loop this does not
            # raise StopIteration when fewer pages are available.
            next(itertools.islice(self.generator, skip, skip), None)
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()

    def _bail(self, message):
        """Show *message* to the operator and abort the current page.

        Raises BailOut carrying the same message, which treat() logs.
        """
        wikipedia.output(message)
        raise BailOut(message)

    def partition(self, text):
        """Split page text into (prebox, box, postbox) around the infobox.

        The box runs from the infobox opening to the first ``}}`` after it,
        so a template nested inside the infobox truncates the box early.
        Raises BailOut when no infobox (or no closing ``}}``) is found.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._bail(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")

        boxStart = boxmatch.start()
        closeMatch = re.search(u"\\}\\}", text[boxStart:])
        if not closeMatch:
            self._bail(u"SKIPPING: Infobox is never closed")
        boxEnd = boxStart + closeMatch.end()
        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForOclc(self, box):
        """Return *box* with an empty ``|oclc=`` parameter removed.

        Raises BailOut when the parameter is already filled in. A box
        without the parameter is returned unchanged.
        """
        paramMatch = re.search(self.OCLC_PARAM, box)
        if not paramMatch:
            return box
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        value = (rest[:termMatch.start()] if termMatch else rest).strip()
        if value:
            self._bail(u"SKIPPING: oclc param already filled")
        # Drop the empty parameter; a fresh one is inserted after the ISBN.
        return box[:paramMatch.start()] + box[paramMatch.end():]

    def findIsbnVal(self, box):
        """Return (isbnVal, isbnTerm) for the ``|isbn=`` parameter of *box*.

        isbnVal is the raw parameter value; isbnTerm is the index in *box*
        of the terminator that ends it (where the OCLC parameter is later
        inserted). Raises BailOut when there is no ISBN parameter or the
        value holds a piped wikilink.
        """
        paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box)
        if not paramMatch:
            self._bail(u"SKIPPING: No isbn param present")
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        relIsbnTerm = termMatch.start() if termMatch else len(rest)
        isbnVal = rest[:relIsbnTerm]
        # A value cut off at a pipe with an unclosed "[[" means the pipe
        # belonged to a wikilink, not to the template.
        if u"[[" in isbnVal and u"]]" not in isbnVal:
            self._bail(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + relIsbnTerm

    def removeDashes(self, isbn):
        """Return *isbn* with every dash variant from DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, u'')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut when the value starts with "NA" or "N/A"."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._bail(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Strip a redundant leading "ISBN" label from the value."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """Return the first token of *isbnVal*.

        Tokens are delimited by whitespace and by ``<``, ``,``, ``;``,
        ``[`` and ``]``. Raises BailOut when the value has no token at all.
        """
        wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal)
        if not wordMatch:
            self._bail(u"SKIPPING: Malformed ISBN, no usable token (%s)" % isbnVal)
        return wordMatch.group()

    def normalize(self, string):
        """Reduce a title to a canonical form for loose comparison.

        Removes spaces and hyphens, turns "and" into "&", drops common
        punctuation, lower-cases the result and finally removes "the".
        The order of these steps is significant.
        """
        string = string.replace(u' ', u'').replace(u"-", u'').replace(u"and", u"&")
        for mark in (u',', u'.', u"'", u'"', u"’"):
            string = string.replace(mark, u'')
        return string.lower().replace(u"the", u'')

    def _extractIsbn(self, box):
        """Return (isbn, isbnTerm) for *box*, or raise BailOut if unusable."""
        isbnVal, isbnTerm = self.findIsbnVal(box)
        isbnVal = self.removeDashes(isbnVal).strip()
        isbnVal = self.removeExtraISBN(isbnVal)
        self.checkForNA(isbnVal)
        if not isbnVal:  # empty |isbn=
            self._bail(u"SKIPPING: Empty isbn param")
        isbn = self.firstWord(isbnVal)
        print("ISBN#: %s" % isbn)
        if len(isbn) < self.ISBN_MIN_LEN:
            self._bail(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
        if not re.search("[0-9]", isbn):
            self._bail(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        return isbn, isbnTerm

    def treat(self, page, pageIndex):
        """Load the given page, add the OCLC number, and save it.

        Parameters:
            * page      - The page to process.
            * pageIndex - Zero-based position of the page in this run; only
                          used for progress output.

        Nothing is saved in debug mode or when the operator declines.
        """
        print("==================================================================")
        print("PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return
        try:
            # Load the page
            originalText = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(originalText)
            box = self.checkForOclc(box)
            isbn, isbnTerm = self._extractIsbn(box)
        except BailOut as e:
            reason = e.args[0] if e.args else u""
            # Format as one unicode string; concatenating UTF-8 bytes with a
            # unicode reason would fail for non-ASCII page titles.
            self.log.write(u"%s; %s\n" % (page.title(), reason))
            return

        # do lookup
        try:
            oclc, oclcTitle = isbn2oclc(isbn)
        except RuntimeError as e:
            reason = e.args[0] if e.args else u""
            wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % reason)
            return

        # Informational only: show whether the catalogue title matches the
        # article title. A mismatch does not stop the edit; the operator
        # decides at the confirmation prompt.
        print("PAGE TITLE: %s" % page.title())
        wikiCanon = self.normalize(page.title().split(u"(")[0])
        oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        titlesMatch = oclcCanon.startswith(wikiCanon)
        if titlesMatch:
            print("")
            print("--Canonical titles DO MATCH.--")
        else:
            print(wikiCanon)
            print(oclcCanon)

        # Insert the new parameter directly before the ISBN terminator. In
        # debug mode it stays on one line to keep the shown diff compact.
        box = box[:isbnTerm] + "| oclc= " + oclc + (" " if self.debug else "\n") + box[isbnTerm:]
        text = prebox + box + postbox

        # only save if something was changed
        if text == originalText:
            return
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        # show what was changed
        wikipedia.showDiff(originalText, text)
        if self.debug:
            return
        choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        if choice != 'y':
            return
        try:
            # Save the page
            page.put(text)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        else:
            self.editCount += 1

def main():
    """Create the bot and run it with warnings suppressed."""
    DEBUG = False  # set to True to show diffs without saving anything
    bot = OCLCBot(DEBUG)
    # Silence all warnings for the duration of the run; the previous filter
    # state is restored when the block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bot.run()

if __name__ == "__main__":
    try:
        main()
    finally:
        # Always call the framework's stopme(), even if main() raised.
        wikipedia.stopme()