#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Source: User:CobraBot/Code2

import re
import warnings
from contextlib import closing
from json import dump, load
from sys import stdout
from time import sleep

import wikipedia
import pagegenerators

from oclc import isbn2oclc, ParsingProblem, NotInOclc
from congress import isbn2classes, NotInLoc

# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

TEMPLATE_PREFIX = u"Template:"
# NOTE(review): getSite was missing its call parentheses in the transcribed
# source; SITE.namespace(10) below requires a site object, so it must be called.
SITE = wikipedia.getSite()


def pagesUsingTemplate(templateName):
    """Return a generator over all pages transcluding the named template.

    templateName is the template title without its namespace prefix.
    """
    # Build the full title (e.g. u"Template:Infobox Book") using the wiki's
    # localized name for namespace 10, the template namespace.
    transclusionPageName = unicode(SITE.namespace(10)) + u":" + templateName
    transclusionPage = wikipedia.Page(SITE, transclusionPageName)
    gen = pagegenerators.ReferringPageGenerator(transclusionPage,
                                               onlyTemplateInclusion=True)
    return gen

class BailOut(StandardError):
    """Immediately stop processing the current page."""

class AlreadyFilled(BailOut):
    """Field already filled in."""

class CobraBot:
    """Fills |dewey= and |congress= in {{Infobox Book}} from the ISBN."""

    # EDIT_SUMMARY = u'Adding OCLC# to book infobox based on ISBN (CobraBot; PLEASE report any problems)'
    EDIT_SUMMARY = u'Adding Dewey Decimal and/or LCC to book infobox based on ISBN (CobraBot; PLEASE report any problems)'
    BOOK_INFOBOX = u"Infobox Book"
    # ASCII hyphen plus the typographic dash variants that appear in ISBNs.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # A template field ends at the next "|" or at the closing "}}".
    TERMINATOR = re.compile(u"(}})|\\|")
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))",
                               re.IGNORECASE)
    # Matches "|<field>=" with optional surrounding whitespace.
    GENERIC_PARAM = u"\\|[ \t\n]*%s[ \t\n]*=[ \t\n]*"
    # OCLC_PARAM = GENERIC_PARAM % u"oclc"
    DEWEY_PARAM = GENERIC_PARAM % u"dewey"
    LOC_PARAM = GENERIC_PARAM % u"congress"
    ISBN_MIN_LEN = 10
    # Persisted count of pages already processed, so a session can resume.
    OFFSET_FILE = 'N.json'
    # Dewey value LOC reports for fiction; not worth adding to the infobox.
    JUST_FIC = "[Fic]"

def __init__(self, automatic, debug): """       Constructor. Parameters:            * generator - The page generator that determines on which pages                          to work on.            * debug     - If True, doesn't do any real changes, but only shows                          what would have been changed.        """ self.generator = pagesUsingTemplate(self.BOOK_INFOBOX) self.debug = debug self.editCount = 0 self.log = file("skipped.log", 'a') self.log.write("BEGIN NEW SESSION\n") self.automatic = automatic wikipedia.setAction(self.EDIT_SUMMARY)

def run(self): with closing(file(self.OFFSET_FILE, 'r')) as f:           N = load(f) # Set the edit summary message print "Advancing by %s..." % N       stdout.flush for i in xrange(N): next(self.generator) print "Done advancing!" stdout.flush try: for pageIndex, page in enumerate(self.generator): self.treat(page, pageIndex) finally: self.log.close with closing(file(self.OFFSET_FILE, 'w')) as f:               dump(N+pageIndex-5, f)    def runManual(self): index = 0 while True: title = raw_input("Page: ").decode('utf8') page = wikipedia.Page(None, title) self.treat(page, index) index += 1

#########   def partition(self, text): boxmatch = self.INFOBOX_START.search(text) if not boxmatch: wikipedia.output(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive") raise BailOut, "SKIPPING: Page either uses 'Book infobox' alias or is false positive"

boxStart = boxmatch.start boxEnd = boxStart + re.search(u"\\}\\}", text[boxStart:]).end prebox = text[:boxStart] box = text[boxStart:boxEnd] postbox = text[boxEnd:] return prebox, box, postbox def checkForField(self, box, field_regex): paramMatch = re.search(field_regex, box) if paramMatch: #has |oclc= fieldValAndRest = box[paramMatch.end:] fieldTermMatch = self.TERMINATOR.search(fieldValAndRest) value = fieldValAndRest[:fieldTermMatch.start].strip # | oclc = VALUE | if value: #already has |oclc= filled in               wikipedia.output(u"SKIPPING: param already filled") raise AlreadyFilled, "SKIPPING: param already filled" else: #remove the |oclc= # print "REMOVED OCLC:", repr(paramMatch.group) box = box[:paramMatch.start] + box[paramMatch.start+len(paramMatch.group):] # print "NEW BOX:" # print box return box return box def findIsbnVal(self, box): paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box) if not paramMatch: #no ISBN present wikipedia.output(u"SKIPPING: No isbn param present") raise BailOut, "SKIPPING: No isbn param present" isbnValAndRest = box[paramMatch.end:] termMatch = self.TERMINATOR.search(isbnValAndRest) isbnVal = isbnValAndRest[:termMatch.start] relIsbnTerm = self.TERMINATOR.search(isbnValAndRest).start isbnTerm = paramMatch.end + relIsbnTerm isbnFrag = isbnValAndRest[:relIsbnTerm] if  in isbnFrag and  not in isbnFrag: wikipedia.output(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle") raise BailOut, "SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle" return isbnVal, isbnTerm def removeDashes(self, isbn): for dash in self.DASHES: isbn = isbn.replace(dash, '') return isbn

def checkForNA(self, isbn): if re.match(u"N/?A", isbn, re.IGNORECASE): wikipedia.output(u"SKIPPING: ISBN Not/Applicable") raise BailOut, "SKIPPING: ISBN Not/Applicable" def removeExtraISBN(self, isbnVal): match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal) if match: return isbnVal[match.end:] return isbnVal

def firstWord(self, isbnVal): wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal) return wordMatch.group

def normalize(self, string): return string.replace(u' ',u).replace(u"-",u).replace(u"and", u"&").replace(u',', u).replace(u'.', u).replace(u"'", u).replace(u'"', u).replace(u"’", u).lower.replace(u"the", u)

def treat(self, page, pageIndex): """       Loads the given page, does some changes, and saves it.        """ print "==================================================================" print "PAGE TITLE:", page.title print "PAGE#:", pageIndex+1 print "EDIT COUNT:", self.editCount if page.namespace != 0: wikipedia.output(u"SKIPPING: Non-article namespace!") return try: # Load the page text = page.get except wikipedia.NoPage: wikipedia.output(u"Page %s does not exist; skipping." % page.aslink) return except wikipedia.IsRedirectPage: wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink) return

try: prebox, box, postbox = self.partition(text) # print "BOX:" # print box doLOC = True try: box = self.checkForField(box, self.LOC_PARAM) except AlreadyFilled: doLOC = False doDewey = True try: box = self.checkForField(box, self.DEWEY_PARAM) except AlreadyFilled: doDewey = False if not doDewey and not doLOC: return #skip since both filled in

isbnVal, isbnTerm = self.findIsbnVal(box) # print "INITIAL ISBN:", repr(isbnVal) isbnVal = self.removeDashes(isbnVal).strip # print "ISBN SANS DASH:", repr(isbnVal) isbnVal = self.removeExtraISBN(isbnVal) self.checkForNA(isbnVal) # print "ISBN SANS ISBN:", repr(isbnVal) if not isbnVal: #empty |isbn= wikipedia.output(u"SKIPPING: Empty isbn param") raise BailOut, "SKIPPING: Empty isbn param" isbn = self.firstWord(isbnVal) # print "ONE TRUE ISBN:", isbn if not self.automatic: print "ISBN#:", isbn if len(isbn) < self.ISBN_MIN_LEN: wikipedia.output(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn) raise BailOut, ("SKIPPING: Malformed ISBN, too short (%s)" % isbn) if not re.search("[0-9]", isbn): wikipedia.output(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn) raise BailOut, ("SKIPPING: Malformed ISBN, no numbers (%s)" % isbn) #do lookup try: loc, dewey = isbn2classes(isbn) except NotInLoc: wikipedia.output(u"SKIPPED: Given ISBN not in LOC database") raise BailOut, "SKIPPED: Given ISBN not in LOC database" except RuntimeError as e:               wikipedia.output(u"ABORTED: Problem looking up data (%s)" % e.message) raw_input("Enter to continue") raise BailOut, e.message

doDewey &= dewey is not None and dewey != self.JUST_FIC

# try: #    oclc, oclcTitle = isbn2oclc(isbn) # except ParsingProblem: #    wikipedia.output(u"SKIPPED: Problem parsing OCLC response") #    raw_input("Enter to continue") #    raise BailOut, "SKIPPED: Problem parsing OCLC response"

except BailOut as e:           try: self.log.write(page.title.encode('utf8')+"; "+e.message+"\n") except: pass return print "LOC Class:", loc print "Dewey Class:", dewey # print "OCLC#:", oclc # if not self.automatic: #    wikiCanon = self.normalize(page.title.split(u"(")[0])        #     oclcCanon = self.normalize(oclcTitle.split(u":")[0])        #     titlesMatch = oclcCanon.startswith(wikiCanon)        #     if titlesMatch:        #         # print        #         print "--Canonical titles DO MATCH--"        #     else:        #         print "!!Canonical titles DON'T MATCH!!"        #         print "PAGE TITLE:", page.title        #         print "OCLC TITLE:", oclcTitle            #     print wikiCanon            #     print oclcCanon        addition = ""        if doDewey: addition = "| dewey= "+dewey+(" " if self.debug else "\n")        if doLOC: addition += "| loc= "+loc+(" " if self.debug else "\n")        box = box[:isbnTerm] + addition + box[isbnTerm:]        text = prebox + box + postbox

# only save if something was changed if text != page.get: # Show the title of the page we're working on. if not self.automatic: # Highlight the title in purple. wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title) # show what was changed wikipedia.showDiff(page.get, text) if not self.debug: if False: pass elif self.automatic: pass else: choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N') if choice == 'n': return try: # Save the page page.put(text) except wikipedia.LockedPage: wikipedia.output(u"Page %s is locked; skipping." % page.aslink) except wikipedia.EditConflict: wikipedia.output(u'Skipping %s because of edit conflict' % (page.title)) except wikipedia.SpamfilterError, error: wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title, error.url)) else: self.editCount += 1

def main():
    """Entry point: construct the bot and run it over all infobox pages."""
    DEBUG = True  # False
    AUTO = False
    bot = CobraBot(AUTO, DEBUG)
    # Silence the deprecation warnings the pywikipedia framework emits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        # run() was missing its call parentheses in the transcribed source,
        # making the whole script a no-op.
        bot.run()
        # bot.runManual()

if __name__ == "__main__":
    try:
        main()
    finally:
        # Always release the pywikipedia throttle lock, even on error.
        wikipedia.stopme()