User:LemmeyBOT/whoipedia

        ),
        # templates with parameters often have whitespace that is used to
        # improve wiki source code readability.
        'template':    re.compile(r'(?s){{(({{.*?}})?.*?)*}}'),
    }

    # if we got a string, compile it as a regular expression
    if type(old) == type('') or type(old) == type(u''):
        if caseInsensitive:
            old = re.compile(old, re.IGNORECASE | re.UNICODE)
        else:
            old = re.compile(old)
    #noTouch = '|'.join([exceptions[name] for name in exceptList])
    #noTouchR = re.compile(noTouch)
    dontTouchRegexes = [exceptionRegexes[name] for name in exceptions]
    # How much of the text we have looked at so far
    index = 0
    markerpos = len(text)
    while True:
        match = old.search(text, index)
        if not match:
            # nothing left to replace
            break

        # check which exception will occur next.
        nextExceptionMatch = None
        for dontTouchR in dontTouchRegexes:
            excMatch = dontTouchR.search(text, index)
            if excMatch and (
                    nextExceptionMatch is None or
                    excMatch.start() < nextExceptionMatch.start()):
                nextExceptionMatch = excMatch

        if nextExceptionMatch is not None and \
                nextExceptionMatch.start() <= match.start():
            # an HTML comment or text in nowiki tags stands before the next
            # valid match. Skip.
            index = nextExceptionMatch.end()
        else:
            # We found a valid match. Replace it.

            # We cannot just insert the new string, as it may contain regex
            # group references such as \2 or \g<name>.
            # On the other hand, this approach does not work because it
            # can't handle lookahead or lookbehind (see bug #1731008):
            #replacement = old.sub(new, text[match.start():match.end()])
            #text = text[:match.start()] + replacement + text[match.end():]

            # So we have to process the group references manually.
            replacement = new

            groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')
            while True:
                groupMatch = groupR.search(replacement)
                if not groupMatch:
                    break
                groupID = groupMatch.group('name') or \
                          int(groupMatch.group('number'))
                replacement = replacement[:groupMatch.start()] + \
                              match.group(groupID) + \
                              replacement[groupMatch.end():]
            text = text[:match.start()] + replacement + text[match.end():]

            # continue the search on the remaining text
            if allowoverlap:
                index = match.start() + 1
            else:
                index = match.start() + len(replacement)
            markerpos = match.start() + len(replacement)
    text = text[:markerpos] + marker + text[markerpos:]
    return text
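# Hedged doctest-style sketch of replaceExcept (input strings are
# hypothetical; the expected output follows from the group-reference loop
# above, which resolves \1 manually, and from the 'nowiki' exception):
# >>> sample = u'See [[Foo]] and <nowiki>[[Foo]]</nowiki>.'
# >>> replaceExcept(sample, r'\[\[(Foo)\]\]', r'[[\1|bar]]', ['nowiki'])
# u'See [[Foo|bar]] and <nowiki>[[Foo]]</nowiki>.'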

def removeDisabledParts(text):
    """
    Removes those parts of a wiki text where wiki markup is disabled, i.e.
    * HTML comments
    * nowiki tags
    * includeonly tags
    """
    toRemoveR = re.compile(r'<!--.*?-->|<nowiki>.*?</nowiki>|<includeonly>.*?</includeonly>',
                           re.IGNORECASE | re.DOTALL)
    return toRemoveR.sub('', text)
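# Hedged doctest-style sketch (hypothetical input; the expected output
# follows from the docstring above):
# >>> removeDisabledParts(u'a <!-- x --> b <nowiki>[[c]]</nowiki> d')
# u'a  b  d'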


# Part of library dealing with interwiki links

def getLanguageLinks(text, insite = None, pageLink = "[[]]"):
    """
    Returns a dictionary with language codes as keys and Page objects as
    values for each interwiki link found in the text. Do not call this
    routine directly, use Page objects instead."""
    if insite == None:
        insite = getSite()
    result = {}
    # Ignore interwiki links within nowiki tags, includeonly tags, and
    # HTML comments
    text = removeDisabledParts(text)

    # This regular expression will find every link that is possibly an
    # interwiki link.
    # NOTE: language codes are case-insensitive and only consist of basic
    # latin letters and hyphens.
    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    for lang, pagetitle in interwikiR.findall(text):
        lang = lang.lower()
        # Check if it really is in fact an interwiki link to a known
        # language, or if it's e.g. a category tag or an internal link
        if lang in insite.family.obsolete:
            lang = insite.family.obsolete[lang]
        if lang in insite.validLanguageLinks():
            if '|' in pagetitle:
                # ignore text after the pipe
                pagetitle = pagetitle[:pagetitle.index('|')]
            if not pagetitle:
                output(u"ERROR: %s - ignoring impossible link to %s:%s"
                       % (pageLink, lang, pagetitle))
            else:
                # we want the actual page objects rather than the titles
                site = insite.getSite(code = lang)
                result[site] = Page(site, pagetitle, insite = insite)
    return result
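# Hedged sketch (assumes a configured insite so that getSite() resolves
# 'de' and 'fr'; the input text is hypothetical):
# >>> getLanguageLinks(u'Text. [[de:Titel]] [[fr:Titre]]')
# a dict mapping the de: and fr: Site objects to the linked Page objects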

def removeLanguageLinks(text, site = None, marker = ''):
    """Given the wiki-text of a page, return that page with all interwiki
       links removed. If a link to an unknown language is encountered, a
       warning is printed. If a marker is defined, the marker is placed at
       the location of the last occurrence of an interwiki link (at the
       end if there are no interwiki links)."""
    if site == None:
        site = getSite()
    if not site.validLanguageLinks():
        return text
    # This regular expression will find every interwiki link, plus
    # trailing whitespace.
    languageR = '|'.join(site.validLanguageLinks())
    interwikiR = re.compile(r'\[\[(%s)\s?:[^\]]*\]\][\s]*' % languageR,
                            re.IGNORECASE)
    text = replaceExcept(text, interwikiR, '',
                         ['nowiki', 'comment', 'math', 'pre'],
                         marker = marker)
    return normalWhitespace(text)

def replaceLanguageLinks(oldtext, new, site = None):
    """Replace the interwiki language links given in the wikitext given
       in oldtext by the new links given in new.

       'new' should be a dictionary with the language names as keys, and
       Page objects as values.
    """
    # Find a marker that is not already in the text.
    marker = '@@'
    while marker in oldtext:
        marker += '@'
    if site == None:
        site = getSite()
    s = interwikiFormat(new, insite = site)
    s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
    if s:
        if site.language() in site.family.interwiki_attop:
            newtext = s + site.family.interwiki_text_separator + \
                      s2.replace(marker, '').strip()
        else:
            # calculate what was after the language links on the page
            firstafter = s2.find(marker) + len(marker)
            # Is there any text in the 'after' part that means we should
            # keep it after?
            if "</noinclude>" in s2[firstafter:]:
                newtext = s2[:firstafter] + s + s2[firstafter:]
            elif site.language() in site.family.categories_last:
                cats = getCategoryLinks(s2, site = site)
                s2 = removeCategoryLinks(s2.replace(marker, '').strip(),
                                         site) + \
                     site.family.interwiki_text_separator + s
                newtext = replaceCategoryLinks(s2, cats, site = site)
            else:
                newtext = s2.replace(marker, '').strip() + \
                          site.family.interwiki_text_separator + s
            newtext = newtext.replace(marker, '')
    else:
        newtext = s2.replace(marker, '')
    return newtext
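# Why the marker dance above: removeLanguageLinks() drops every interwiki
# link and leaves `marker` where the last one stood, so the caller knows
# where to splice the new links back in. Hedged doctest-style sketch of
# just the marker-finding loop (input is hypothetical):
# >>> oldtext = u'stuff @@ more'
# >>> marker = '@@'
# >>> while marker in oldtext: marker += '@'
# >>> marker
# '@@@'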

def interwikiFormat(links, insite = None):
    """Create a suitable string encoding all interwiki links for a
       wikipedia page.

       'links' should be a dictionary with the language codes as keys, and
       Page objects as values.

       The string is formatted for inclusion in insite (defaulting to your
       own site).
    """
    if insite is None:
        insite = getSite()
    if not links:
        return ''
    # Security check: site may not refer to itself.
    #
    # Disabled because MediaWiki was changed so that such links appear like
    # normal links, and some people accidentally use them for normal links.
    # While such links are bad style, they are not worth crashing the bot.
    #
    #for pl in links.values():
    #    if pl.site == insite:
    #        raise ValueError("Trying to add interwiki link to self")
    s = []
    ar = links.keys()
    ar.sort()
    putfirst = insite.interwiki_putfirst()
    if putfirst:
        # In this case I might have to change the order
        ar2 = []
        for code in putfirst:
            # The code may not exist in this family?
            if code in getSite().validLanguageLinks():
                site = insite.getSite(code = code)
                if site in ar:
                    del ar[ar.index(site)]
                    ar2 = ar2 + [site]
        ar = ar2 + ar
    if insite.interwiki_putfirst_doubled(ar):
        ar = insite.interwiki_putfirst_doubled(ar) + ar
    for site in ar:
        try:
            link = links[site].aslink(forceInterwiki = True)
            s.append(link)
        except AttributeError:
            s.append(site.linkto(links[site], othersite = insite))
    if insite.lang in insite.family.interwiki_on_one_line:
        sep = ' '
    else:
        sep = '\r\n'
    s = sep.join(s) + '\r\n'
    return s

def normalWhitespace(text):
    # Remove white space at the beginning
    while 1:
        if text and text.startswith('\r\n'):
            text = text[2:]
        elif text and text.startswith(' '):
            # This assumes that the first line NEVER starts with a space!
            text = text[1:]
        else:
            break
    # Remove white space at the end
    while 1:
        if text and text[-1:] in '\r\n \t':
            text = text[:-1]
        else:
            break
    return text


# Categories

def getCategoryLinks(text, site):
    """Returns a list of category links found in the text, as Category
       objects. Do not call this routine directly, use Page objects
       instead."""
    import catlib
    result = []
    # Ignore category links within nowiki tags, includeonly tags, and
    # HTML comments
    text = removeDisabledParts(text)
    catNamespace = '|'.join(site.category_namespaces())
    R = re.compile(r'\[\[\s*(?P<namespace>%s)\s*:\s*(?P<catName>.+?)(?:\|(?P<sortKey>.+?))?\s*\]\]'
                   % catNamespace)
    for match in R.finditer(text):
        cat = catlib.Category(site,
                              '%s:%s' % (match.group('namespace'),
                                         match.group('catName')),
                              sortKey = match.group('sortKey'))
        result.append(cat)
    return result

def removeCategoryLinks(text, site, marker = ''):
    """Given the wiki-text of a page, return that page with all category
       links removed. Puts the marker after the last replacement (at the
       end of the text if there is no replacement)."""
    # This regular expression will find every category link, plus trailing
    # whitespace. The category namespace name is grouped.
    catNamespace = '|'.join(site.category_namespaces())
    categoryR = re.compile(r'\[\[\s*(%s)\s*:.*?\]\][\s]*' % catNamespace)
    text = replaceExcept(text, categoryR, '',
                         ['nowiki', 'comment', 'math', 'pre'],
                         marker = marker)
    return normalWhitespace(text)

def replaceCategoryInPlace(oldtext, oldcat, newcat, site = None):
    """Replaces the category oldcat with the category newcat and then
       returns the modified wiki source.
    """
    # Note that this doesn't work yet and it has some very strange
    # side-effects.

    if site is None:
        site = getSite()

    catNamespace = '|'.join(site.category_namespaces())
    categoryR = re.compile(r'\[\[\s*(%s)\s*:%s\]\]'
                           % (catNamespace,
                              oldcat.titleWithoutNamespace()))
    text = replaceExcept(oldtext, categoryR,
                         '[[Category:%s]]' % newcat.titleWithoutNamespace(),
                         ['nowiki', 'comment', 'math', 'pre'])
    categoryR = re.compile(r'\[\[\s*(%s)\s*:%s\]\]'
                           % (catNamespace,
                              oldcat.titleWithoutNamespace().replace(' ', '_')))
    text = replaceExcept(text, categoryR,
                         '[[Category:%s]]' % newcat.titleWithoutNamespace(),
                         ['nowiki', 'comment', 'math', 'pre'])
    return text

def replaceCategoryLinks(oldtext, new, site = None):
    """Replace the category links given in the wikitext given
       in oldtext by the new links given in new.

       'new' should be a list of Category objects.
    """

    # Find a marker that is not already in the text.
    marker = '@@'
    while marker in oldtext:
        marker += '@'

    if site is None:
        site = getSite()
    if site.sitename() == 'wikipedia:de':
        raise Error(
            'The PyWikipediaBot is no longer allowed to touch categories on the German Wikipedia. See http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv2#Position_der_Personendaten_am_.22Artikelende.22')

    s = categoryFormat(new, insite = site)
    s2 = removeCategoryLinks(oldtext, site = site, marker = marker)

    if s:
        if site.language() in site.family.category_attop:
            newtext = s + site.family.category_text_separator + s2
        else:
            # calculate what was after the category links on the page
            firstafter = s2.find(marker)
            # Is there any text in the 'after' part that means we should
            # keep it after?
            if "</noinclude>" in s2[firstafter:]:
                newtext = s2[:firstafter] + s + s2[firstafter:]
            elif site.language() in site.family.categories_last:
                newtext = s2.replace(marker, '').strip() + \
                          site.family.category_text_separator + s
            else:
                interwiki = getLanguageLinks(s2)
                s2 = removeLanguageLinks(s2.replace(marker, '').strip(),
                                         site) + \
                     site.family.category_text_separator + s
                newtext = replaceLanguageLinks(s2, interwiki, site)
        newtext = newtext.replace(marker, '')
    else:
        s2 = s2.replace(marker, '')
        return s2
    return newtext

def categoryFormat(categories, insite = None):
    """Create a suitable string with all category links for a wiki
       page.

       'categories' should be a list of Category objects.

       The string is formatted for inclusion in insite.
    """
    if not categories:
        return ''
    if insite is None:
        insite = getSite()
    catLinks = [category.aslink() for category in categories]
    if insite.category_on_one_line():
        sep = ' '
    else:
        sep = '\r\n'
    # Some people don't like the categories sorted
    #catLinks.sort()
    return sep.join(catLinks) + '\r\n'


# End of category-specific code

def url2link(percentname, insite, site):
    """Convert a url-name of a page into a proper name for an interwiki
       link. The argument 'insite' specifies the target wiki.
       """
    percentname = percentname.replace('_', ' ')
    x = url2unicode(percentname, site = site)
    return unicode2html(x, insite.encoding())

def resolveEsperantoXConvention(text):

"""   Resolves the x convention used to encode Esperanto special characters,    e.g. Cxefpagxo and CXefpagXo will both be converted to Ĉefpaĝo.    Note that to encode non-Esperanto words like Bordeaux, one uses a    double x, i.e. Bordeauxx or BordeauxX.    """ chars = { u'c': u'ĉ', u'C': u'Ĉ', u'g': u'ĝ', u'G': u'Ĝ', u'h': u'ĥ', u'H': u'Ĥ', u'j': u'ĵ', u'J': u'Ĵ', u's': u'ŝ', u'S': u'Ŝ', u'u': u'ŭ', u'U': u'Ŭ', }   for latin, esperanto in chars.iteritems: # A regular expression that matches a letter combination which IS       # encoded using x-convention. xConvR = re.compile(latin + '[xX]+') pos = 0 result = '' # Each matching substring will be regarded exactly once. while True: match = xConvR.search(text[pos:]) if match: old = match.group if len(old) % 2 == 0: # The first two chars represent an Esperanto letter. # Following x's are doubled. new = esperanto + ''.join([old[2 * i] for i in range(1, len(old)/2)]) else: # The first character stays latin; only the x's are doubled. new = latin + ''.join([old[2 * i + 1] for i in range(0, len(old)/2)]) result += text[pos : match.start + pos] + new pos += match.start + len(old) else: result += text[pos:] text = result break return text

def doubleXForEsperanto(text):
    """
    Doubles X-es where necessary so that we can submit a page to an
    Esperanto wiki. Again, we have to keep stupid stuff like cXxXxxX in
    mind. Maybe someone wants to write about the Sony Cyber-shot DSC-Uxx
    camera series on eo: ;)
    """
    # A regular expression that matches a letter combination which is NOT
    # encoded in x-convention.
    notXConvR = re.compile('[cghjsuCGHJSU][xX]+')
    pos = 0
    result = ''
    while True:
        match = notXConvR.search(text[pos:])
        if match:
            old = match.group()
            # the first letter stays; add an x after each X or x.
            new = old[0] + ''.join([old[i] + 'x' for i in range(1, len(old))])
            result += text[pos : match.start() + pos] + new
            pos += match.start() + len(old)
        else:
            result += text[pos:]
            text = result
            break
    return text
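# Hedged doctest-style sketch of the x-convention round trip implemented
# by the two functions above (inputs are hypothetical; outputs follow from
# the docstrings):
# >>> resolveEsperantoXConvention(u'Cxefpagxo')
# u'\u0108efpa\u011do'    # Ĉefpaĝo
# >>> resolveEsperantoXConvention(u'Bordeauxx')
# u'Bordeaux'             # the double x escapes the conversion
# >>> doubleXForEsperanto(u'Bordeaux')
# u'Bordeauxx'            # ready to submit to eo: again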

def sectionencode(text, encoding):
    # change the text so that it can be used as a section title in
    # wiki-links
    return urllib.quote(text.replace(" ", "_").encode(encoding)).replace("%", ".")
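# Hedged doctest-style sketch: MediaWiki section anchors percent-encode
# the title and then use '.' instead of '%' (inputs are hypothetical):
# >>> sectionencode(u'Foo bar', 'utf-8')
# 'Foo_bar'
# >>> sectionencode(u'C\xe9', 'utf-8')
# 'C.C3.A9'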


# Unicode library functions

def UnicodeToAsciiHtml(s):
    html = []
    for c in s:
        cord = ord(c)
        if cord < 128:
            html.append(c)
        else:
            html.append('&#%d;' % cord)
    return ''.join(html)

def url2unicode(title, site, site2 = None):
    # create a list of all possible encodings for both hint sites
    encList = [site.encoding()] + list(site.encodings())
    if site2 and site2 <> site:
        encList.append(site2.encoding())
        encList += list(site2.encodings())
    firstException = None
    # try to handle all encodings (will probably retry utf-8)
    for enc in encList:
        try:
            t = title.encode(enc)
            t = urllib.unquote(t)
            return unicode(t, enc)
        except UnicodeError, ex:
            if not firstException:
                firstException = ex
            pass
    # Couldn't convert, raise the original exception
    raise firstException

def unicode2html(x, encoding):
    """
    We have a unicode string. We can attempt to encode it into the desired
    format, and if that doesn't work, we encode the unicode into HTML
    entities. If it does work, we return it unchanged.
    """
    try:
        x.encode(encoding)
    except UnicodeError:
        x = UnicodeToAsciiHtml(x)
    return x

def html2unicode(text, ignore = []):
    """
    Given a string, replaces all HTML entities by the equivalent unicode
    characters.
    """
    # This regular expression will match any decimal and hexadecimal
    # entity and also entities that might be named entities.
    entityR = re.compile(
        r'&(#(?P<decimal>\d+)|#x(?P<hex>[0-9a-fA-F]+)|(?P<name>[A-Za-z]+));')
    result = u''
    i = 0
    found = True
    while found:
        text = text[i:]
        match = entityR.search(text)
        if match:
            unicodeCodepoint = None
            if match.group('decimal'):
                unicodeCodepoint = int(match.group('decimal'))
            elif match.group('hex'):
                unicodeCodepoint = int(match.group('hex'), 16)
            elif match.group('name'):
                name = match.group('name')
                if htmlentitydefs.name2codepoint.has_key(name):
                    # We found a known HTML entity.
                    unicodeCodepoint = htmlentitydefs.name2codepoint[name]
            result += text[:match.start()]
            if unicodeCodepoint and unicodeCodepoint not in ignore and \
                    (WIDEBUILD or unicodeCodepoint < 65534):
                result += unichr(unicodeCodepoint)
            else:
                # Leave the entity unchanged
                result += text[match.start():match.end()]
            i = match.end()
        else:
            result += text
            found = False
    return result
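# Hedged doctest-style sketch of the entity decoding above (inputs are
# hypothetical; named, decimal and hex entities all map to the same
# codepoint, and ignored codepoints are left as entities):
# >>> html2unicode(u'caf&eacute; &#233; &#xE9;')
# u'caf\xe9 \xe9 \xe9'
# >>> html2unicode(u'&#233;', ignore = [233])
# u'&#233;'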

def Family(fam = None, fatal = True):
    """
    Import the named family.
    If fatal is true, the bot will stop running when the given family is
    unknown. If fatal is false, it will only raise a ValueError exception.
    """
    if fam == None:
        fam = config.family
    try:
        # search for family module in the 'families' subdirectory
        import wikipediatools as _wt
        sys.path.append(_wt.absoluteFilename('families'))
        exec "import %s_family as myfamily" % fam
    except ImportError:
        if fatal:
            output(u"Error importing the %s family. This probably means the family does not exist. Also check your configuration file." % fam)
            import traceback
            traceback.print_stack()
            sys.exit(1)
        else:
            raise ValueError("Family does not exist")
    return myfamily.Family()

class Site(object):
    def __init__(self, code, fam=None, user=None):
        """Constructor takes three arguments:

        code   language code for Site
        fam    Wikimedia family (optional: defaults to configured).
               Can either be a string or a Family object.
        user   User to use (optional: defaults to configured)"""

        self.lang = code.lower()
        if isinstance(fam, basestring) or fam is None:
            self.family = Family(fam, fatal = False)
        else:
            self.family = fam
        if self.lang not in self.languages():
            raise KeyError("Language %s does not exist in family %s"
                           % (self.lang, self.family.name))

        # if we got an outdated language code, use the new one instead.
        if self.lang in self.family.obsolete and \
           self.family.obsolete[self.lang]:
            self.lang = self.family.obsolete[self.lang]

        self.messages = False
        self._mediawiki_messages = {}
        self.nocapitalize = self.lang in self.family.nocapitalize
        self.user = user
        self._token = None
        self._sysoptoken = None
        #self.loginStatusKnown = {}
        #self._loggedInAs = None
        self.loginStatusKnown = True
        self._loggedInAs = 'Larry'
        self.userGroups = []
        # Calculating valid languages took quite long, so we calculate it
        # once in initialization instead of each time it is used.
        self._validlanguages = []
        for language in self.languages():
            if not language[0].upper() + language[1:] in self.namespaces():
                self._validlanguages.append(language)
        self.sandboxpage = Page(self, self.family.sandboxpage(code))

    def urlEncode(self, query):
        """This can encode a query so that it can be sent as a query using
           a HTTP POST request"""
        if not query:
            return None
        l = []
        for key, value in query.iteritems():
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            key = urllib.quote(key)
            value = urllib.quote(value)
            l.append(key + '=' + value)
        return '&'.join(l)
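    # Hedged doctest-style sketch (a hypothetical Site instance `site` is
    # assumed; key order depends on dict iteration and is not guaranteed):
    # >>> site.urlEncode({'title': u'Caf\xe9', 'action': 'edit'})
    # 'title=Caf%C3%A9&action=edit'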

    def postForm(self, address, predata, sysop = False, useCookie=True):
        """
        Posts the given form data to the given address at this site.
        address is the absolute path without hostname.
        predata is a list of key-value tuples.
        Returns a (response, data) tuple where response is the HTTP
        response object and data is a Unicode string containing the
        body of the response.
        """
        data = self.urlEncode(predata)
        return self.postData(address, data, sysop = sysop,
                             useCookie = useCookie)

    def postData(self, address, data,
                 contentType = 'application/x-www-form-urlencoded',
                 sysop = False, useCookie=True):
        """
        Posts the given data to the given address at this site.
        address is the absolute path without hostname.
        data is an ASCII string. (or isn't it?)
        Returns a (response, data) tuple where response is the HTTP
        response object and data is a Unicode string containing the
        body of the response.
        """

        # TODO: add the authenticate stuff here

        # Encode all of this into a HTTP request
        conn = httplib.HTTPConnection(self.hostname())

        conn.putrequest('POST', address)
        conn.putheader('Content-Length', str(len(data)))
        conn.putheader('Content-type', contentType)
        conn.putheader('User-agent', useragent)
        if useCookie and self.cookies(sysop = sysop):
            conn.putheader('Cookie', self.cookies(sysop = sysop))
        conn.endheaders()
        conn.send(data)

        # Prepare the return values
        # Note that this can raise network exceptions which are not
        # caught here.
        response = conn.getresponse()
        data = response.read().decode(self.encoding())
        conn.close()
        return response, data

    def forceLogin(self, sysop = False):
        self.loginStatusKnown = True
        self._loggedInAs = 'Larry' #loginMan.username
#        if not self.loggedInAs(sysop = sysop):
#            loginMan = login.LoginManager(site = self, sysop = sysop)
#            if loginMan.login(retry = True):
#                self.loginStatusKnown = True
#                self._loggedInAs = loginMan.username

    def loggedInAs(self, sysop = False):
        """
        Checks if we're logged in by loading a page and looking for the
        login link. We assume that we're not being logged out during a bot
        run, so loading the test page is only required once.

        If logged in, returns the username. Otherwise, returns None.
        """
        self._loadCookies(sysop = sysop)
        if not self.loginStatusKnown:
            output(u'Getting a page to check if we\'re logged in on %s'
                   % self)
            path = self.put_address('Non-existing_page')
            text = self.getUrl(path, sysop = sysop)
            # Search for the "my talk" link at the top
            mytalkR = re.compile(
                '<li id="pt-mytalk"><a href=".+?">(?P<username>.+?)</a></li>')
            m = mytalkR.search(text)
            if m:
                self.loginStatusKnown = True
                self._loggedInAs = m.group('username')
                # While we're at it, check if we have got unread messages
                if '<div class="usermessage">' in text:
                    output(u'NOTE: You have unread messages on %s' % self)
                    messages = True
                else:
                    messages = False
                # Check whether we found a token
                Rwatch = re.compile(
                    r"\<input type='hidden' value=\"(.*?)\" name=\"wpEditToken\"")
                tokenloc = Rwatch.search(text)
                if tokenloc:
                    self.putToken(tokenloc.group(1), sysop = sysop)
        return self._loggedInAs

    def cookies(self, sysop = False):
        # TODO: cookie caching is disabled
        #if not hasattr(self, '_cookies'):
        self._loadCookies(sysop = sysop)
        return self._cookies

    def _loadCookies(self, sysop = False):
        """Retrieve session cookies for login"""
        try:
            if sysop:
                try:
                    username = config.sysopnames[self.family.name][self.lang]
                except KeyError:
                    self._cookies = None
                    self.loginStatusKnown = True
                    #raise NoUsername('You tried to perform an action that requires admin privileges, but you haven\'t entered your sysop name in your user-config.py. Please add sysopnames[\'%s\'][\'%s\']=\'name\' to your user-config.py' % (self.family.name, self.lang))
            else:
                username = config.usernames[self.family.name][self.lang]
        except KeyError:
            self._cookies = None
            self.loginStatusKnown = True
        else:
            import wikipediatools as _wt
            username = config.usernames[self.family.name][self.lang]
            tmp = '%s-%s-%s-login.data' % (self.family.name, self.lang,
                                           username)
            fn = _wt.absoluteFilename('login-data', tmp)
            if not os.path.exists(fn):
                self._cookies = None
                self.loginStatusKnown = True
            else:
                f = open(fn)
                self._cookies = '; '.join([x.strip() for x in f.readlines()])
                f.close()

    r_userGroups = re.compile(ur'var wgUserGroups \= (.*)\;')

    def getUrl(self, path, retry = True, sysop = False, data = None,
               compress = True):
        """
        Low-level routine to get a URL from the wiki.

        Parameters:
            path   - The absolute path, without the hostname.
            retry  - If True, retries loading the page when a network
                     error occurs.
            sysop  - If True, the sysop account's cookie will be used.
            data   - An optional dict providing extra post request
                     parameters.

        Returns the HTML text of the page converted to unicode.
        """
        if self.hostname() in config.authenticate.keys():
            uo = authenticateURLopener
        else:
            uo = MyURLopener()
            if self.cookies(sysop = sysop):
                uo.addheader('Cookie', self.cookies(sysop = sysop))
            if compress:
                uo.addheader('Accept-encoding', 'gzip')

        url = 'http://%s%s' % (self.hostname(), path)
        data = self.urlEncode(data)

        # Try to retrieve the page until it was successfully loaded (just
        # in case the server is down or overloaded).
        # Wait for retry_idle_time minutes (growing!) between retries.
        retry_idle_time = 1
        retrieved = False
        while not retrieved:
            try:
                if self.hostname() in config.authenticate.keys():
                    if compress:
                        request = urllib2.Request(url, data)
                        request.add_header('Accept-encoding', 'gzip')
                        opener = urllib2.build_opener()
                        f = opener.open(request)
                    else:
                        f = urllib2.urlopen(url, data)
                else:
                    f = uo.open(url, data)
                retrieved = True
            except KeyboardInterrupt:
                raise
            except Exception, e:
                if retry:
                    # We assume that the server is down. Wait some time,
                    # then try again.
                    output(u"%s" % e)
                    output(u"WARNING: Could not open 'http://%s%s'. Maybe the server or your connection is down. Retrying in %i minutes..."
                           % (self.hostname(), path, retry_idle_time))
                    time.sleep(retry_idle_time * 60)
                    # Next time wait longer, but not longer than half an
                    # hour
                    retry_idle_time *= 2
                    if retry_idle_time > 30:
                        retry_idle_time = 30
                else:
                    raise
        text = f.read()
        if compress and f.headers.get('Content-Encoding') == 'gzip':
            import StringIO, gzip
            compressedstream = StringIO.StringIO(text)
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            text = gzipper.read()
        # Find charset in the content-type meta tag
        contentType = f.info()['Content-Type']
        R = re.compile('charset=([^\'\";]+)')
        m = R.search(contentType)
        if m:
            charset = m.group(1)
        else:
            output(u"WARNING: No character set found.")
            # UTF-8 as default
            charset = 'utf-8'
        # Check if this is the charset we expected
        self.checkCharset(charset)
        # Convert HTML to Unicode
        try:
            text = unicode(text, charset, errors = 'strict')
        except UnicodeDecodeError, e:
            print e
            output(u'ERROR: Invalid characters found on http://%s%s, replaced by \\ufffd.'
                   % (self.hostname(), path))
            # We use error='replace' in case of bad encoding.
            text = unicode(text, charset, errors = 'replace')

        # Try and see whether we can extract the user groups
        match = self.r_userGroups.search(text)
        if match:
            self.userGroups = []
            if match.group(1) != 'null':
                uG = match.group(1)[1:-1].split(', ')
                for group in uG:
                    if group.strip('"') != '*':
                        self.userGroups.append(group.strip('"'))

        return text

    def mediawiki_message(self, key):
        """Return the MediaWiki message text for key "key" """
        global mwpage, tree
        if key not in self._mediawiki_messages.keys() \
           and not hasattr(self, "_phploaded"):
            retry_idle_time = 1
            while True:
                get_throttle()
                mwpage = self.getUrl("%s?title=%s:%s&action=edit"
                                     % (self.path(), urllib.quote(
                                         self.namespace(8).replace(' ', '_').encode(
                                             self.encoding())), key))
                tree = BeautifulSoup(mwpage,
                                     convertEntities=BeautifulSoup.HTML_ENTITIES,
                                     parseOnlyThese=SoupStrainer("textarea"))
                if tree.textarea is None:
                    # We assume that the server is down.
                    # Wait some time, then try again.
                    output(u"""WARNING: No text area found on %s%s?title=MediaWiki:%s&action=edit. Maybe the server is down. Retrying in %i minutes..."""
                           % (self.hostname(), self.path(), key,
                              retry_idle_time))
                    time.sleep(retry_idle_time * 60)
                    # Next time wait longer, but not longer than half an
                    # hour
                    retry_idle_time *= 2
                    if retry_idle_time > 30:
                        retry_idle_time = 30
                    continue
                break
            value = tree.textarea.string.strip()
            if value:
                self._mediawiki_messages[key] = value
            else:
                self._mediawiki_messages[key] = None
                # Fallback in case the MediaWiki: page method doesn't work
                if verbose:
                    output(u"Retrieving mediawiki messages from Special:Allmessages")
                get_throttle()
                phppage = self.getUrl(self.get_address("Special:Allmessages")
                                      + "&ot=php")
                Rphpvals = re.compile(r"(?ms)'([^']*)' =&gt; '(.*?[^\\])',")
                for (phpkey, phpval) in Rphpvals.findall(phppage):
                    self._mediawiki_messages[str(phpkey)] = phpval
                self._phploaded = True

        if self._mediawiki_messages[key] is None:
            raise KeyError("MediaWiki key '%s' does not exist on %s"
                           % (key, self))
        return self._mediawiki_messages[key]

    def has_mediawiki_message(self, key):
        """Return True iff this site defines a MediaWiki message for key
           "key" """
        try:
            v = self.mediawiki_message(key)
            return True
        except KeyError:
            return False

    # TODO: avoid code duplication for the following methods
    def newpages(self, number = 10, get_redirect = False, repeat = False):
        """Generator which yields new articles subsequently.
           It starts with the article created 'number' articles
           ago (first argument). When these are all yielded
           and repeat is True,
           it fetches NewPages again. If there is no new page,
           it blocks until there is one, sleeping between subsequent
           fetches of NewPages.

           The objects yielded are (page, date, length, loggedIn,
           username, comment) tuples, where date is the creation date,
           length the article length in bytes, and username the creating
           user.

"""       # The throttling is important here, so always enabled.        if repeat:            throttle = True        seen = set        while True:            path = self.newpages_address(n=number)            get_throttle            html = self.getUrl(path)            entryR = re.compile(']*>(?P .+?) \S*?.+?.+?[\(\[](?P \d+)[^\)\]]*[\)\]] .?')            for m in entryR.finditer(html):                date = m.group('date')                title = m.group('title')                title = title.replace('&quot;', '"')                length = int(m.group('length'))                loggedIn = u                username = m.group('username')                comment = u

if title not in seen: seen.add(title) page = Page(self, title) yield page, date, length, loggedIn, username, comment

if not repeat: break

    def longpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.longpages_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile(ur'\(hist\) ‎<a href=".+?" title="(?P<title>.+?)">.+?</a> ‎\[(?P<length>\d+)(.+?)\]')
            for m in entryR.finditer(html):
                title = m.group('title')
                length = int(m.group('length'))

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page, length
            if not repeat:
                break

    def shortpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.shortpages_address(n = number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile(ur'<li>\(<a href=".+?" title=".+?">hist</a>\) ‎<a href=".+?" title="(?P<title>.+?)">.+?</a> ‎\[(?P<length>\d+)(.+?)\]</li>')
            for m in entryR.finditer(html):
                title = m.group('title')
                length = int(m.group('length'))

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page, length
            if not repeat:
                break

    def categories(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.categories_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def deadendpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.deadendpages_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def ancientpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.ancientpages_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a> (?P<date>.+?)</li>')
            for m in entryR.finditer(html):
                title = m.group('title')
                date = m.group('date')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page, date
            if not repeat:
                break

    def lonelypages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.lonelypages_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def unwatchedpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.unwatchedpages_address(n=number)
            get_throttle()
            html = self.getUrl(path, sysop = True)
            print html
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a>.+?</li>')
            for m in entryR.finditer(html):
                title = m.group('title')
                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def uncategorizedcategories(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.uncategorizedcategories_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def uncategorizedpages(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.uncategorizedpages_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def unusedcategories(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.unusedcategories_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def unusedfiles(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.unusedfiles_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li>\(<a href=".+?" title="(?P<title>.+?)">.+?</a>\) ')
            for m in entryR.finditer(html):
                title = m.group('title')

                if title not in seen:
                    seen.add(title)
                    page = ImagePage(self, title)
                    yield page
            if not repeat:
                break

    def withoutinterwiki(self, number = 10, repeat = False):
        throttle = True
        seen = set()
        while True:
            path = self.withoutinterwiki_address(n=number)
            get_throttle()
            html = self.getUrl(path)
            entryR = re.compile('<li><a href=".+?" title="(?P<title>.+?)">.+?</a></li>')
            for m in entryR.finditer(html):
                title = m.group('title')
                if title not in seen:
                    seen.add(title)
                    page = Page(self, title)
                    yield page
            if not repeat:
                break

    def allpages(self, start = '!', namespace = 0, includeredirects = True,
                 throttle = True):
        """Generator which yields all articles in the home language in
           alphanumerical order, starting at a given page. By default,
           it starts at '!', so it should yield all pages.

           If includeredirects is False, redirects will not be found.
           If includeredirects equals the string 'only', only redirects
           will be found. Note that this has not been tested on older
           versions of the MediaWiki code.

           The objects returned by this generator are all Pages.

           It is advised not to use this directly, but to use the
           AllpagesPageGenerator from pagegenerators.py instead.
        """
        while True:
            # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
            start = start.encode(self.encoding())
            start = urllib.quote(start)
            # load a list which contains a series of article names
            # (always 480)
            path = self.allpages_address(start, namespace)
            output(u'Retrieving Allpages special page for %s from %s, namespace %i'
                   % (repr(self), start, namespace))
            returned_html = self.getUrl(path)
            # Try to find begin and end markers
            try:
                # In 1.4, another table was added above the navigational
                # links
                if self.versionnumber() >= 4:
                    begin_s = ' <table'
                    end_s = '</table'
                else:
                    begin_s = '<table'
                    end_s = '</table'
                ibegin = returned_html.index(begin_s)
                iend = returned_html.index(end_s, ibegin + 3)
            except ValueError:
                raise ServerError(
                    'Couldn\'t extract allpages special page. Make sure you\'re using the MonoBook skin.')
            # remove the irrelevant sections
            returned_html = returned_html[ibegin:iend]
            if self.versionnumber() == 2:
                R = re.compile('/wiki/(.*?)\" *class=[\'\"]printable')
            elif self.versionnumber() < 5:
                # Apparently the special code for redirects was added
                # in 1.5
                R = re.compile('title ?=\"(.*?)\"')
            elif not includeredirects:
                R = re.compile('\<td\>\<a href=\"\S*\" +title ?="(.*?)"')
            elif includeredirects == 'only':
                R = re.compile('\ \<[^\<\>]*allpagesredirect\"\>\<a href=\"\S*\" +title ?="(.*?)"')
            else:
                R = re.compile('title ?=\"(.*?)\"')
            # Count the number of useful links on this page
            n = 0
            for hit in R.findall(returned_html):
                # count how many articles we found on the current page
                n = n + 1
                if self.versionnumber() == 2:
                    yield Page(self, url2link(hit, site = self,
                                              insite = self))
                else:
                    yield Page(self, hit)
                # save the last hit, so that we know where to continue
                # when we finished all articles on the current page.
                # Append a '!' so that we don't yield a page twice.
                start = Page(self, hit).titleWithoutNamespace() + '!'
            # A small shortcut: if there are less than 100 pages listed on
            # this page, there is certainly no next. Probably 480 would do
            # as well, but better be safe than sorry.
            if n < 100:
                if (not includeredirects) or includeredirects == 'only':
                    # Maybe there were only so few because the rest is or
                    # is not a redirect
                    R = re.compile('title ?=\"(.*?)\"')
                    if len(R.findall(returned_html)) < 100:
                        break
                else:
                    break

    def linksearch(self, siteurl):
        # gives a list of page items, being the pages found on a
        # linksearch for site siteurl.
        if siteurl.startswith('*.'):
            siteurl = siteurl[2:]
        for url in [siteurl, "*." + siteurl]:
            path = self.family.linksearch_address(self.lang, url)
            get_throttle()
            html = self.getUrl(path)
            loc = html.find('<div class="mw-spcontent">')
            if loc > -1:
                html = html[loc:]
            loc = html.find('<div class="printfooter">')
            if loc > -1:
                html = html[:loc]
            R = re.compile('title ?=\"(.*?)\"')
            for title in R.findall(html):
                if not siteurl in title:
                    # the links themselves have similar form
                    yield Page(self, title)

def __repr__(self): return self.family.name+":"+self.lang

    def linkto(self, title, othersite = None):
        if othersite and othersite.lang != self.lang:
            return '[[%s:%s]]' % (self.lang, title)
        else:
            return '[[%s]]' % title

    def isInterwikiLink(self, s):
        """
        Try to check whether s is in the form "foo:bar" or ":foo:bar"
        where foo is a known language code or family. In such a case
        we are dealing with an interwiki link.
        Called recursively if the first part of the link refers to this
        site's own family and/or language.
        """
        s = s.lstrip(":")
        if not ':' in s:
            return False
        first, rest = s.split(':', 1)
        # interwiki codes are case-insensitive
        first = first.lower().strip()
        # commons: forwards interlanguage links to wikipedia:, etc.
        if self.family.interwiki_forward:
            interlangTargetFamily = Family(self.family.interwiki_forward)
        else:
            interlangTargetFamily = self.family
        if self.getNamespaceIndex(first):
            return False
        if first in interlangTargetFamily.langs:
            if first == self.lang:
                return self.isInterwikiLink(rest)
            else:
                return True
        if first in self.family.known_families:
            if first == self.family.name:
                return self.isInterwikiLink(rest)
            else:
                return True
        return False

def encoding(self): return self.family.code2encoding(self.lang)

def encodings(self): return self.family.code2encodings(self.lang)

    def redirect(self, default = False):
        """
        Gives the localized redirect tag for the site. Falls back
        to 'REDIRECT' if the site has no special redirect tag.
        """
        if default:
            return self.family.redirect.get(self.lang, "REDIRECT")
        else:
            return self.family.redirect.get(self.lang, None)

    def redirectRegex(self):
        """
        Regular expression recognizing redirect pages, with a
        group on the target title.
        """
        try:
            redirKeywords = [u'redirect'] + self.family.redirect[self.lang]
            redirKeywordsR = r'(?:' + '|'.join(redirKeywords) + ')'
        except KeyError:
            # no localized keyword for redirects
            redirKeywordsR = r'redirect'
        # A redirect starts with hash (#), followed by a keyword, then
        # arbitrary stuff, then a wikilink. The link target ends before
        # either a | or a ].
        return re.compile(r'#' + redirKeywordsR + '.*?\[\[(.*?)(?:\]|\|)',
                          re.IGNORECASE | re.UNICODE | re.DOTALL)
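    # Hedged doctest-style sketch (assumes a hypothetical Site instance
    # `site` for a language without a localized redirect keyword):
    # >>> m = site.redirectRegex().search(u'#REDIRECT [[Main Page]]')
    # >>> m.group(1)
    # u'Main Page'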

    # The following methods are for convenience, so that you can access
    # methods of the Family class easier.
    def category_namespace(self):
        return self.family.category_namespace(self.lang)

def category_namespaces(self): return self.family.category_namespaces(self.lang)

def image_namespace(self, fallback = '_default'): return self.family.image_namespace(self.lang, fallback)

def template_namespace(self, fallback = '_default'): return self.family.template_namespace(self.lang, fallback)

def export_address(self): return self.family.export_address(self.lang)

    def query_address(self):
        return self.family.query_address(self.lang)

    def api_address(self):
        return self.family.api_address(self.lang)

def hostname(self): return self.family.hostname(self.lang)

def path(self): return self.family.path(self.lang)

def dbName(self): return self.family.dbName(self.lang)

def move_address(self): return self.family.move_address(self.lang)

def delete_address(self, s): return self.family.delete_address(self.lang, s)

def undelete_view_address(self, s, ts=''): return self.family.undelete_view_address(self.lang, s, ts)

def undelete_address(self): return self.family.undelete_address(self.lang)

def protect_address(self, s): return self.family.protect_address(self.lang, s)

def unprotect_address(self, s): return self.family.unprotect_address(self.lang, s)

def put_address(self, s): return self.family.put_address(self.lang, s)

def get_address(self, s): return self.family.get_address(self.lang, s)

def nice_get_address(self, s): return self.family.nice_get_address(self.lang, s)

def edit_address(self, s): return self.family.edit_address(self.lang, s)

def purge_address(self, s): return self.family.purge_address(self.lang, s)

def block_address(self): return self.family.block_address(self.lang)

def unblock_address(self): return self.family.unblock_address(self.lang)

def blocksearch_address(self, s): return self.family.blocksearch_address(self.lang, s)

def linksearch_address(self, s, limit=500, offset=0): return self.family.linksearch_address(self.lang, s, limit=limit, offset=offset)

    def checkCharset(self, charset):
        if not hasattr(self, 'charset'):
            self.charset = charset
        assert self.charset.lower() == charset.lower(), \
               "charset for %s changed from %s to %s" \
               % (repr(self), self.charset, charset)
        if self.encoding().lower() != charset.lower():
            raise ValueError(
                "code2encodings has wrong charset for %s. It should be %s, but is %s"
                % (repr(self), charset, self.encoding()))

def allpages_address(self, s, ns = 0): return self.family.allpages_address(self.lang, start = s, namespace = ns)

def newpages_address(self, n=50): return self.family.newpages_address(self.lang, n)

def longpages_address(self, n=500): return self.family.longpages_address(self.lang, n)

def shortpages_address(self, n=500): return self.family.shortpages_address(self.lang, n)

def unusedfiles_address(self, n=500): return self.family.unusedfiles_address(self.lang, n)

def categories_address(self, n=500): return self.family.categories_address(self.lang, n)

def deadendpages_address(self, n=500): return self.family.deadendpages_address(self.lang, n)

def ancientpages_address(self, n=500): return self.family.ancientpages_address(self.lang, n)

def lonelypages_address(self, n=500): return self.family.lonelypages_address(self.lang, n)

def unwatchedpages_address(self, n=500): return self.family.unwatchedpages_address(self.lang, n)

def uncategorizedcategories_address(self, n=500): return self.family.uncategorizedcategories_address(self.lang, n)

def uncategorizedpages_address(self, n=500): return self.family.uncategorizedpages_address(self.lang, n)

def unusedcategories_address(self, n=500): return self.family.unusedcategories_address(self.lang, n)

def withoutinterwiki_address(self, n=500): return self.family.withoutinterwiki_address(self.lang, n)

def references_address(self, s): return self.family.references_address(self.lang, s)

def allmessages_address(self): return self.family.allmessages_address(self.lang)

def upload_address(self): return self.family.upload_address(self.lang)

def maintenance_address(self, sub, default_limit = True): return self.family.maintenance_address(self.lang, sub, default_limit)

def double_redirects_address(self, default_limit = True): return self.family.double_redirects_address(self.lang, default_limit)

def broken_redirects_address(self, default_limit = True): return self.family.broken_redirects_address(self.lang, default_limit)

def __hash__(self): return hash(repr(self))

def version(self): return self.family.version(self.lang)

def versionnumber(self): return self.family.versionnumber(self.lang)

    def live_version(self):
        """Return the 'real' version number found on Special:Version
           as a tuple (int, int, str) of the major and minor version
           numbers and any other text contained in the version.
        """
        global htmldata
        if not hasattr(self, "_mw_version"):
            versionpage = self.getUrl(self.get_address("Special:Version"))
            htmldata = BeautifulSoup(versionpage, convertEntities="html")
            versionstring = htmldata.findAll(text="MediaWiki"
                                             )[1].parent.nextSibling
            m = re.match(r"^: ([0-9]+)\.([0-9]+)(.*)$", str(versionstring))
            if m:
                self._mw_version = (int(m.group(1)), int(m.group(2)),
                                    m.group(3))
            else:
                self._mw_version = self.family.version(self.lang).split(".")
        return self._mw_version

    def __cmp__(self, other):
        """Pseudo method to be able to use equality and inequality tests
           on Site objects"""
        if not isinstance(other, Site):
            return 1
        if self.family == other.family:
            return cmp(self.lang, other.lang)
        return cmp(self.family.name, other.family.name)

def category_on_one_line(self): return self.lang in self.family.category_on_one_line

def interwiki_putfirst(self): return self.family.interwiki_putfirst.get(self.lang,None)

    def interwiki_putfirst_doubled(self, list_of_links):
        if self.family.interwiki_putfirst_doubled.has_key(self.lang):
            if len(list_of_links) >= \
               self.family.interwiki_putfirst_doubled[self.lang][0]:
                list_of_links2 = []
                for lang in list_of_links:
                    list_of_links2.append(lang.language())
                list = []
                for lang in self.family.interwiki_putfirst_doubled[self.lang][1]:
                    try:
                        list.append(list_of_links[list_of_links2.index(lang)])
                    except ValueError:
                        pass
                return list
            else:
                return False
        else:
            return False

def login_address(self): return self.family.login_address(self.lang)

def watchlist_address(self): return self.family.watchlist_address(self.lang)

def contribs_address(self, target, limit=500, offset=''): return self.family.contribs_address(self.lang,target,limit,offset)

def getSite(self, code): return getSite(code = code, fam = self.family, user=self.user)

def namespace(self, num, all = False): return self.family.namespace(self.lang, num, all = all)

def normalizeNamespace(self, value): return self.family.normalizeNamespace(self.lang, value)

    def namespaces(self):
        if _namespaceCache.has_key(self):
            return _namespaceCache[self]
        else:
            nslist = []
            for n in self.family.namespaces:
                try:
                    ns = self.family.namespace(self.lang, n)
                except KeyError:
                    # No default namespace defined
                    continue
                if ns is not None:
                    nslist.append(self.family.namespace(self.lang, n))
            _namespaceCache[self] = nslist
            return nslist

def getNamespaceIndex(self, namespace): return self.family.getNamespaceIndex(self.lang, namespace)

def linktrail(self): return self.family.linktrail(self.lang)

def language(self): return self.lang

def fam(self): return self.family

def sitename(self): return self.family.name+':'+self.lang

    def languages(self):
        return self.family.langs.keys()

    def validLanguageLinks(self):
        return self._validlanguages

    def disambcategory(self):
        import catlib
        try:
            return catlib.Category(self,
                                   self.namespace(14) + ':' +
                                   self.family.disambcatname[self.lang])
        except KeyError:
            raise NoPage

    def getToken(self, getalways = True, getagain = False, sysop = False):
        if getagain or (getalways and ((sysop and not self._sysoptoken) or
                                       (not sysop and not self._token))):
            output(u"Getting page to get a token.")
            try:
                self.sandboxpage.get(force = True, get_redirect = True,
                                     sysop = sysop)
                #Page(self, "Non-existing page").get(force = True,
                #                                    sysop = sysop)
            except UserBlocked:
                #raise
                pass
            except Error:
                pass
        if sysop:
            if not self._sysoptoken:
                return False
            else:
                return self._sysoptoken
        else:
            if not self._token:
                return False
            else:
                return self._token

    def putToken(self, value, sysop = False):
        if sysop:
            self._sysoptoken = value
        else:
            self._token = value
        return

# Caches to provide faster access
_sites = {}
_namespaceCache = {}

def getSite(code = None, fam = None, user=None):
    if code == None:
        code = default_code
    if fam == None:
        fam = default_family
    key = '%s:%s' % (fam, code)
    if not _sites.has_key(key):
        _sites[key] = Site(code=code, fam=fam, user=user)
    return _sites[key]

def setSite(site):
    global default_code, default_family
    default_code = site.language()
    default_family = site.family

def calledModuleName():
    """
    Gets the name of the module calling this function. This is
    required because the -help option loads the module's docstring
    and because the module name will be used for the filename of the
    log.
    """
    # get commandline arguments
    args = sys.argv
    try:
        # clip off the '.py' filename extension
        return args[0][:args[0].rindex('.')]
    except ValueError:
        return args[0]

def handleArgs():
    '''
    Takes the commandline arguments, converts them to Unicode, and
    processes all global parameters such as -lang or -log. Returns a list
    of all arguments that are not global. This makes sure that global
    arguments are applied first, regardless of the order in which the
    arguments were given.
    '''
    global default_code, default_family, verbose
    # get commandline arguments
    args = sys.argv
    # get the name of the module calling this function. This is required
    # because the -help option loads the module's docstring and because
    # the module name will be used for the filename of the log.
    # TODO: check if the following line is platform-independent
    moduleName = calledModuleName()
    nonGlobalArgs = []
    for arg in args[1:]:
        if sys.platform == 'win32':
            # stupid Windows gives parameters encoded as windows-1252, but
            # input encoded as cp850
            arg = unicode(arg, 'windows-1252')
        else:
            # Linux uses the same encoding for both
            arg = unicode(arg, config.console_encoding)
        if arg == '-help':
            showHelp(moduleName)
            sys.exit(0)
        elif arg.startswith('-family:'):
            default_family = arg[8:]
        elif arg.startswith('-lang:'):
            default_code = arg[6:]
        elif arg.startswith('-putthrottle:'):
            put_throttle.setDelay(int(arg[13:]), absolute = True)
        elif arg.startswith('-pt:'):
            put_throttle.setDelay(int(arg[4:]), absolute = True)
        elif arg == '-log':
            setLogfileStatus(True)
        elif arg.startswith('-log:'):
            setLogfileStatus(True, arg[5:])
        elif arg == '-nolog':
            setLogfileStatus(False)
        elif arg == '-verbose' or arg == "-v":
            import version
            output(u'Pywikipediabot %s' % version.getversion())
            output(u'Python %s' % sys.version)
            verbose += 1
        else:
            # the argument is not global. Let the specific bot script
            # care about it.
            nonGlobalArgs.append(arg)
    return nonGlobalArgs


# Interpret configuration

import wikipediatools as _wt
# search for user interface module in the 'userinterfaces' subdirectory
sys.path.append(_wt.absoluteFilename('userinterfaces'))
exec "import %s_interface as uiModule" % config.userinterface
ui = uiModule.UI()
verbose = 0

default_family = config.family
default_code = config.mylang
logfile = None
# Check
try:
    getSite()
except KeyError:
    print(u"""Please create a file user-config.py, and put in there:\n
One line saying \"mylang='language'\"
One line saying \"usernames['wikipedia']['language']='yy'\"\n
...filling in your username and the language code of the wiki you want to work on.\n
For other possible configuration variables check config.py.
""")
    sys.exit(1)


# Languages to use for comment text after the actual language but before
# en:. For example, if for language 'xx', you want the preference of
# languages to be:
# xx:, then fr:, then ru:, then en:
# you let altlang return ['fr','ru'].
# This code is used by translate below.

def altlang(code):
    if code == 'aa': return ['am']
    if code in ['fa', 'so']: return ['ar']
    if code == 'ku': return ['ar', 'tr']
    if code == 'sk': return ['cs']
    if code in ['bar', 'hsb', 'ksh']: return ['de']
    if code in ['als', 'lb']: return ['de', 'fr']
    if code == 'io': return ['eo']
    if code in ['an', 'ast', 'ay', 'ca', 'gn', 'nah', 'qu']: return ['es']
    if code == 'cbk-zam': return ['es', 'tl']
    if code == 'eu': return ['es', 'fr']
    if code in ['glk', 'mzn']: return ['fa', 'ar']
    if code == 'gl': return ['es', 'pt']
    if code == 'lad': return ['es', 'he']
    if code in ['br', 'ht', 'kab', 'ln', 'lo', 'nrm', 'wa']: return ['fr']
    if code in ['ie', 'oc']: return ['ie', 'oc', 'fr']
    if code in ['co', 'frp']: return ['fr', 'it']
    if code == 'yi': return ['he', 'de']
    if code == 'sa': return ['hi']
    if code in ['eml', 'lij', 'lmo', 'nap', 'pms', 'roa-tara', 'sc',
                'scn', 'vec']: return ['it']
    if code == 'rm': return ['it', 'de', 'fr']
    if code in ['bat-smg', 'ltg']: return ['lt']
    if code == 'ia': return ['la', 'es', 'fr', 'it']
    if code == 'nds': return ['nds-nl', 'de']
    if code == 'nds-nl': return ['nds', 'nl']
    if code in ['fy', 'pap', 'vls', 'zea']: return ['nl']
    if code == 'li': return ['nl', 'de']
    if code == 'csb': return ['pl']
    if code in ['fab', 'tet']: return ['pt']
    if code in ['mo', 'roa-rup']: return ['ro']
    if code in ['av', 'bxr', 'cv', 'hy', 'lbe', 'ru-sib', 'tg', 'tt',
                'udm', 'uk', 'xal']: return ['ru']
    if code in ['be', 'be-x-old']: return ['be', 'be-x-old', 'ru']
    if code in ['kk', 'ky', 'tk']: return ['tr', 'ru']
    if code == 'zh-classic':
        # the database uses 'zh-classic' instead of 'zh-classical' as the
        # field is varchar(10)
        return ['zh-classical', 'zh', 'zh-cn', 'zh-tw']
    if code in ['diq', 'ug', 'uz']: return ['tr']
    if code in ['ja', 'minnan', 'zh', 'zh-cn']:
        return ['zh', 'zh-tw', 'zh-classical', 'zh-cn']
    if code in ['bo', 'cdo', 'hak', 'wuu', 'za', 'zh-cdo', 'zh-classical',
                'zh-tw', 'zh-yue']:
        return ['zh', 'zh-cn', 'zh-classical', 'zh-tw']
    if code == 'da': return ['nb', 'no']
    if code in ['is', 'no', 'nb', 'nn']: return ['no', 'nb', 'nn', 'da', 'sv']
    if code == 'sv': return ['da', 'no', 'nb']
    if code == 'se': return ['no', 'nb', 'sv', 'nn', 'fi', 'da']
    if code in ['bug', 'id', 'jv', 'map-bms', 'ms', 'su']:
        return ['id', 'ms', 'jv']
    if code in ['bs', 'hr', 'sh']: return ['sh', 'hr', 'bs', 'sr']
    if code in ['mk', 'sr']: return ['sh', 'sr', 'hr', 'bs']
    if code in ['ceb', 'pag', 'tl', 'war']: return ['tl', 'es']
    if code == 'bi': return ['tpi']
    if code == 'tpi': return ['bi']
    if code == 'new': return ['ne']
    if code == 'nov': return ['io', 'eo']
    return []

def translate(code, dict):
    """
    Given a language code and a dictionary, returns the dictionary's value
    for key 'code' if this key exists; otherwise tries to return a value for
    an alternative language that is most applicable to use on the Wikipedia
    in language 'code'.

    The language itself is always checked first, then languages that have
    been defined to be alternatives, and finally English. If none of these
    options gives a result, we just take the first language in the list.
    """
    # If a site is given instead of a code, use its language
    if hasattr(code, 'lang'):
        code = code.lang

    if dict.has_key(code):
        return dict[code]
    for alt in altlang(code):
        if dict.has_key(alt):
            return dict[alt]
    if dict.has_key('en'):
        return dict['en']
    return dict.values()[0]
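To illustrate the lookup order, here is a sketch with a made-up message dictionary; 'sk' falls back to 'cs' because altlang('sk') returns ['cs'], and an unknown code falls back to 'en':

# Sketch: msg is a made-up edit-summary dictionary.
msg = {
    'en': u'Robot: interwiki update',
    'cs': u'Robot: aktualizace interwiki',
}
translate('en', msg)  # -> u'Robot: interwiki update' (direct hit)
translate('sk', msg)  # -> u'Robot: aktualizace interwiki' (via altlang)
translate('xx', msg)  # -> u'Robot: interwiki update' (fallback to 'en')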

def showDiff(oldtext, newtext):
    """
    Prints a string showing the differences between oldtext and newtext.
    The differences are highlighted (only on Unix systems) to show which
    changes were made.
    """
    # For information on difflib, see http://pydoc.org/2.3/difflib.html
    color = {
        '+': 'lightgreen',
        '-': 'lightred',
    }
    diff = u''
    colors = []
    # This will store the last line beginning with + or -.
    lastline = None
    # For testing purposes only: show original, uncolored diff
    #     for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
    #         print line
    for line in difflib.ndiff(oldtext.splitlines(), newtext.splitlines()):
        if line.startswith('?'):
            # initialize color vector with None, which means default color
            lastcolors = [None for c in lastline]
            # colorize the + or - sign
            lastcolors[0] = color[lastline[0]]
            # colorize changed parts in red or green
            for i in range(min(len(line), len(lastline))):
                if line[i] != ' ':
                    lastcolors[i] = color[lastline[0]]
            diff += lastline + '\n'
            # append one None (default color) for the newline character
            colors += lastcolors + [None]
        elif lastline:
            diff += lastline + '\n'
            # colorize the + or - sign only
            lastcolors = [None for c in lastline]
            lastcolors[0] = color[lastline[0]]
            colors += lastcolors + [None]
        lastline = None
        if line[0] in ('+', '-'):
            lastline = line
    # There might be one + or - line left that wasn't followed by a ? line.
    if lastline:
        diff += lastline + '\n'
        # colorize the + or - sign only
        lastcolors = [None for c in lastline]
        lastcolors[0] = color[lastline[0]]
        colors += lastcolors + [None]

    result = u''
    lastcolor = None
    for i in range(len(diff)):
        if colors[i] != lastcolor:
            if lastcolor is None:
                result += '\03{%s}' % colors[i]
            else:
                result += '\03{default}'
            lastcolor = colors[i]
        result += diff[i]
    output(result)
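A short sketch of showDiff() on two arbitrary sample strings:

# Sketch: the sample texts are made up.
old = u'The quick brown fox\njumps over the lazy dog'
new = u'The quick red fox\njumps over the lazy dog'
showDiff(old, new)
# Prints the removed line with a '-' prefix (lightred) and the added line
# with a '+' prefix (lightgreen); difflib's '?' hint lines are not printed,
# they only determine which characters get colorized.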

def makepath(path):
    """
    Creates missing directories for the given path and returns a normalized
    absolute version of the path.

    - if the given path already exists in the filesystem, the filesystem is
      not modified.
    - otherwise makepath creates directories along the given path using the
      dirname() of the path. You may append a '/' to the path if you want it
      to be a directory path.

    from holger@trillke.net 2002/03/18
    """
    from os import makedirs
    from os.path import normpath, dirname, exists, abspath

    dpath = normpath(dirname(path))
    if not exists(dpath):
        makedirs(dpath)
    return normpath(abspath(path))
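A sketch with a made-up path; note that only directories are created, never the file itself:

# Sketch: ensure the (hypothetical) logs/2008 directory exists.
logpath = makepath('logs/2008/mybot.log')  # creates logs/2008 if missing
absdir = makepath('logs/2008/')            # trailing '/' marks a directory path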

def setLogfileStatus(enabled, logname = None):
    global logfile
    if enabled:
        if not logname:
            logname = '%s.log' % calledModuleName()
        import wikipediatools as _wt
        logfn = _wt.absoluteFilename('logs', logname)
        try:
            logfile = codecs.open(logfn, 'a', 'utf-8')
        except IOError:
            logfile = codecs.open(logfn, 'w', 'utf-8')
    else:
        # disable the log file
        logfile = None

if '*' in config.log or calledModuleName() in config.log:
    setLogfileStatus(True)

colorTagR = re.compile('\03{.*?}', re.UNICODE)

def log(text):
    """
    Writes the given text to the logfile.
    """
    if logfile:
        # remove all color markup, using the colorTagR regex pre-compiled above
        plaintext = colorTagR.sub('', text)
        # save the text in a logfile (will be written in utf-8)
        logfile.write(plaintext)
        logfile.flush()

output_lock = threading.Lock()
input_lock = threading.Lock()
output_cache = []

def output(text, decoder = None, newline = True, toStdout = False):
    """
    Works like print, but uses the encoding used by the user's console
    (console_encoding in the configuration file) instead of ASCII.
    If decoder is None, text should be a unicode string. Otherwise it should
    be encoded in the given encoding.

    If newline is True, a linebreak will be added after printing the text.

    If toStdout is True, the text will be sent to standard output, so that
    it can be piped to another process. All other text will be sent to
    stderr. See: http://en.wikipedia.org/wiki/Pipeline_%28Unix%29

    text can contain special sequences to create colored output. These
    consist of the escape character \03 and the color name in curly braces,
    e.g. \03{lightpurple}. \03{default} resets the color.
    """
    output_lock.acquire()
    try:
        if decoder:
            text = unicode(text, decoder)
        elif type(text) != type(u''):
            if verbose:
                print "DBG> BUG: Non-unicode passed to wikipedia.output without decoder!"
                traceback.print_stack()
                print "DBG> Attempting to recover, but please report this problem"
            try:
                text = unicode(text, 'utf-8')
            except UnicodeDecodeError:
                text = unicode(text, 'iso8859-1')
        if newline:
            text += u'\n'
        log(text)
        if input_lock.locked():
            # The user is being asked for input right now; buffer the text
            # and print it once the input is complete.
            cache_output(text, toStdout = toStdout)
        else:
            ui.output(text, toStdout = toStdout)
    finally:
        output_lock.release()
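A sketch of the color markup described in the docstring; lightpurple is one of the color names understood by the user interface modules:

# Sketch: colored console output.
output(u'\03{lightpurple}Warning:\03{default} page was not saved.')
# ui.output() renders the colors on the console; log() strips the
# \03{...} tags (via colorTagR) before writing plain text to the logfile.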

def cache_output(*args, **kwargs):
    output_cache.append((args, kwargs))

def flush_output_cache():
    while output_cache:
        (args, kwargs) = output_cache.pop(0)
        ui.output(*args, **kwargs)

def input(question, password = False):
    """
    Asks the user a question, then returns the user's answer.

    Parameters:
    * question - a unicode string that will be shown to the user. Don't add
                 a space after the question mark/colon; this method will do
                 this for you.
    * password - if True, hides the user's input (for password entry).

    Returns a unicode string.
    """
    input_lock.acquire()
    try:
        data = ui.input(question, password)
    finally:
        flush_output_cache()
        input_lock.release()
    return data

def inputChoice(question, answers, hotkeys, default = None):
    """
    Asks the user a question and offers several options, then returns the
    user's choice. The user's input will be case-insensitive, so the hotkeys
    should be distinctive case-insensitively.

    Parameters:
    * question - a unicode string that will be shown to the user. Don't add
                 a space after the question mark; this method will do this
                 for you.
    * answers  - a list of strings that represent the options.
    * hotkeys  - a list of one-letter strings, one for each answer.
    * default  - an element of hotkeys, or None. The default choice that
                 will be returned when the user just presses Enter.

    Returns a one-letter string in lowercase.
    """
    input_lock.acquire()
    try:
        data = ui.inputChoice(question, answers, hotkeys, default).lower()
    finally:
        flush_output_cache()
        input_lock.release()

    return data
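A sketch of a typical confirmation prompt (the question and options here are made up):

answer = inputChoice(u'Do you want to accept this change?',
                     ['Yes', 'No', 'All'], ['y', 'n', 'a'], 'n')
if answer == 'y':
    pass  # save the page
# The result is always lowercase; pressing Enter returns the default 'n'.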

def showHelp(moduleName = None):
    # the parameter moduleName is deprecated and should be left out
    moduleName = moduleName or sys.argv[0][:sys.argv[0].rindex('.')]
    try:
        moduleName = moduleName[moduleName.rindex("\\") + 1:]
    except ValueError:
        # There was no \ in the module name, so presumably no problem
        pass
    globalHelp = u'''

Global arguments available for all bots:

-lang:xx         Set the language of the wiki you want to work on, overriding
                 the configuration in user-config.py. xx should be the
                 language code.

-family:xyz      Set the family of the wiki you want to work on, e.g.
                 wikipedia, wiktionary, wikitravel, ...
                 This will override the configuration in user-config.py.

-log             Enable the logfile. Logs will be stored in the logs subdirectory.

-log:xyz         Enable the logfile, using xyz as the filename.

-nolog           Disable the logfile (if it is enabled by default).

-putthrottle:nn  Set the minimum time (in seconds) the bot will wait between
-pt:nn           saving pages.

-verbose         Have the bot provide additional output that may be useful in
-v               debugging.
'''
    output(globalHelp)
    try:
        exec('import %s as module' % moduleName)
        helpText = module.__doc__.decode('utf-8')
        if hasattr(module, 'docuReplacements'):
            for key, value in module.docuReplacements.iteritems():
                helpText = helpText.replace(key, value.strip('\n\r'))
        output(helpText)
    except:
        output(u'Sorry, no help available for %s' % moduleName)
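A sketch of the docuReplacements mechanism used above: a bot module can keep a placeholder in its docstring and have showHelp() expand it. The '&params;' key is an assumption (a common pywikipedia convention); any key works:

# In a hypothetical bot script:
"""
This bot does something useful. These parameters are supported:

&params;
"""
docuReplacements = {
    '&params;': u"-always  Don't ask for confirmation when saving pages.",
}
# 'python mybot.py -help' imports the module and replaces '&params;' in
# the docstring with the text above before printing it.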

page_put_queue = Queue.Queue()

def async_put():
    """
    Daemon that takes pages from the queue and tries to save them on the
    wiki.
    """
    while True:
        page, newtext, comment, watchArticle, minorEdit = page_put_queue.get()
        if page is None:
            # needed for compatibility with Python 2.3 and 2.4
            # in 2.5, we could use the Queue's task_done() and join() methods
            return
        try:
            page.put(newtext, comment, watchArticle, minorEdit)
        except SpamfilterError, ex:
            output(u"Saving page %s prevented by spam filter: %s"
                   % (page.title(), ex.url))
        except PageNotSaved, ex:
            output(u"Saving page %s failed: %s"
                   % (page.title(), ex.message))
        except LockedPage, ex:
            output(u"Page %s is locked; not saved." % page.title())
        except:
            tb = traceback.format_exception(*sys.exc_info())
            output(u"Saving page %s failed:\n%s"
                   % (page.title(), "".join(tb)))
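A sketch of queueing an asynchronous save; the tuple layout must match what async_put() unpacks, and the page title here is made up:

# Sketch: save a page via the put queue instead of calling page.put() directly.
site = getSite()
page = Page(site, u'Sandbox')
page_put_queue.put((page, u'new text', u'Bot: test edit', False, True))
# order: (page, newtext, comment, watchArticle, minorEdit)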

_putthread = threading.Thread(target=async_put)
# identification for debugging purposes
_putthread.setName('Put-Thread')
_putthread.setDaemon(True)
_putthread.start()

def stopme():
    """
    This should be run when a bot does not interact with the Wiki, or when
    it has stopped doing so. After a bot has run stopme() it will not slow
    down other bots any more.
    """
    get_throttle.drop()

def _flush():
    """
    Waits for the page-putter to flush its queue; called automatically upon
    exiting from Python.
    """
    import datetime
    if page_put_queue.qsize() > 0:
        remaining = datetime.timedelta(seconds=(page_put_queue.qsize() + 1) * config.put_throttle)
        output(u'Waiting for %i pages to be put. Estimated time remaining: %s'
               % (page_put_queue.qsize() + 1, remaining))
    page_put_queue.put((None, None, None, None, None))
    while _putthread.isAlive():
        try:
            _putthread.join(1)
        except KeyboardInterrupt:
            answer = inputChoice(u'There are %i pages remaining in the queue. '
                                 u'Estimated time remaining: %s\nReally exit?'
                                 % (page_put_queue.qsize(),
                                    datetime.timedelta(seconds=page_put_queue.qsize() * config.put_throttle)),
                                 ['yes', 'no'], ['y', 'N'], 'N')
            if answer in ['y', 'Y']:
                return

import atexit
atexit.register(_flush)

def debugDump(name, site, error, data):
    import time
    name = unicode(name)
    error = unicode(error)
    site = unicode(repr(site).replace(u':', u'_'))
    filename = '%s_%s__%s.dump' % (name, site, time.asctime())
    filename = filename.replace(' ', '_').replace(':', '-')
    f = file(filename, 'wb')  # trying to write it in binary
    # f = codecs.open(filename, 'w', 'utf-8')
    f.write(u'Error reported: %s\n\n' % error)
    try:
        f.write(data.encode('utf8'))
    except UnicodeDecodeError:
        f.write(data)
    f.close()
    output(u'ERROR: %s caused error %s. Dump %s created.' % (name, error, filename))
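A sketch of calling debugDump() with made-up arguments, e.g. when the server returns something unparsable:

# Sketch: all values are illustrative.
data = '<html>unexpected response</html>'
debugDump('BadResponse', getSite(), 'unexpected HTML', data)
# writes a timestamped file such as BadResponse_wikipedia_en__....dump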

get_throttle = Throttle(config.minthrottle, config.maxthrottle)
put_throttle = Throttle(config.put_throttle, config.put_throttle, False)

class MyURLopener(urllib.FancyURLopener):
    version = "PythonWikipediaBot/1.0"

# Special opener in case we are using a site with authentication
if config.authenticate:
    import urllib2, cookielib
    import wikipediatools as _wt
    COOKIEFILE = _wt.absoluteFilename('login-data', 'cookies.lwp')
    cj = cookielib.LWPCookieJar()
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)
    passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
    for site in config.authenticate.keys():
        passman.add_password(None, site, config.authenticate[site][0], config.authenticate[site][1])
    authhandler = urllib2.HTTPBasicAuthHandler(passman)
    authenticateURLopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), authhandler)
    urllib2.install_opener(authenticateURLopener)
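For reference, a sketch of the matching user-config.py entry (hostname and credentials are made up); each key maps a site to a (username, password) pair used for HTTP basic authentication:

# In user-config.py (hypothetical values):
authenticate = {
    'en.wikipedia.org': ('myHttpUser', 'myHttpSecret'),
}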

if __name__ == '__main__':
    import version, doctest
    print 'Pywikipediabot %s' % version.getversion()
    print 'Python %s' % sys.version
    doctest.testmod()