User:Sminthopsis84/temp

Python 3 changes required:

The dateutil module must be installed and imported (as before; note that pip has become pip3, so: pip3 install python-dateutil)
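A quick way to confirm the module is importable (the timestamp is just an example):

    import dateutil.parser
    print(dateutil.parser.parse("2010-01-01T00:00:00Z")) # 2010-01-01 00:00:00+00:00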

Syntax changes required

Replace

    import urllib
    import urllib2

with

    import urllib.request
    import urllib.error

(though urllib.error isn't used in Visviva's code)

Change urllib calls, e.g.,

    request=urllib2.Request

becomes

    request=urllib.request.Request
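Putting the two changes together, a minimal Python 3 fetch looks like this (the URL and header values are the ones used in the code below):

    import urllib.request
    request=urllib.request.Request("http://dumps.wikimedia.your.org/enwiki/latest/",
                                   headers={'User-agent' : 'JumpingSpider/0.0'})
    dumpspage=urllib.request.urlopen(request,timeout=240).read() # bytes in Python 3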

Replace all print statements with function calls, e.g.,

    print str(u),"already done"

becomes

    print (str(u),"already done")

(The parenthesized form was legal in Python 2 as well, but there it printed a tuple, so this is not a purely cosmetic change.)

Exception handling requires the keyword "as":

    except Exception, e:
        print (str(e))

becomes

    except Exception as e:
        print (str(e))

Names that shadow built-in functions must be changed: sorted and max (renamed sorted1 and max1 in the code below).
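A quick illustration of why the rename matters (hypothetical session):

    sorted=[(3,'a')]   # shadows the built-in
    # sorted([2,1,3])  # would now raise TypeError: 'list' object is not callable
    del sorted         # restores access to the built-in
    sorted1=[(3,'a')]  # safe: sorted() remains callable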

Run-time errors

Strict typing is now required for byte strings versus Unicode strings:

    urlpaths=re.findall('"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)

becomes

    urlpaths=re.findall(b'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)
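(An aside, not part of the original conversion: since Python 3.6 the unrecognized escape \d in a non-raw literal produces a DeprecationWarning, so a raw byte string is slightly cleaner:)

    urlpaths=re.findall(rb'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)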

    self.urls=[self.dumpsurl+x.replace('"','') for x in urlpaths]

becomes

    self.urls=[bytes(self.dumpsurl,'utf-8')+x.replace(b'"',b'') for x in urlpaths]
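An alternative sketch (not Visviva's approach) is to decode the page once, reusing the request object from the snippet above, so the rest of the code keeps working with ordinary strings:

    dumpspage=urllib.request.urlopen(request,timeout=240).read().decode('utf-8')
    urlpaths=re.findall(r'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)
    self.urls=[self.dumpsurl+x.replace('"','') for x in urlpaths]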

print ("Downloading "+u) becomes print ("Downloading ", u)

urllib.request.urlretrieve requires a string for the url, so the byte-string URLs assembled above must be decoded before the call.
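One way to handle this (assuming the dump URLs are plain UTF-8, which they are):

    urllib.request.urlretrieve(u.decode('utf-8'), filepath)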

-- Visviva's code as converted by me --

import gzip
import os
import re
import time
import urllib.request
import urllib.error

from sys import stdout
from collections import defaultdict

class Downloader:
    def __init__(self):
        self.dumpsurl="http://dumps.wikimedia.your.org/enwiki/latest/"
        self.headers={'User-agent' : 'JumpingSpider/0.0'}
        self.counters=[]
        self.trackers=[]
        self.matchups={}
        self.replaced_users=set()

    def process(self): # get URLs of all pre-combination stub-meta-history files
        request=urllib.request.Request(self.dumpsurl,headers=self.headers)
        dumpspage=urllib.request.urlopen(request,timeout=240).read()
        urlpaths=re.findall(b'"[^"]+-stub-meta-history\d.*?\.xml\.gz"',dumpspage)
        self.urls=[bytes(self.dumpsurl,'utf-8')+x.replace(b'"',b'') for x in urlpaths]

    def go(self): # to download, process, and delete the segmented stub-meta-history files in sequence
        doneurls=[x[0] for x in self.counters]
        for u in self.urls:
            if u in doneurls:
                print (u,"already done")
                continue
            filepath="stubhist_working.xml.gz"
            print ("Downloading ", u)
            done=False
            while not done:
                try:
                    urllib.request.urlretrieve(u.decode('utf-8'), filepath) # urlretrieve needs a str, not bytes
                    done=True
                except Exception as e:
                    print (str(e))
                    time.sleep(10)
            print ("Reading....")
            gfile=gzip.GzipFile(filepath)
            with gfile:
                self.counters.append((u,self.countusers(gfile))) # avoid dict of dicts, too slippery
            print ()
            print ("Deleting ....")
            os.unlink(filepath)

    def run(self, filepaths): # to just use already-downloaded DB files
        for f in filepaths:
            print (f)
            self.countem=self.countusers(f)
        open("wikicount_dump.txt","w").write(self.dump())

    def dump(self):
        output=""
        for c in self.counters:
            path=c[0]
            dixie=c[1]
            for d in dixie.keys():
                newline=str(path)+"\t"+str(d)+"\t"+str(dixie[d])+"\n"
                output+=newline
        return output

    def countusers(self,path):
        import dateutil.parser
        if not isinstance(path,str): # an already-open file object, as passed by go()
            file=path
        elif path.endswith(".gz"):
            file=gzip.GzipFile(path)
        else:
            file=open(path)
        i=0
        users=defaultdict(int)
        tracker=defaultdict(set)
        reading=False
        reading_rev=False
        try:
            for line in file:
                i+=1
                if isinstance(line,bytes):
                    line=line.decode('utf-8','replace') # gzip files yield bytes in Python 3
                line=line.strip()
                if line.startswith("<page"):
                    reading=True
                    revisions=[]
                    reading_rev=False
                    thetitle=""
                    continue
                if reading is not True:
                    continue
                else:
                    if line.startswith("</page"):
                        sortedrevs=list(revisions)
                        sortedrevs.sort()
                        username=sortedrevs[0][1]
                        if username != revisions[0][1]:
                            self.replaced_users.add((thetitle,username,revisions[0][1]))
                        users[username]+=1
                        tracker[username].add(thetitle)
                        self.matchups[thetitle]=username
                        reading=False
                        reading_rev=False
                        stdout.write("\r") # put progress counter here to minimize waste
                        stdout.flush()
                        stdout.write(str(i))
                        continue
                    elif line.startswith("<revision"):
                        reading_rev=True
                        continue
                    elif reading_rev is True:
                        if line.startswith("<timestamp"):
                            timestamp=line.split(">")[1].split("<")[0]
                            continue
                        elif line.startswith("<ip"): # need to avoid counting pages created by an IP for the first registered user to edit
                            username="IP:"+line.split(">")[1].split("<")[0].strip()
                        elif line.startswith("<contributor deleted"):
                            username=""
                        elif line.startswith("<username"):
                            username=line.split(">")[1].split("<")[0].strip()
                        elif line.startswith("</contributor"):
                            thetime=dateutil.parser.parse(timestamp)
                            revisions.append((thetime,username))
                            reading_rev=False
                    elif line.startswith("<title"):
                        thetitle=line.split(">")[1].split("<")[0].strip()
                        continue
                    elif line.startswith("<ns"):
                        if not line.startswith("<ns>0<"): # only count articles (namespace 0)
                            reading=False
                        continue
                    elif line.startswith("<redirect"):
                        reading=False
                        continue
        except Exception as e:
            print (str(e))
        for u in users.keys():
            if len(tracker[u]) != users[u]:
                print ("Discrepancy:",u,str(len(tracker[u])),str(users[u]))
        self.counters.append((path,users))
        self.trackers.append((path,tracker))
        return users

def sortusers(users):
    sorted1=[]
    for u in users.keys():
        sorted1.append((users[u],u))
    sorted1.sort()
    sorted1.reverse()
    return sorted1

def summate(counters):
    output={}
    for o in counters:
        print (o[0],len(output),sum(output.values()))
        for k in o[1].keys():
            if k in output.keys():
                output[k]+=o[1][k]
            else:
                output[k]=o[1][k]
    return output

def summate2(counters):
    output=defaultdict(int)
    for o in counters:
        print (o[0],len(output),sum(output.values()))
        for k in o[1].keys():
            output[k]+=o[1][k]
    return output

def truncate(summation,max1=10000):
    userlist=[]
    for s in summation.keys():
        userlist.append((summation[s],s))
    print (len(userlist))
    userlist.sort()
    userlist.reverse()
    userlist=userlist[:max1]
    return userlist

def get_current_totals():
    output=[]
    pagename="Wikipedia:List_of_Wikipedians_by_article_count/Data"
    url="http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % pagename
    page=urllib.request.urlopen(url,timeout=60).read().decode('utf-8')
    page=page.split("<rev ",1)[1].split(">",1)[1].split("<")[0] # the wikitext inside the <rev> element
    pieces=page.split("|}")[0].split("|-")[2:]
    pieces=[x.strip() for x in pieces]
    for p in pieces:
        data=[x.strip() for x in p.split("|") if x.strip()]
        if not data:
            continue
        rank=int(data[0])
        username=data[1]
        count=int(data[2].replace(",",""))
        output.append(tuple([rank,username,count]))
    return output

def get_mismatches(current,summation):
    mismatched=[] # list of tuples: (discrepancy,username,current,new)
    currentdict=dict([(x[1],x[2]) for x in current])
    for c in currentdict.keys():
        if c in summation.keys():
            if int(summation[c]) != int(currentdict[c]):
                diff=int(summation[c])-int(currentdict[c])
                mismatched.append((diff,c,currentdict[c],summation[c]))
    mismatched.sort()
    mismatched.reverse()
    return mismatched

def getanons():
    pagename="Wikipedia:List of Wikipedians by number of edits/Anonymous".replace(" ","_")
    url="http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvprop=content&format=xml" % pagename
    anonpage=urllib.request.urlopen(url,timeout=60).read().decode('utf-8')
    anonpage=anonpage.split("==\n",1)[1]
    anons=[x.split("]]")[0] for x in anonpage.split("User:")[1:]]
    print (str(len(anons))+" anons")
    return anons

def replaceanons(wikitext,anons=[]):
    if not anons:
        anons=getanons()
    for anon in anons:
        catchme="| %s\n" % anon
        if catchme in wikitext:
            print ("Effacing "+anon)
            wikitext=wikitext.replace(catchme, "| [Placeholder]\n")
    return wikitext

def dumpusers(foo,userlist=[]): # Downloader object
    outdict=defaultdict(set)
    for tracker in foo.trackers:
        path=tracker[0]
        for user in tracker[1].keys():
            outdict[user] |= tracker[1][user]
    outtext=""
    for user in outdict.keys():
        newline=user+"\t"
        newline+=" - ".join(outdict[user])
        newline+="\n"
        outtext+=newline
    return outtext

def makedatapage(userlist): # as returned by truncate
    text="""{| class="wikitable sortable"
! No.
! User
! Article count
"""
    for u in userlist:
        number=str(userlist.index(u)+1)
        count=str(u[0])
        newlines="""
|- style="white-space:nowrap;"
| %s
| %s
| %s
|-""" % (number,u[1],count)
        text += newlines
    text += "\n|}"
    return text

def totalprep(foo): # take completed Downloader and make Data page
    summation=summate2(foo.counters)
    truncation=truncate(summation,5000)
    datapage=makedatapage(truncation)
    datapage=replaceanons(datapage)
    return datapage