User:Zetawoof/BookList/Code

Run this first (book list grabber; Python 2)
import sys, os, urllib, json, time

class Book(object):
    __slots__ = ('id', 'title', 'text')
    def __init__(self, id, title, text):
        self.id = long(id)
        self.title = unicode(title)
        self.text = unicode(text)

# Query the MediaWiki API, dropping any parameters set to None.
def apiQuery(**params):
    d = {'format': 'json'}
    d.update(params)
    for k, v in list(d.items()):
        if v is None:
            del d[k]
    return json.load(urllib.urlopen(
        'http://en.wikipedia.org/w/api.php?' + urllib.urlencode(d)))

# Yield every page that transcludes {{Saved book}}, following API continuations.
def generateBooks():
    genFrom = None

    while True:
        q = apiQuery(action='query', generator='embeddedin',
                     geititle='Template:Saved book', prop='revisions',
                     rvprop='content', geilimit=50, geicontinue=genFrom)

        for bk in q['query']['pages'].values():
            yield Book(
                id = bk['pageid'],
                title = bk['title'],
                text = bk['revisions'][0]['*']
            )

        # Follow the API's continuation token until the generator is exhausted.
        if 'query-continue' in q:
            genFrom = q['query-continue']['embeddedin']['geicontinue']
        else:
            return

# Dump each book to books/<pageid>: first line is the title, rest is wikitext.
if not os.path.isdir('books'):
    os.mkdir('books')
for b in generateBooks():
    print b.title
    f = open('books/' + str(b.id), 'w')
    f.write(b.title.encode("UTF-8") + "\n" + b.text.encode("UTF-8"))
    f.close()
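
A quick sanity check after the grabber finishes (a minimal sketch, not part of the original scripts; assumes the books/ directory written above):

# Hypothetical check: count the dumps and show one title.
import os
files = os.listdir('books')
print "%d books downloaded" % len(files)
if files:
    print open(os.path.join('books', files[0])).readline().strip()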

Run this second (report generator; Python 2)
import os, re, time

bookdir = "books"

# Namespace prefixes (lowercased) that don't count as article links.
badprefix = set(("user", "user talk", "wikipedia", "wikipedia talk",
                 "template", "category", "portal", "portal talk",
                 "help", "help talk"))

whitelist = set((
    "Wikipedia:Books/The Missing Manual",
    "User:Sue Gardner/Books/Welcome",
    "User:Sue Gardner/Books/BLP",
    "User:Miya/Books/Helps and Extensions",
    "User:BookSpace/Books/Sandbox1",
    "User:BookSpace/Books/Sandbox2",
    "User:BookSpace/Books/Sandbox3",
    "User:BookSpace/Books/Sandbox4",
    "User:BookSpace/Books/Sandbox5",
    "User:BookSpace/Books/Sandbox6",
    "User:BookSpace/Books/Sandbox7",
    "User:BookSpace/Books/Sandbox8",
    "User:BookSpace/Books/Template",
))

# Buckets for the report, keyed by why each book was flagged.
booksNoLinks = set()
booksBadLinks = set()
booksOneLink = set()
booksGoodUser = set()
booksGoodProject = set()
booksGoodWtf = set()

# Each dump file: first line is the book's page title, rest is its wikitext.
for bf in os.listdir(bookdir):
    f = open(os.path.join(bookdir, bf))
    title = f.readline().strip()

    if title in whitelist:
        continue

    links = set()
    lines = 0
    headings = 0
    sections = 0
    cats = 0
    unknown = 0

    for line in f:
        line = line.strip()
        lines += 1

        if not line:
            continue

        llinks = re.findall(r"\[\[([^]|]*)", line)
        if len(llinks):
            for link in llinks:
                if link.startswith("Category:"):
                    cats += 1
                else:
                    links.add(link)
        elif line.startswith("{{saved"):
            continue
        elif line.startswith("="):
            headings += 1
        elif line.startswith(";"):
            sections += 1
        elif len(line):
            unknown += 1
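    # Illustration (comments only, not executed): the pattern r"\[\[([^]|]*)"
    # captures everything between "[[" and the first "]" or "|", so
    #   re.findall(r"\[\[([^]|]*)", "[[Perl|Perl 5]] and [[Category:Snakes]]")
    # returns ['Perl', 'Category:Snakes'].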

    badlinks = set()
    prefixes = set()
    for l in links:
        pfx = l.split(":")[0].lower()
        prefixes.add(pfx)
        if pfx in badprefix or l == 'Main Page':
            badlinks.add(l)

    goodlinks = links.difference(badlinks)

    #print title, "(%d good/%d bad; %d heads %d sects %d cats %d unk)" % (len(goodlinks), len(badlinks), headings, sections, cats, unknown)

    if len(links) == 0:
        booksNoLinks.add(title)
    elif len(goodlinks) == 0:
        booksBadLinks.add(title)
    elif len(links) == 1:
        booksOneLink.add(title)
    elif title.lower().startswith("user:"):
        booksGoodUser.add(title)
    elif title.lower().startswith("wikipedia:"):
        booksGoodProject.add(title)
    elif title.lower().startswith("book:"):
        booksGoodProject.add(title)
    else:
        booksGoodWtf.add(title)

def printLinks(linkSet, title):
    if not len(linkSet):
        return
    print "\n== %s (%d) ==" % (title, len(linkSet))
    for l in sorted(linkSet):
        print "* %s" % l

print "" print "Last updated %s" % time.ctime

printLinks(booksNoLinks, "Books containing no articles")
printLinks(booksOneLink, "Books containing only one article")
printLinks(booksBadLinks, "Books containing no mainspace articles")
printLinks(booksGoodWtf, "Books in totally unexpected places")
printLinks(booksGoodUser, "Otherwise unclassified books in user space")
printLinks(booksGoodProject, "Otherwise unclassified books in project space")
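
The report is wikitext, ready to paste onto a wiki page. For reference, a run produces sections shaped like this (the book titles below are made-up placeholders, not real output):

Last updated <output of time.ctime()>

== Books containing no articles (2) ==
* User:Example/Books/Empty
* User:SomeUser/Books/Untitled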