User:Lar/ClassificationTableGen/Backlev

Perl code: This code generated User:Lar/Sandbox2 (version 6).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).

Updated as of + +Lar: t/c 05:06, 27 March 2006 (UTC)

use strict; use Data::Dumper; use Getopt::Std;
 * 1) !/usr/bin/perl -w
 * 2) process files and generate a category table
 * 3) Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
 * 4) Adapted from stuff I cribbed from all over.
 * 5) (c)Larry Pieniazek 2006. This library is free software; you can redistribute it
 * 6) and/or modify it under the same terms as Perl itself.
 * 7) additionally, can be redistributed and modified under GFDL or CC-SA as you choose
 * 8) Abstract:
 * 9) 	This perlscript is designed to parse category SQL dumps from wikipedia
 * 10) 	which are found here: http://download.wikimedia.org/enwiki/
 * 11)   For example the 23 March dump is called
 * 12) 	 http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
 * 13) 	The parsing is to generate article classification tables such as those found at
 * 14) 	 http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
 * 15)   In addition to the dump (currently must have been converted to linefeed delimited tuples)
 * 16) 	the other input is a list of categories of interest, one per line.
 * 1) 	 http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
 * 2)   In addition to the dump (currently must have been converted to linefeed delimited tuples)
 * 3) 	the other input is a list of categories of interest, one per line.
 * 1) 	the other input is a list of categories of interest, one per line.


 * 1) things we may want to use at some point
 * 2) use File::Spec::Functions;


 * 1) Subroutine prototypes:                                                    #
 * 1) Subroutine prototypes:                                                    #


 * 1) setup

sub Usage;				# print info message about how to use this sub ProcessOptions; 	# Process Command Line Options.


 * 1) utility

sub ScoreToBlank;		# underscores to blanks sub BlankToScore;		# blanks to underscores sub FlipComma;			# reverse a reversed comma string. "Lennon, John" -> "John Lennon" sub UnEscape;			# remove escapes with a clever rexp


 * 1) general

sub ReadCatFile;		# read the category file into the catArray and is_catHash sub ParseSQL;			# parse the big SQL file and build the article data (hashref $collect) sub WriteTable;			# create the output sub WriteTableHeader;	       # used by above, create header sub WriteTableSecBreak;	       # used by above, create a section break (when the leading char changes)


 * 1) -- option switches and related -

my(%options);      # hash of switches, values


 * 1) - logging NOT IMPLEMENTED YET (ever?) --

my($logging);      # Flag to denote we are writing to log. my($log_dir);      # Log Directory. my($lfh);          # Log File Handle. my($LOG_FILE_NAME); # Name of Log File to be written -l value or default

my($debug);	   # -d Flag to denote REALLY verbose messaging.
 * 1) my($verbose);     # -v Flag to denote verbose messaging.

my($sqlFileName);  # -q  (or 'enwiki-20060303-categorylinks.sql') my($catFileName);  # -c        (or 'categoryList.txt') my($tableFileName); # -o          (or 'tables.txt')


 * 1) -- Data structures -

my $inCats=;         # what cats is the article in? my $nameVersions=;   # what are the versions of the name (lex orders) my $rec={};		# ref to one article's record my $collect={};		# ref to all the articles keyed on the $artKey var
 * 1) what the data will look like
 * 1)    my $rec={
 * 2)       key => "178234",        # numeric key from first tuple value (article key, believed unique)
 * 3)       artLink => "link text"  # text to use for link not same as sort
 * 4)       sortKey => "sort text"  # sort text (what order should article come out)
 * 5)    	inCats  => [@inCats],   # array of categories the article is in
 * 6)    	nameVersions => [@nameVersions] # array of version of the name of the article
 * 7)                                               # this one may not be used for anything
 * 8)    	};					# one article's record
 * 9)    my %collect={
 * 10)    	key => $rec
 * 11)    	};				# all the articles keyed on the $artKey var
 * 1)    	};				# all the articles keyed on the $artKey var


 * 1) -- work vars

my @catList; my @catArray; my %is_catHash;


 * 1) file handles

my $sqlH; my $tableH; my $cfH;

sub Usage {
 * 1) Usage - Print Usage Information and exit.
 * 1) Usage - Print Usage Information and exit.

print <] [-c ] [-o ]

Switch meanings: -h --help   print this help message. -v --version print version message. -d <0|1|2|3>  debug: 0: quiet 1: Verbose Mode 2: REALLY verbose mode 3: Every frigging detail. File switches: -q  (or 'enwiki-20060303-categorylinks_sample.sql' by default) -c (or 'categoryList.txt' by default) -o (or 'tables.txt' by default)

END_USAGE print "Status: 99\n"; exit(99); } # End of Usage.

sub Version { print "\nfilterCategories version 0.04 - 26 March 2006, Larry Pieniazek." ." \n -- released under GFDL and CC-SA -- \n\n"; # really should print something else }


 * 1) this stuff isn't quite right at the moment

sub HELP_MESSAGE{ &Usage; } sub VERSION_MESSAGE{ &Version; }
 * 1) required for getopts to support --help and --version
 * 1) required for getopts to support --help and --version

sub ProcessOptions { &Version if ($options{'v'}); &Usage if ($options{'h'});
 * 1) ProcessOptions - Process Command Line Options.
 * 1) ProcessOptions - Process Command Line Options.

my %debugHash = (        '0'=>"silent",    	'1'=>"normal trace",    	'2'=>"very chatty",    	'3'=>"insanely chatty" ); if (defined $options{'d'}) { $debug=$options{'d'}; if ($debugHash{$debug}) { print"...debug switch was ".$options{'d'}." giving setting: ".$debugHash{$debug}."\n" unless 0 == $options{'d'} ; # if 0, then REALLY quiet } else { $debug=1; print"...debug switch was ".$options{'d'}." defaulting debug to 1 - normal trace\n"; } # recognised option } else { # default, no switch $debug=1; print"...debug switch not found, defaulting debug to 1 - normal trace\n"; }	if (defined $options{'q'}) { $sqlFileName=$options{'q'}; } else { $sqlFileName="enwiki-20060303-categorylinks_sample.sql"; }

if (defined $options{'c'}) { $catFileName=$options{'c'}; } else { $catFileName="categoryList.txt"; }

if (defined $options{'o'}) { $tableFileName=$options{'o'}; } else { $tableFileName="tables.txt"; }

} # End of ProcessOptions.

sub ReadCatFile {
 * 1) ReadCatFile - read in categories to build article tracking tables for
 * 1) ReadCatFile - read in categories to build article tracking tables for

my $rc=0;
 * 1)   $catFileName = $_[0];  # now set processOptions

if ($debug>2) { stat($catFileName); print "Exists\n" if -e _; print "Readable\n" if -r _; print "Writable\n" if -w _; print "Executable\n" if -x _; print "Setuid\n" if -u _; print "Setgid\n" if -g _; print "Sticky\n" if -k _; print "Text\n" if -T _; print "Binary\n" if -B _; }   if (( -e $catFileName ) && ( -r $catFileName )) { if (!open $cfH, "<", $catFileName){ warn "can't open ".$catFileName."\n"; $rc=99; return $rc; } } else { print "error with ".$catFileName." ... does not exist or not readable \n"; $rc= 99; return $rc; }

%is_catHash = ;

if ($debug>0) {print "reading ".$catFileName."\n";} # @catList=<$cfH>; my $catListItem; for { undef $!; unless (defined( $catListItem = <$cfH> )) { die $! if $!; last; # reached EOF }	chomp $catListItem; $catListItem=ScoreToBlank($catListItem); push @catList, $catListItem; # set up searchable hash... $is_catHash{$catListItem} = 1; }   if ($debug>0) { print "\nCategories to process: \n"; for my $fe(@catList) {print( $fe."\n");}; print "\n"; }   if ($debug>1) { print "\n\n... corresponding hash values: \n"; while (my ($key, $value) = each %is_catHash) { print "$key = $value\n"; }   	print "\n"; } # end chatty trace

$rc=0; return $rc; }

sub ScoreToBlank { my $str=$_[0]; if ($debug>3) {print "ScoreToBlank \$str IN: $str\n";} $str=~ s/_/ /g; if ($debug>3) {print "ScoreToBlank \$str OUT: $str\n";} return $str; } # there
 * 1) ScoreToBlank - convert underscores to blanks
 * 1) ScoreToBlank - convert underscores to blanks

sub BlankToScore { my $str=$_[0]; $str=~ s/ /_/g; return $str; } # and back again
 * 1) BlankToScore - convert blanks to underscores
 * 1) BlankToScore - convert blanks to underscores

sub FlipComma { my $str=$_[0]; my ($first,$second)= split(/, /,$str,2); if (length($second)>0) { # there is something there to flip $str=$second." ".$first; }   return $str; } # round and round we go
 * 1) FlipComma - take a phrase with comma (and 1 blank) and flip it,
 * 2)    "Lennon, John" -> "John Lennon"
 * 1)    "Lennon, John" -> "John Lennon"

sub StripLeadTrail { my $str=$_[0]; $str=~ s/^\s+//; $str=~ 	s/\s+$//; return $str; } # and back again sub UnEscape { my($string) = $_[0]; my $eseq = "\\"; return $string unless($eseq); #no-op #Remove escape characters apart from double-escapes $string =~ s/\Q$eseq\E(?!\Q$eseq\E)//gs;
 * 1) StripLeadTrail - strip leading s/^\s+// and trailing s/\s+$// blanks
 * 1) StripLeadTrail - strip leading s/^\s+// and trailing s/\s+$// blanks
 * 1) UnEscape - remove escape chars unless they're escaped
 * 2)   this code lifted from John Alden's Escape Delimiters
 * 3)     http://search.cpan.org/src/JOHNA/Text-EscapeDelimiters-1.004/lib/Text/EscapeDelimiters.pm
 * 4) Text::EscapeDelimiters v1.004
 * 5) (c) John Alden 2005. This library is free software; you can redistribute it
 * 6) and/or modify it under the same terms as Perl itself.
 * 1) and/or modify it under the same terms as Perl itself.

#Fold double-escapes down to single escapes $string =~ s/\Q$eseq$eseq\E/$eseq/gs;

return $string; }

sub ParseSQL {
 * 1) ParseSQL - read through the SQL file and build the data structures
 * 2) - read in one tuple at a time (currently one line but change to
 * 3)   buffered read later)
 * 4) - for each tuple parse out the pieces we need
 * 5) - add or update record in $collect hash, recording category and lexical key
 * 6)   (if we find a comma reversed version of the article name, it's probably
 * 7)   a better lexical key than we have so take it.)
 * 8)   - update lexical key, article name, category seen
 * 9)   - possibly strip blanks, change _ to blanks, remove \ escapes,
 * 10)     and reverse comma fields. (future: use list of articles with commas
 * 11)     in their names as refinement)
 * 1)     in their names as refinement)

my $rc=0; if (( -e $sqlFileName ) && ( -r $sqlFileName )) { open ($sqlH, "<", $sqlFileName) or die "can't open ".$sqlFileName." for reading \n"; } else { print "error with ".$sqlFileName." ... does not exist or not readable \n"; $rc=99; return $rc; }

if ($debug>0) {print "reading ".$sqlFileName."\n"; } my $sqlLine; my $sqlLC=0;

$Data::Dumper::Indent = 2;        # pretty print (3 is with array indices    $Data::Dumper::Useqq = 1;          # print strings in double quotes    $Data::Dumper::Pair = " : ";       # specify hash key/value separator    $Data::Dumper::Purity = 1;         # fill in the holes for eval    $Data::Dumper::Maxdepth = 3;       # no deeper than 3 refs down    $Data::Dumper::Deepcopy = 1;       # deep copy        for  {        undef $!;        unless (defined( $sqlLine = <$sqlH> )) {            die $! if $!;            last; # reached EOF        }        # we have to process lines that look like any of these         # (12731,'Catholics_not_in_communion_with_Rome','George Harrison',20060228150212),        #    ordinary	# (12731,'Deaths_by_lung_cancer','Harrison, George',20050904074730),	#    sort order is different (the article name is probably the first 12731 that doesn't	#    have a comma in the article name # (12731,'George_Harrison','',20060303000936), #   self ref... the category contains an article named the same thing # (2246703,'The_Beatles_songs','Don\'t Pass Me By',20050719071328), #   embedded escaped ' will screw up parse if not careful. # safe to process line as we got a line $sqlLC++; if ($debug>2) { print "line ".$sqlLC." was ".$sqlLine."\n"; } chomp $sqlLine; my($firstP, $secondP) = split(/',/, $sqlLine,2); if ($debug>2) { print "firstP: >".$firstP."< secondP: >".$secondP."< \n";} my($artKey, $catName) = split(/,'/,$firstP,2); $artKey=substr($artKey,1); $catName=ScoreToBlank($catName); if ($debug>2) { print "artKey: >".$artKey."< catName: >".$catName."< \n"; } my($artName, $timeStamp)=split(/',/,substr($secondP,1),2); # $timeStamp=split(/),/,$timeStamp,1);       $timeStamp=substr($timeStamp,0,-2);        if ($debug>2) {print "artName: >".$artName."< timeStamp: >".$timeStamp."< \n";}        if (0==length($artName)) { # empty, this is the case of matching art/cat names            $artName=$catName;        } else {            $artName=StripLeadTrail(UnEscape($artName));        }	my $sortKey="";	my $skHasComma=0;	my $anHasComma=0;

if (exists($is_catHash{$catName}) ) { if ($debug>1) { print "artName: >".$artName. "<\n timeStamp: >".$timeStamp. "<\n artKey: >".$artKey. "<\n catName: >".$catName."< \n"; print " ... one of our cats! \n"; }           if (exists($collect->{$artKey}) ) { if ($debug>1) { print "   ... and we have the article already\n"; }; $rec = $collect->{$artKey};    	# get ref to existing one $inCats= $rec->{inCats};		# and to the arrays it carries $nameVersions = $rec->{nameVersions}; } else { $rec={}; 			       # make an empty one $rec->{key}=$artKey;		# uses same key $inCats=; $nameVersions=; }           $inCats->{$catName}=1; $nameVersions->{$artName}=1; $rec->{'inCats'}=$inCats; $rec->{'nameVersions'}=$nameVersions; # put logic to handle making sure name of article for link is non comma $anHasComma= ( $artName =~/,/ ); my $artNameSave=$artName; if ($anHasComma) { # if article has comma flip it and save that as name $artNameSave=FlipComma($artName); }            if (!(exists($rec->{artLink}))) { $rec->{artLink}=$artNameSave; }           if ($debug>1) {print "\$artName: $artName \$artNameSave: $artNameSave	\n"} # put logic for sort key here if (exists($rec->{sortKey})) { $sortKey=$rec->{sortKey}; if ($debug>1) {print "sortKey: $sortKey\n"; } if ($sortKey ne $artName) { # If the keys are the same do nothing $skHasComma= ( $sortKey =~/,/ ); if ($debug>1) {print "anHasComma: $anHasComma skHasComma: $skHasComma\n";} if ($anHasComma eq $skHasComma) { # if neither has a comma, or both have a comma take whichever one is earlier in the alphabet if ($sortKey gt $artName) { $rec->{sortKey}= $artName; } # else not needed because sortKey already earlier, leave it. } else { # If the new key has a comma in it, use that one, it's probably the sort key if ($anHasComma) { $rec->{sortKey}= $artName; } # else not needed, leave as is       	    } if ($debug>1) {print "sortKey now is ".$rec->{sortKey}."\n"; } } # end of handling different keys } else { # we don't have it, save it away $rec->{sortKey}=$artName;	# since it's new, the sort key is the name we found if ($debug>1) {print "added sortKey: $rec->{sortKey}\n"; } } # end if sortKey does/doesn't exist $collect->{$artKey}=$rec; } # end if category is one we care about } # end for (the read loop)

if ($debug>0) { print "...collect: \n"; print Dumper($collect); }   if ($debug>0) {print "finished parsing SQL\n"; }

return $rc; } # end ParseSQL


 * 1) WriteTableHeader - create output table header
 * 1) WriteTableHeader - create output table header

sub WriteTableHeader { # assumes that $tableH is open and valid print $tableH <<END_TABLEH; {| {| width="100%" border="1" cellpadding="2" cellspacing="0" style="margin: 1em 1em 1em 0; background: #f9f9f9; border: 1px #aaa solid; border-collapse: collapse; font-size: 85%;" !width=20%|Article !width=15%|Categories !width=7%|Assessed !width=7%|Status !width=5%|Uses Infobox !width=37%|Comments and Pending tasks !width=8%|Assessed by END_TABLEH
 * valign=top|

return 0; }

sub WriteTableSecBreak { my $headChar=$_[0]; print $tableH "|-\n|colspan=\"7\" align=\"left\" style=\"background:white; font-size: 200%;"            	." font-weight:bold; border-bottom:4px solid grey; \"| \n" ."====".$headChar."====\n"; return 0; } # end WriteTableSecBreak
 * 1) WriteTableSecBreak - create output table break between sections
 * 1) WriteTableSecBreak - create output table break between sections


 * 1) WriteTable - create output table
 * 2) -  sort the data structure by the sort keys (which are the lexical
 * 3)   (sometimes comma inverted) article names) ... these keys are inside the
 * 4)   structure
 * 5) - using the sorted array of keys, iterate the hash in sort order
 * 6) - every time the first letter of the key changes, write out a SecBreak
 * 1) - every time the first letter of the key changes, write out a SecBreak

sub WriteTable { my $rc=0; if ($debug>2) { print" statting: ".$tableFileName."\n"; stat($tableFileName); print "Exists\n" if -e _; print "Readable\n" if -r _; print "Writable\n" if -w _; print "Executable\n" if -x _; print "Setuid\n" if -u _; print "Setgid\n" if -g _; print "Sticky\n" if -k _; print "Text\n" if -T _; print "Binary\n" if -B _; }

open ($tableH, '>', $tableFileName) or die "can't open ".$tableFileName." for writing \n"; $rc=&WriteTableHeader; if ($rc) { die "error building table header\n"; } # we want to create line pairs of the form # (with the pipe in col 1) #	|-   #	|Abbey Road (album)||category:The Beatles albums|| ||||unknown|| || #	|-   #	|Anthology 1||category:The Beatles albums|| ||||unknown|| || #    # in sorted order # make an array of the keys to the hash # (the article keys, which are not in any particular alpha)

my @keys = sort { $collect->{$a}->{sortKey}    # custom sort spec, use the lexical key cmp                            # (which is embedded in the rec) $collect->{$b}->{sortKey} } keys %{$collect}; my $firstLet=chr(00); # has to be lower than any other character val! # iterate in sorted order foreach my $artKey ( @keys ) { $rec = $collect->{$artKey};    # get easy access to the record $inCats= $rec->{inCats};	# and to the category array it carries my $artLink=$rec->{artLink}; my $trialFirst=substr($rec->{sortKey},0,1); # get first char if ($trialFirst ne $firstLet) { $firstLet=$trialFirst; if ($debug>1) {print "Switching to new first letter: $firstLet \n";} &WriteTableSecBreak($firstLet); } # end if new first letter in lexical order my ($catStr,$catV,$catK); $catStr=""; while (($catK, $catV) = each %{$inCats}) { if ("" ne $catStr) { $catStr.=" "; }	   $catStr.="category:".$catK.""; } # loop through the categories we saw if ($debug>0) { print "key: ".$artKey." rec key ".$rec->{key}." article Link text ".$artLink."\n"; } print $tableH "|-\n||".$artLink."||".$catStr."||| ||" .""		."||unknown|| || \n"; } # end of iteration through the hash in sorted order

# finish off table print $tableH "\n|}"; return $rc; }

my $rc=0; # print "prior to getopts\n"; getopts('hvd:q:c:o:', \%options) or &Usage; # debug also d
 * 1) Main routine -
 * 2) 	process options
 * 3) 	read in categories desired
 * 4) 	build hash of articles by parsing SQL file
 * 5) 	write out table file using hash
 * 6) main
 * 1) main
 * 1) main

# print "post getopts, pre process\n"; &ProcessOptions; if ($debug>1) { print "post process, pre read cat\n";	} $rc=&ReadCatFile; if ($rc) { die "error reading category list\n"; }

$rc=&ParseSQL; if ($rc) { die "error reading SQL or building structure\n"; }

$rc=&WriteTable; if ($rc) { die "error building table\n"; }

exit 0;