User:Visviva/Code

Deletion sorting
This is not a substitute for human deletion sorting, but it should help to get the ball rolling. Here are the steps I'm currently following:

Prepare category data
 * 1)  From the most recent database dump, download the all-articles XML.  Unfortunately this is too large (about 6.9 GB) for PHP to handle directly.
 * 2)  Use split to cut this into chunks of a size that PHP can handle.
 * 3)  Run fetch.php to extract all category relationships and insert them into an SQL table.
 * 4)  Run getcats.php to generate a list of subcats to the desired depth (10 seems to be too deep; 5 may be ideal).  Due to the ontological faults in the category structure, this list is inevitably either incomplete or inconsistent, and possibly both.

Sort deletions
 * 1)  Using the wikitext from a given day's deletion log, generate a list of URLs of the day's deletion candidates.
 * 2)  Use Wget to download all files into the current directory.
 * 3)  Run index.php to generate a list of categorized and uncategorized articles, and a set of recommendations for sorting.

...This is rather clunky and I'm quite open to suggestions for streamlining the process.

Fetch.php
Digests a MediaWiki XML dump into an SQL table of 1:1 category relationships. Requires an existing MySQL database ('wiki') containing a table called 'wiki' with three columns for id, supercat, and subcat.

<?php
/*
 * fetch.php
 *
 * Reads the chunks of a MediaWiki XML dump named in $filenames.  For every
 * category page found, each "[[Category:...]]" link in that page becomes one
 * row of table `wiki` in database `wiki`: (id, supercat, subcat), where
 * subcat is the page being read and supercat is the category it links to.
 *
 * NOTE(review): the tag literals '<title>Category:', '</title>' and '</page>'
 * had been reduced to bare spaces in the published copy of this script, which
 * made the end-of-page test always false.  They are reconstructed here from
 * the MediaWiki XML export format -- confirm against a real dump.
 *
 * NOTE(review): the mysql_* functions belong to the old ext/mysql extension;
 * a current PHP runtime needs a port to mysqli or PDO.
 */

set_time_limit(0);

//File chunks into which the XML dump was broken. May run above 50.
$filenames=array('xaa','xab','xac');

$startcat='<title>Category:';
$titleend='</title>';
$pageend='</page>';
$findme='[[Category:';

//One connection serves every chunk.
$link=mysql_connect('localhost','username','password');
if (!$link) {
    echo mysql_error();
    exit(1);
}

//Running row id, shared by all chunks.
$count=0;

//Page state is kept across chunks: the chunks are consecutive pieces of one
//dump, so a category page may begin in one chunk and end in the next.
$active=false;
$catname='';

foreach ($filenames as $filename) {
    $file=fopen($filename,'rb');
    if (!$file) {
        echo "Could not open $filename, skipping. ";
        continue;
    }
    echo "using $filename. ";

    while (($line=fgets($file)) !== false) {
        if (!$active) {
            //Still looking for the title line of a category page.
            $pos=strpos($line,$startcat);
            if ($pos === false) {
                continue;
            }
            $catname=substr($line,$pos+strlen($startcat));
            $end=strpos($catname,$titleend);
            $catname=($end === false) ? rtrim($catname) : substr($catname,0,$end);
            $active=true;
            continue;
        }

        //Inside a category page: insert one row per supercategory link;
        //report any errors.
        foreach (fetch_extract_categories($line,$findme) as $catin) {
            $count++;
            $safein=addslashes($catin);
            $current=addslashes($catname);
            $result=mysql_db_query('wiki',"INSERT INTO wiki VALUES ('$count','$safein','$current');",$link);
            if (!$result) {
                echo "Sorry, no dice on $count:$catname:$catin -- returned error was: ".mysql_error()." ";
            }
        }

        //The closing page tag ends the current category page.
        if (strpos($line,$pageend) !== false) {
            $active=false;
        }
    }

    fclose($file);
}

echo "Script execution complete.  Congratulations!";

/**
 * Returns the category names linked from one line of wikitext.
 *
 * The line is split on $findme ("[[Category:").  The piece in front of the
 * first link is never a category name and is dropped.  Every other piece is
 * cut at the first "|" (sort key) or "]]" (end of link), whichever comes
 * first; pieces containing neither, or yielding an empty name, are skipped.
 *
 * @param string $line   one line of wikitext
 * @param string $findme the link prefix to split on
 * @return array list of category names, in order of appearance
 */
function fetch_extract_categories($line,$findme)
{
    $found=array();
    $chunks=explode($findme,$line);
    array_shift($chunks);

    foreach ($chunks as $chunk) {
        $bar=strpos($chunk,'|');
        $close=strpos($chunk,']]');

        if ($bar === false && $close === false) {
            continue;
        }
        if ($bar === false) {
            $cut=$close;
        } elseif ($close === false) {
            $cut=$bar;
        } else {
            $cut=min($bar,$close);
        }
        if ($cut === 0) {
            continue;
        }
        $found[]=substr($chunk,0,$cut);
    }

    return $found;
}
?>

Getcats.php
Generates a list of descendant cats for each sortpage. This takes an eternity to run. It may be necessary to abandon PHP/MySQL in favor of something more sane, like Perl and text files.

To save space, I've omitted the array of sortpages. You can find it in index.php, just below.

August 15 2006: The last execution stopped at Organizations and programs.

 <?php
/*
 * getcats.php
 *
 * For every sortpage => category pair returned by get_cats(), collects the
 * category's descendants down to $inc further levels using get_subcats(),
 * and stores them as one space-separated string in table $tablename.
 *
 * NOTE(review): in the published copy everything from the opening PHP tag
 * up to the "$category) {" of the loop header was missing.  The setup below
 * is a reconstruction and must be confirmed:
 *   - connection: same placeholder credentials as fetch.php
 *   - $inc=5: the accompanying notes call a depth of 5 "ideal"
 *   - $tablename='subcats': the table index.php reads its lists from
 *   - loop source get_cats(): the notes say the sortpage array is the one
 *     found in index.php
 *
 * NOTE(review): rows are stored under the sortpage name ($page), while
 * index.php looks them up by category name; the two differ for entries such
 * as "Authors" => "Writers" -- verify which one is intended.
 */

set_time_limit(0);

$link=mysql_connect('localhost','username','password');
if (!$link) {
    echo mysql_error();
    exit(1);
}

$inc=5;
$tablename='subcats';

foreach (get_cats() as $page => $category) {
    $subcats=get_subcats($category);
    $output=$subcats;
    $already=array();

    //Scan subcategories to depth $inc
    for ($x=0; $x<$inc; $x++) {
        $nextround=array();

        //Go through the subcategories, adding new subcats to the pile.
        foreach ($subcats as $subcat => $blank) {
            if (isset($already[$subcat])) {
                continue;
            }
            $already[$subcat]=1;
            $newlist=get_subcats($subcat);
            //Array union, not array_merge(): the category names are the
            //keys, and array_merge() would renumber purely numeric names
            //such as "2006".
            $nextround+=$newlist;
            $output+=$newlist;
        }
        $subcats=$nextround;
    }

    echo " Found the following categories for $category: ";
    foreach ($output as $out => $nothing) {
        echo "$out ";
    }

    //Root category first, then every descendant, separated by single spaces.
    $final=implode(' ',array_merge(array($category),array_keys($output)));

    $page=addslashes($page);
    $final=addslashes($final);
    $result=mysql_db_query('wiki',"INSERT INTO $tablename VALUES ('$page','$final');",$link);
    if (!$result) {
        echo mysql_error();
    }
}

/**
 * Returns the map of sortpage name => root category name.
 *
 * The entries are omitted in this listing; per the note below they are the
 * same as in index.php's get_cats().  As written, the map is empty.
 *
 * @return array sortpage name => category name
 */
function get_cats()
{
    $array=array();
    //Same as the array in index.php

    return $array;
}

//Returns subcategories in an associative array
//The keys are subcat names, the values are all blank strings.
//
//An empty $cat yields an empty array without touching the database.
//On a query error the MySQL error text is echoed and an empty array is
//returned.
function get_subcats($cat)
{
    //The connection is opened by the calling script.  Without this
    //declaration $link would be an undefined local inside the function.
    global $link;

    if (!$cat) {
        return array();
    }

    $subcats=array();
    $slashcat=addslashes($cat);
    $result=mysql_db_query('wiki',"SELECT `subcat` from wiki WHERE `supercat`='$slashcat'",$link);
    if (!$result) {
        echo mysql_error();
        return array();
    }

    while ($line=mysql_fetch_array($result)) {
        $subcat=trim(stripslashes($line[0]));
        $subcats[$subcat]='';
    }

    return $subcats;
}
?>

Index.php
Just a start... This must be in the same directory as the downloaded AfD candidate pages. Also in the directory must be a file called "list.txt," containing a URL on each line, which was used by Wget to download the pages.

<?php
/*
 * index.php
 *
 * Reads list.txt (one article URL per line), loads the downloaded copy of
 * each article from the current directory, extracts the categories each
 * article links to, runs a simple keyword scan, and prints:
 *   - per article: its categories and keyword matches
 *   - the articles found in no category
 *   - per sortpage: articles whose categories appear in that sortpage's
 *     subcategory list, plus the keyword matches
 *
 * NOTE(review): the published copy of this script had lost its markup; the
 * echo strings are reproduced as found.  The two content markers below had
 * been reduced to a single space (so the extracted length was always 0) and
 * are reconstructed -- confirm against a downloaded page.
 *
 * NOTE(review): category names taken from page links use underscores (see
 * "Articles_for_deletion" below), while get_subcats() splits its stored list
 * on spaces; multi-word categories probably cannot match -- verify.
 */

//Get list
$filename="list.txt";
$lines=file($filename);
if ($lines === false) {
    echo "Could not read $filename. ";
    exit(1);
}
$cutstring="http://en.wikipedia.org/wiki/";

//NOTE(review): reconstructed markers, see header.
$contentstart='<!-- start content -->';
$contentend='<!-- end content -->';

$pagenames=array();
$pages=array();
$output=array();
$keywordmatches=array();
$catmatches=array();
$delcats=get_cats();

//Your date here.
$date='2006 August 14';

//Turn each URL into the local file name it was saved under.
foreach ($lines as $pagename) {
    $pagename=trim($pagename);
    if ($pagename === '') {
        continue;
    }
    $pagenames[]=substr($pagename,strlen($cutstring));
}

//Get contents of all AfD'd pages
foreach ($pagenames as $pagename) {
    $contents=file_get_contents($pagename);
    if ($contents === false) {
        continue;
    }
    if (strpos($contents,"Wikipedia does not have an article") !== false) {
        continue;
    }
    //Keep only the span between the content markers; when the markers
    //are not both present, keep the page whole.
    $start=strpos($contents,$contentstart);
    $stop=strpos($contents,$contentend);
    if ($start !== false && $stop !== false && $stop > $start) {
        $contents=substr($contents,$start,$stop-$start);
    }
    $pages[$pagename]=$contents;
}

//Extract categories
foreach ($pages as $name=>$page) {
    $search="/wiki/Category:";
    $count=substr_count($page,$search);
    $posse=array();

    //Each pass consumes $page up to and including the next category link
    //prefix, then reads the name up to the closing quote of the href.
    for ($a=0; $a<$count; $a++) {
        $x=strpos($page,$search);
        $page=substr($page,$x+strlen($search));
        $catname=substr($page,0,strpos($page,'"'));

        //Maintenance categories say nothing about the topic.
        if ($catname === "Articles_for_deletion"
            || $catname === "Category_needed"
            || strpos($catname,"Cleanup") !== false
            || strpos($catname,"cleanup") !== false) {
            continue;
        }
        $posse[]=$catname;
    }

    //Keyword scan:  currently just scans for category name, could be
    //expanded into full list of likely keywords.
    //NOTE(review): $page has been consumed by the loop above, so only the
    //text after the last category link is scanned -- confirm intent.
    $matches=array();
    foreach ($delcats as $delcat => $dummy) {
        //Avoid keywords appearing in the boilerplate ... this could be
        //removed if the scan focuses only on the article text.
        if ($delcat == 'Media') {
            continue;
        }
        $searchstring=$delcat;
        if ($delcat == 'Events') {
            $searchstring='Event';
        }
        if ($delcat == 'History') {
            $searchstring='Historic';
        }
        if (strpos($page,$searchstring) !== false
            || strpos($page,strtolower($searchstring)) !== false) {
            $matches[]=$delcat;
            $catmatches[$delcat][]=$name;
        }
    }
    $keywordmatches[$name]=$matches;
    $output[$name]=$posse;
}

//Output
echo "  Category Output   ";
$nocats=array();
echo " Output for $date ";
foreach ($output as $out => $cats) {
    if (empty($cats) || !is_array($cats)) {
        $nocats[]=$out;
        $cats=array();
    }
    echo $out.' ';
    echo 'Deletion debate: Wikipedia:Articles for deletion/'.$out.' ';

    echo "$out is located in the following categories: ";
    $x=0;
    foreach ($cats as $cat) {
        $x++;
        echo $x.'. '.$cat.' ';
    }

    echo "$out contained a keyword for the following categories: ";
    $y=0;
    foreach ($keywordmatches[$out] as $match) {
        $y++;
        echo $y.'. '.$match.' ';
    }
    echo " ";
}

//List articles not in categories
echo " Not Categorized ";
foreach ($nocats as $nocat) {
    echo $nocat.' ';
}

//Suggestions per sortpage
$delcats=get_cats();
foreach ($delcats as $delpage => $delcat) {
    $suggest=array();
    echo " $delpage ";

    //Flip the list so that membership is a key lookup.  isset() is
    //required: the first name flips to index 0, which a plain truth test
    //would treat as "not found".
    $notyet=get_subcats($delcat);
    $subcats=array_flip($notyet);

    foreach ($output as $key => $cats) {
        if (!is_array($cats)) {
            continue;
        }
        foreach ($cats as $cat) {
            if (isset($subcats[$cat])) {
                $suggest[]=$key;
                break;  //one suggestion per article is enough
            }
        }
    }

    if (empty($suggest)) {
        echo "No matches found. ";
    } else {
        echo 'Suggest for '.$delpage.':';
        foreach ($suggest as $suggestion) {
            echo ' '.$suggestion;
        }
    }

    //Keyword matches were recorded under the sortpage name (the key of
    //get_cats()), so that is the key to look them up by.
    if (!empty($catmatches[$delpage])) {
        echo " Keyword suggestions for $delpage: ";
        foreach ($catmatches[$delpage] as $matched) {
            echo ' '.$matched;
        }
    }
}
echo "  ";

//Functions

/**
 * Returns the stored subcategory list for one category.
 *
 * Reads column `subcats` of the row in table `subcats` whose `supercat`
 * equals $cat, and splits it on single spaces.  Returns an empty array when
 * the connection or query fails (the MySQL error text is echoed), when no
 * row exists, or when the stored list is empty.
 *
 * @param string $cat category name to look up
 * @return array list of names from the stored row
 */
function get_subcats($cat)
{
    //NOTE(review): credentials are hard-coded; move them out of the source.
    $link=mysql_connect('localhost','monty','cifra99');
    if (!$link) {
        echo mysql_error();
        return array();
    }

    $safecat=addslashes($cat);
    $result=mysql_db_query('wiki',"SELECT `subcats` from subcats WHERE `supercat`='$safecat'",$link);
    if (!$result) {
        echo mysql_error();
        return array();
    }

    $list=mysql_fetch_assoc($result);
    if (!$list) {
        return array();
    }

    $subcatlist=stripslashes($list['subcats']);
    if ($subcatlist === '') {
        return array();
    }

    return explode(' ',$subcatlist);
}

/**
 * Returns the map of deletion-sorting page name => root category name.
 *
 * Keys and values are reproduced exactly as found, including the trailing
 * space in "Australia ", "Canada " and "US ".
 *
 * @return array sortpage name => category name
 */
function get_cats()
{
    $array=array(
        "Alabama" => "Alabama",
        "Albania" => "Albania",
        "Antarctica" => "Antarctica",
        "Argentina" => "Argentina",
        "Arizona" => "Arizona",
        "Armenia" => "Armenia",
        "Arts" => "Arts",
        "Athletes" => "Athletes",
        "Australia " => "Australia",
        "Austria" => "Austria",
        "Authors" => "Writers",
        "Azerbaijan" => "Azerbaijan",
        "Bands" => "Musical groups",
        "Bangladesh" => "Bangladesh",
        "Barbados" => "Barbados",
        "Belgium" => "Belgium",
        "Brazil" => "Brazil",
        "Brunei" => "Brunei",
        "Bulgaria" => "Bulgaria",
        "Business" => "Business",
        "Businesses" => "Companies",
        "Businesspeople" => "Businesspeople",
        "California" => "California",
        "Cambodia" => "Cambodia",
        "Canada " => "Canada",
        "China" => "China",
        "Colombia" => "Colombia",
        "Colorado" => "Colorado",
        "Comics and animation" => "Animation",
        //NOTE(review): "Computers" => "Cartooning" looks like a slip -- verify.
        "Computers" => "Cartooning",
        "Connecticut" => "Connecticut",
        "Croatia" => "Croatia",
        "Cricket" => "Cricket",
        "Cuba" => "Cuba",
        "Czech Republic" => "Czech Republic",
        "Delaware" => "Delaware",
        "Denmark" => "Denmark",
        "Ecuador" => "Ecuador",
        "Education" => "Education",
        "Egypt" => "Egypt",
        "Estonia" => "Estonia",
        "Ethiopia" => "Ethiopia",
        "Events" => "Events",
        "Fashion" => "Fashion",
        "Fictional characters" => "Fictional characters",
        "Film and TV" => "Mass media",
        "Finland" => "Finland",
        "Florida" => "Florida",
        "Food and drink" => "Food and drink",
        "Fora" => "Internet forums",
        "France" => "France",
        "Game-related" => "Games",
        "Georgia" => "Georgia",
        "Germany" => "Germany",
        "Greece" => "Greece",
        "Guatemala" => "Guatemala",
        "Hawaii" => "Hawaii",
        "History" => "History",
        "Hong Kong" => "Hong Kong",
        "Hungary" => "Hungary",
        "Iceland" => "Iceland",
        "Idaho" => "Idaho",
        "Illinois" => "Illinois",
        "India" => "India",
        "Indiana" => "Indiana",
        "Indonesia" => "Indonesia",
        "Internet" => "Internet",
        "Iowa" => "Iowa",
        "Iran" => "Iran",
        "Iraq" => "Iraq",
        "Ireland" => "Ireland",
        "Levant" => "Levant",
        "Italy" => "Italy",
        "Japan" => "Japan",
        "Judaism" => "Judaism",
        "Kentucky" => "Kentucky",
        "Korea" => "Korea",
        "Language" => "Language",
        "Latvia" => "Latvia",
        "Lists" => "Lists",
        "Lists of people" => "Lists of people",
        "Lithuania" => "Lithuania",
        "Louisiana" => "Louisiana",
        "Luxembourg" => "Luxembourg",
        "Macedonia" => "Macedonia",
        "Maine" => "Maine",
        "Malaysia" => "Malaysia",
        "Martinique" => "Martinique",
        "Maryland" => "Maryland",
        "Massachusetts" => "Massachusetts",
        "Media" => "Media",
        "Mexico" => "Mexico",
        "Michigan" => "Michigan",
        "Middle East" => "Middle East",
        "Military and combat" => "Military",
        "Minnesota" => "Minnesota",
        "Mississippi" => "Mississippi",
        "Missouri" => "Missouri",
        "Montana" => "Montana",
        "Montserrat" => "Montserrat",
        "Music" => "Music",
        "Nebraska" => "Nebraska",
        "Nevada" => "Nevada",
        "Netherlands" => "Netherlands",
        "New Hampshire" => "New Hampshire",
        "New Jersey" => "New Jersey",
        "New Mexico" => "New Mexico",
        "New York" => "New York",
        "New Zealand" => "New Zealand",
        "Nigeria" => "Nigeria",
        "North Carolina" => "North Carolina",
        "Norway" => "Norway",
        "Oceania" => "Oceania",
        "Ohio" => "Ohio",
        "Oklahoma" => "Oklahoma",
        "Oregon" => "Oregon",
        "Organizations and programs" => "Organizations",
        "Pakistan" => "Pakistan",
        "Panama" => "Panama",
        "Pennsylvania" => "Pennsylvania",
        "People" => "People",
        "Philippines" => "Philippines",
        "Poetry" => "Poetry",
        "Poland" => "Poland",
        "Politicians" => "Politicians",
        "Politics and law" => "Government",
        "Portugal" => "Portugal",
        "Publications" => "Publications",
        "Puerto Rico" => "Puerto Rico",
        "Quebec" => "Quebec",
        "Recreation" => "Recreation",
        "Religion and philosophy" => "Belief",
        "Rhode Island" => "Rhode Island",
        "Romania" => "Romania",
        "Russia" => "Russia",
        "Saudi Arabia" => "Saudi Arabia",
        "Schools" => "Schools",
        "Science and medicine" => "Science",
        "Science fiction" => "Science fiction",
        "Senegal" => "Senegal",
        "Sexuality and gender" => "Human sexuality",
        "Singapore" => "Singapore",
        "Slovakia" => "Slovakia",
        "Slovenia" => "Slovenia",
        "Social science" => "Social sciences",
        "South Africa" => "South Africa",
        "South Carolina" => "South Carolina",
        "Spain" => "Spain",
        "Sports" => "Sports",
        "Sri Lanka" => "Sri Lanka",
        "Sweden" => "Sweden",
        "Switzerland" => "Switzerland",
        "Taiwan" => "Taiwan",
        "Tajikistan" => "Tajikistan",
        "Tanzania" => "Tanzania",
        "Technology" => "Technology",
        "Tennessee" => "Tennessee",
        "Texas" => "Texas",
        "Transportation" => "Transportation",
        "Turkey" => "Turkey",
        "Uganda" => "Uganda",
        "UK" => "United Kingdom",
        "Uruguay" => "Uruguay",
        "Ukraine" => "Ukraine",
        "US " => "United States",
        "Utah" => "Utah",
        "Venezuela" => "Venezuela",
        "Vermont" => "Vermont",
        "Vietnam" => "Vietnam",
        "Visual arts" => "Visual arts",
        "Virginia" => "Virginia",
        "Washington" => "Washington",
        "Washington, DC" => "Washington, D.C.",
        "Webcomics" => "Webcomics",
        "Websites" => "Websites",
        "West Virginia" => "West Virginia",
        "Wisconsin" => "Wisconsin",
        "Words" => "Words",
        "Writing" => "Writing",
        "Yugoslavia" => "Yugoslavia",
        "Zambia" => "Zambia",
    );

    return $array;
}
?>