User:Cedars/gaauto.pl

The following Perl script is a hack that automatically creates a categorized list of good articles in the same format as the good articles page. The script understands quoted, italicized and disguised article links. It uses the existing list as a basis for the new list. It removes old good articles from the revised list and offers the user the opportunity to categorize new good articles. It sorts and counts every article and can automatically adapt to use new headings and subheadings. It allows dual listings and major headings. The script uses cURL to download existing content and the Roman 1.1 Perl module to sort Final Fantasy titles. The script is designed to assist the human editing of Wikipedia articles, not replace it. It is best that users still add and remove articles from the list as they would without the script - this is because they are likely to categorize the items better than the script user. Please feel free to make changes to this page if you feel they would improve the script. If you have comments on the script please feel free to post them on the talk page.

A brief note on output

The script downloads several files to the working directory and outputs two files. The first file, "output_headings.txt", lists the levels and sublevels available for categorization. This file is written before any requests for categorization are made. The second file, "output.txt", is the formatted wiki-syntax for the list. It may appear corrupt if not opened using UTF-8. The script also outputs a timestamp list recording when the most recently added articles were added, "stamp.time", and a backup of the previous version of that list, "stamp.bac". If the timestamp list and backup are dramatically different from each other the script will refuse to run. This is to prevent the timestamp list from becoming distorted and thus damaging the recently added list.

To view script properly use edit mode


#!/usr/bin/perl

use Roman; use open ':utf8';

# Script configuration.  Each setting is a plain package variable read by the
# rest of the script.

# Download a fresh copy of files
$DOWNLOAD = 1;

# Warn of removed articles
$REMOVED = 1;

# Number of new articles to remember
$NEWARTICLES = 14;

# Should open web browser or text editor
$ADVANCED = 0;

# Adds section comments (improves editing)
$SECTIONCOMMENTS = 0;

# Web browser and text editor commands
$WEBBROWSER = "open";
$TEXTEDITOR = "open";

# Sorts article titles.
#
# titlesort is the comparator handed to sort(); it forwards sort's package
# variables $a and $b to titlecmp.
sub titlesort {
    return titlecmp($a, $b);
}

# Compares two article records (hash references with a "name" key) by the
# title a reader sees.
#   - If both names contain "Final Fantasy <roman numeral>", the numerals are
#     converted with arabic() and compared numerically.
#   - Otherwise italic markers ('') are removed, the text after the last "|"
#     (the displayed part of a piped link) is used when present, and the
#     results are compared case-insensitively.
# Returns -1, 0 or 1.
sub titlecmp {
    # Grab name.  The arguments are taken from @_ explicitly: "%{shift}" is
    # parsed by Perl as the hash variable %shift, not as a call to shift, so
    # the old code always compared two undefined names.
    my ($x_ref, $y_ref) = @_;
    my $x = $x_ref ? $x_ref->{"name"} : undef;
    my $y = $y_ref ? $y_ref->{"name"} : undef;
    $x = "" unless defined $x;
    $y = "" unless defined $y;

    # Handle Final Fantasy titles.  The class is [XVI]; the former [X|V|I]
    # also accepted a literal "|".
    if ($x =~ /Final Fantasy ([XVI]+)/) {
        my $x_numeral = $1;
        if ($y =~ /Final Fantasy ([XVI]+)/) {
            my $y_numeral = $1;
            return arabic($x_numeral) <=> arabic($y_numeral);
        }
    }

    # Handle other titles.
    # NOTE(review): this text previously also ran "s/.*//g" on both names,
    # which erases the whole title and makes every comparison return 0.  It
    # looks like a markup-stripping pattern whose tag text was lost; it has
    # been dropped here -- confirm against a pristine copy of the script.
    for my $title ($x, $y) {
        $title =~ s/''//g;
        if ($title =~ /.*\|(.*)/) {
            $title = $1;
        }
    }
    return uc($x) cmp uc($y);
}

# Sorts article names.
#
# basicsort is the comparator handed to sort(); it forwards sort's package
# variables $a and $b to basiccmp.
sub basicsort {
    return basiccmp($a, $b);
}

# Compares two article records (hash references with a "name" key) by link
# target: italic markers ('') are removed, the text before the last "|" is
# used when the name is a piped link, and the results are compared
# case-insensitively.  Returns -1, 0 or 1.
sub basiccmp {
    # Read the arguments from @_ explicitly: "%{shift}" is parsed by Perl as
    # the hash variable %shift, not as a call to shift, so the old code always
    # compared two undefined names.
    my ($x_ref, $y_ref) = @_;
    my $x = $x_ref ? $x_ref->{"name"} : undef;
    my $y = $y_ref ? $y_ref->{"name"} : undef;
    $x = "" unless defined $x;
    $y = "" unless defined $y;

    # NOTE(review): this text previously also ran "s/.*//g" on both names,
    # which erases the whole name and makes every comparison return 0.  It
    # looks like a markup-stripping pattern whose tag text was lost; it has
    # been dropped here -- confirm against a pristine copy of the script.
    for my $name ($x, $y) {
        $name =~ s/''//g;
        if ($name =~ /(.*)\|.*/) {
            $name = $1;
        }
    }
    return uc($x) cmp uc($y);
}

# Sorts article time stamps.
#
# timesort is the comparator handed to sort(); it forwards sort's package
# variables $a and $b to timecmp.
sub timesort {
    return timecmp($a, $b);
}

# Compares two article records (hash references with a "time" key) so that
# the largest time stamp sorts first.  A negative time stamp sorts after
# every non-negative one; two negative time stamps compare equal.
# Returns -1, 0 or 1.
sub timecmp {
    # Read the arguments from @_ explicitly: "%{shift}" is parsed by Perl as
    # the hash variable %shift, not as a call to shift, so the old code always
    # compared two undefined time stamps.
    my ($x_ref, $y_ref) = @_;
    my $x = $x_ref ? $x_ref->{"time"} : undef;
    my $y = $y_ref ? $y_ref->{"time"} : undef;
    $x = 0 unless defined $x;
    $y = 0 unless defined $y;

    if ($x < 0 && $y < 0) {
        return 0;
    }
    elsif ($x < 0) {
        return 1;
    }
    elsif ($y < 0) {
        return -1;
    }
    # Both are real time stamps: descending order.
    return $y <=> $x;
}

# Keep backup of timestamp file.
# If a backup already exists and its size differs from the current timestamp
# file by more than 1024 bytes, stop so that a damaged "stamp.time" is not
# copied over the last good backup.
if (-f "stamp.bac") {
    # -s gives no usable number for a missing or empty file; count it as 0.
    $stamp_size     = (-s "stamp.time") || 0;
    $stamp_bac_size = (-s "stamp.bac")  || 0;
    if (abs($stamp_size - $stamp_bac_size) > 1024) {
        print "Large change in timestamp file. This script will now quit to prevent data loss.\n";
        print "Please delete the \"stamp.bac\" file to continue.\n";
        exit(1);
    }
}

# On a first run there is no timestamp file yet, so there is nothing to copy.
if (-f "stamp.time") {
    if (system("cp", "stamp.time", "stamp.bac") != 0) {
        print "Warning: could not back up \"stamp.time\" to \"stamp.bac\".\n";
    }
}

# Download the current good articles file (the edit view of the page) into
# "input_ga.html".  Skipped when $DOWNLOAD is false, in which case a copy
# saved by an earlier run is read instead.
if ($DOWNLOAD) {
    $curl_status = system "curl \"http://en.wikipedia.org/w/index.php?title=Wikipedia:Good_articles&action=edit\" > input_ga.html";
    if ($curl_status != 0) {
        # Report a failed transfer here rather than parsing an empty file.
        print "Download of good article list failed.\n";
        exit(1);
    }
}

# Read the good articles file into @input, one element per line, and record
# the number of lines in $input_len.
if (!open(FILE, "<", "input_ga.html")) {
    print "Cannot open \"input_ga.html\": $!\n";
    exit(1);
}
@input = <FILE>;
close(FILE);
$input_len = scalar @input;

# Parse the downloaded good articles edit page held in @input.
#
# The loop is a three-state scanner:
#   * before the textarea: lines are skipped until one contains "textarea";
#   * preamble ($preamble_on): every line is stored in @preamble, the first
#     one with everything up to its last ">" removed; the line containing
#     "Gapages" is the last preamble line and switches to the main state;
#   * main ($main_on): lines are tested for language links (@lang), the
#     recently-added image ($new_articles_image), major headings
#     (@major_text / @major_icon), headings (@headings, @headings_icon,
#     @headings_major, @subheadings_len), "=====subheading=====" lines and
#     article links.  Articles are only collected between a subheading and
#     the next line containing "/div" ($start).
#
# Each article becomes a hash in @articles with the keys "name", "level",
# "sublevel", "italic", "quote", "verified" (0), "multi" (0), "time" (the
# current time) and optionally "comment".
#
# NOTE(review): this block shows signs of text-extraction damage and should
# be restored from a pristine copy before it is run:
#   - the statements sit on lines that begin with "#", so as laid out here
#     Perl treats them as comments;
#   - "s/&lt;//g" deletes "&lt;" and no "&gt;" decoding is visible;
#   - the pattern "/colspan=2.*\/) {" has no closing delimiter or capture,
#     yet $1 is read right after it;
#   - the major-heading and heading tests use the same pattern
#     "/]*>([^<]*)<\/div>/", which starts mid-expression;
#   - the language, article and comment patterns ("\[\^W]...", "\[\^\*\]\]")
#     look truncated.
$major = -1; $level = -1; $sublevel = 0; $headings_len = 0; $articles_len = 0; $preamble_len = 0; $preamble_on = 0; $main_on = 0; $lang_len = 0; for ($i = 0; $i < $input_len; $i++) {
 * 1) Go through each line of the good articles file

# Get the current line $curline = $input[$i]; $curline =~ s/&amp;/&/g; $curline =~ s/&lt;//g; $curline =~ s/&quot;/\"/g;	# Handle preamble	if ($preamble_on) {		if ($curline =~ /Gapages/) {			$preamble_on = 0;			$main_on = 1;		}		if ($preamble_len == 0) {			$curline =~ s/.*>//;		}		$preamble[$preamble_len] = $curline;		$preamble_len++;	}	elsif ($main_on) {		# If it is a language remember it		if ($curline =~ /\[\^W][^P]\:[^\*\]\]/) {			$lang[$lang_len] = $curline;			$lang_len++;		}		# If it is a recently added article image remember it		if ($curline =~ /colspan=2.*\/) {			$new_articles_image = $1;		}		# If it is a major heading add it to the major headings		if ($curline =~ /]*>([^<]*)<\/div>/) { $major += 1; $realpart = $1; $imagpart = $1; $realpart =~ s/\[\[.*\]\]//; $realpart =~ s/'''//g; $imagpart =~ s/[^\]]*$//; $major_text[$major] = $realpart; $major_icon[$major] = $imagpart; }		# If it is a heading add it to the headings if ($curline =~ /]*>([^<]*)<\/div>/) { $level += 1; $sublevel = 0; $headings_len += 1; $subheadings_len[$level] = 0; $sound = 1; $realpart = $1; $imagpart = $1; $realpart =~ s/\[\[.*\]\]//; $imagpart =~ s/[^\]]*$//; $headings[$level][$sublevel] = $realpart; $headings_icon[$level] = $imagpart; $headings_major[$level] = $major; }		# If it is a subheading add it to the headings and start counting articles if ($curline =~ /=====(.*)=====$/) { $sublevel += 1; $subheadings_len[$level] += 1; $headings[$level][$sublevel] = $1; $start = 1; }		# If it is an div stop counting articles if ($curline =~ /\/div/) { $start = 0; }		# If it is an article add it to the articles list if ($start && $curline =~ /\[\^\*\]\]/) { $searchstr = $curline; $searchstr =~ s/.*\[\[([^\]]*)\]\].*\n$/\1/; if ($curline =~ /.*\[\^\*\]\].*.*\n$/) { $commentstr = $curline; $commentstr =~ s/.*\[\^\*\]\].*.*\n$/\1/; $commentstr =~ s/\ +$//; $articles[$articles_len]{"comment"} = $commentstr; }			if ($curline =~ /^\ *\'\'/) { $articles[$articles_len]{"italic"} = 1; } else { 
$articles[$articles_len]{"italic"} = 0; } if ($curline =~ /^\ *&quot/ || $curline =~ /^\ *\"/) { $articles[$articles_len]{"quote"} = 1; }			else { $articles[$articles_len]{"quote"} = 0; }			$articles[$articles_len]{"name"} = $searchstr;			$articles[$articles_len]{"level"} = $level;			$articles[$articles_len]{"sublevel"} = $sublevel;			$articles[$articles_len]{"verified"} = 0;			$articles[$articles_len]{"multi"} = 0;			$articles[$articles_len]{"time"} = time;			$articles_len += 1;		}	}	else {		if ($curline =~ /textarea/) {			$preamble_on = 1;		}	} }

# Check download worked: parsing the good articles page must have produced
# at least one article, otherwise stop.
if ($articles_len == 0) {
    print "Download of good article list failed.\n";
    exit(1);
}

# Sort the articles list by link target (see basiccmp) so that repeated
# entries for the same article end up next to each other.
@articles = sort basicsort @articles;

# Check for multiple entries.
# @articles is sorted, so two listings of the same article are adjacent.  The
# first listing is kept; a repeat marks it "multi" and records the repeat's
# level and sublevel as "sec_level" / "sec_sublevel".  A further repeat
# overwrites that secondary position.
@narticles = ($articles[0]);    # start from a clean list, not stale contents
for ($i = 1; $i < $articles_len; $i++) {
    if (basiccmp($articles[$i], $articles[$i - 1]) == 0) {
        $narticles[-1]{"multi"}        = 1;
        $narticles[-1]{"sec_level"}    = $articles[$i]{"level"};
        $narticles[-1]{"sec_sublevel"} = $articles[$i]{"sublevel"};
    }
    else {
        push @narticles, $articles[$i];
    }
}
$narticles_len = scalar @narticles;
@articles      = @narticles;
$articles_len  = $narticles_len;

# Walk the "Wikipedia good articles" category listing page by page.
#
# $next holds the URL of the page to fetch and starts at the category's first
# page.  Page number $i is saved as "input_cat$i.html" (fetched with curl
# when $DOWNLOAD is set) and read into @input.  $next is reset to -1 before
# the page is scanned and is set to "http://en.wikipedia.org" plus the
# captured path when a "next 200" link is matched, so the loop ends after the
# first page without such a link.
#
# Note that the loop test "$next != -1" is a numeric comparison applied to a
# URL string.
#
# NOTE(review): this block shows signs of text-extraction damage and should
# be restored from a pristine copy before it is run:
#   - the statements sit on a line that begins with "#", so as laid out here
#     Perl treats them as comments;
#   - "@input = ;" has lost its right-hand side (presumably a read from FILE);
#   - the pattern ">Talk:([^<]*)Talk:([^<]*)]*>next 200" looks like two
#     patterns fused together; the code that adds entries to @cat_articles
#     and counts them in $cat_articles_len (tested right after this loop) is
#     not visible, and "do {" has no visible closing "while".
$cat_articles_len = 0; $next = "http://en.wikipedia.org/wiki/Category:Wikipedia_good_articles"; for ($i = 1; $next != -1; $i++) {
 * 1) Go through each of the category files

# Download the category file if ($DOWNLOAD) { system "curl \"$next\" > input_cat$i.html"; }	# Read the category file undef @input; open(FILE, "input_cat$i.html"); @input = ; close(FILE); $input_len = $#input + 1; $next = -1; # Go through each line of the category file for ($j = 0; $j < $input_len; $j++) { # Get the current line $curline = $input[$j]; $curline =~ s/&amp;/&/g; # If it is an article add it to the category articles list do { $run = 0; if ($curline =~ />Talk:([^<]*)Talk:([^<]*)]*>next 200/) {			$next = "http://en.wikipedia.org".$1;		}	}

}

# Check download worked: the category pages must have yielded at least one
# article, otherwise stop.
if ($cat_articles_len == 0) {
    print "Download of good article category failed.\n";
    exit(1);
}

# Print the headings to file.
# "output_headings.txt" lists every heading as "<level>.0" and every
# subheading, indented by one space, as "<level>.<sublevel>"; it is shown to
# the user when a new article has to be categorized.
if (!open(FILE, ">", "output_headings.txt")) {
    print "Cannot write \"output_headings.txt\": $!\n";
    exit(1);
}
for ($i = 0; $i < $headings_len; $i++) {
    # Index 0 is the heading itself; 1 .. $subheadings_len[$i] are its
    # subheadings.
    for ($j = 0; $j <= $subheadings_len[$i]; $j++) {
        if ($j == 0) {
            print FILE $i.".0   ".$headings[$i][$j]."\n";
        }
        else {
            print FILE " ".$i.".".$j."  ".$headings[$i][$j]."\n";
        }
    }
}
close(FILE);

# Sort category articles list the same way as @articles, and reset $orig,
# the index at which the next search of @articles starts.
@cat_articles = sort basicsort @cat_articles;
$orig = 0;

# Go through each of the category articles.
# An article already present in @articles is marked verified and has the
# target part of its name replaced by the category's spelling.  For any other
# article the user is asked for a level and sublevel, and the article is
# appended to @articles (or ignored when the user answers "n").
for ($j = 0; $j < $cat_articles_len; $j++) {

    # Search the articles list for the current category article.  Both lists
    # are sorted, so the scan starts just past the previous match ($orig) and
    # wraps around once.  Every index is reduced modulo $articles_len: an
    # unwrapped $orig equal to $articles_len can never be reached by the
    # wrapping index $i, which made the scan loop forever.
    $found_index = -1;
    $orig %= $articles_len;
    if (basiccmp($articles[$orig], $cat_articles[$j]) == 0) {
        $found_index = $orig;
        $orig = ($orig + 1) % $articles_len;
    }
    else {
        for ($i = ($orig + 1) % $articles_len; $i != $orig && $found_index == -1; $i = ($i + 1) % $articles_len) {
            if (basiccmp($articles[$i], $cat_articles[$j]) == 0) {
                $found_index = $i;
                $orig = ($i + 1) % $articles_len;
            }
        }
    }

    # If an article is found mark it verified otherwise add a new article to
    # the list
    if ($found_index != -1) {
        $articles[$found_index]{"verified"} = 1;

        # Remember whether the listed name starts with a different character
        # than the category name (e.g. a lower-case first letter) so that it
        # can be lower-cased again after the target part is replaced.
        $name_lower = 0;
        if (substr($articles[$found_index]{"name"}, 0, 1) ne substr($cat_articles[$j]{"name"}, 0, 1)) {
            $name_lower = 1;
        }
        $articles[$found_index]{"name"} =~ s/[^|]*/$cat_articles[$j]{"name"}/;
        if ($name_lower) {
            $articles[$found_index]{"name"} = lcfirst($articles[$found_index]{"name"});
        }
    }
    else {
        $articles[$articles_len]{"name"} = $cat_articles[$j]{"name"};
        print "Article not found: ".$cat_articles[$j]{"name"}."\n";
        $done = 0;
        do {
            # NOTE(review): "w" (open the article in the web browser) is
            # accepted below but not mentioned in this prompt; part of the
            # prompt text may have been lost.
            print "Which level do you what to assign it to? \n(t for list, n for ignore)\n";
            $in = <STDIN>;
            exit(1) unless defined $in;    # input closed: same as quitting
            chomp($in);
            $in = lc($in);
            if ($in eq "w") {
                if ($ADVANCED) {
                    gaauto_print_headings_file();
                    $artname = $cat_articles[$j]{"name"};
                    $artname =~ s/\"//g;
                    $artname =~ s/ /_/g;
                    # List form: the title is passed as an argument and is
                    # never interpreted by a shell.
                    system(split(' ', $WEBBROWSER), "http://en.wikipedia.org/w/index.php?title=$artname");
                }
            }
            elsif ($in eq "t") {
                gaauto_print_headings_file();
            }
            elsif ($in eq "exit" || $in eq "q") {
                exit(1);
            }
            elsif ($in eq "n") {
                $done = 1;
            }
            else {
                $articles[$articles_len]{"level"} = $in;
                $done = 1;
            }
        } while (!$done);
        if (!($in eq "n")) {
            print "Which sublevel do you what to assign it to?\n";
            $sublevel_in = <STDIN>;
            exit(1) unless defined $sublevel_in;
            chomp($sublevel_in);
            $articles[$articles_len]{"sublevel"} = $sublevel_in;
            $articles[$articles_len]{"verified"} = 1;
            $articles[$articles_len]{"multi"} = 0;
            $articles[$articles_len]{"time"} = time;
            $articles_len++;
        }
    }
}

# Prints the contents of "output_headings.txt" to standard output; prints
# nothing when the file cannot be opened.
sub gaauto_print_headings_file {
    if (open(my $headings_fh, "<", "output_headings.txt")) {
        while (my $line = <$headings_fh>) {
            print $line;
        }
        close($headings_fh);
    }
    return;
}

# Open the time stamps.
# Each line of "stamp.time" has the form "[[name]] time".  Articles whose
# name matches a line get that line's time stamp in place of the current
# time they were given when they were read.  A missing file simply yields
# no lines.
@input = ();
if (open(FILE, "<", "stamp.time")) {
    @input = <FILE>;
    close(FILE);
}
$input_len = scalar @input;
$orig = 0;
for ($i = 0; $i < $input_len; $i++) {

    # Get the current line
    $curline = $input[$i];
    $curline =~ s/&amp;/&/g;
    $curline =~ s/&lt;/</g;
    $curline =~ s/&gt;/>/g;
    $curline =~ s/&quot;/\"/g;

    # Fill out the stamp.  A line without "[[name]]" is skipped; reading $1
    # after a failed match would reuse the previous line's name.
    if ($curline =~ s/\[\[(.*)\]\]//) {
        $stamp[0]{"name"} = $1;
    }
    else {
        next;
    }
    $stamp[0]{"time"} = int($curline);

    # Search the articles list for a match, starting just past the previous
    # match and wrapping around once.
    $found_index = -1;
    if (basiccmp($articles[$orig], $stamp[0]) == 0) {
        $found_index = $orig;
        $orig = ($orig + 1) % $articles_len;
    }
    else {
        for ($j = ($orig + 1) % $articles_len; $j != $orig && $found_index == -1; $j = ($j + 1) % $articles_len) {
            if (basiccmp($articles[$j], $stamp[0]) == 0) {
                $found_index = $j;
                $orig = ($j + 1) % $articles_len;
            }
        }
    }

    # Assign the time stamp
    if ($found_index != -1) {
        $articles[$found_index]{"time"} = $stamp[0]{"time"};
    }
}

# Find the new articles.
# Articles are sorted newest first.  The first $NEWARTICLES verified articles
# that still carry a time stamp become @new_articles; every other verified
# article has its time stamp set to -1.  All verified articles are written
# back to "stamp.time" as "[[name]] time".
if (!open(FILE, ">", "stamp.time")) {
    print "Cannot write \"stamp.time\": $!\n";
    exit(1);
}
$new_articles_count = 0;
@articles = sort timesort @articles;
for ($i = 0; $i < $articles_len; $i++) {
    if ($articles[$i]{"verified"}) {
        if ($new_articles_count < $NEWARTICLES && $articles[$i]{"time"} != -1) {
            $new_articles[$new_articles_count] = $articles[$i];
            $new_articles_count++;
        }
        else {
            $articles[$i]{"time"} = -1;
        }
        # The name is wrapped in [[ ]] because that is the form the code
        # that reads "stamp.time" extracts the name from.
        print FILE "[[".$articles[$i]{"name"}."]] ".$articles[$i]{"time"}."\n";
    }
}
close(FILE);
@new_articles = sort titlesort @new_articles;

# Sort the articles again by link target; the list was last left in time
# stamp order.
@articles = sort basicsort @articles;

# Open the output file.  FILE stays open for the printing code that follows.
if (!open(FILE, ">", "output.txt")) {
    print "Cannot write \"output.txt\": $!\n";
    exit(1);
}

# Print out preamble: the lines saved from the top of the existing page are
# copied to the output unchanged.
for ($i = 0; $i < $preamble_len; $i++) {
    print FILE $preamble[$i];
}

# Print the recently added articles as one table cell: the entries of
# @new_articles separated by " &mdash;", each written as a wiki link.
#
# The link is wrapped the way the list is read back in: quoted entries in
# &quot;...&quot;, italic entries in ''...'', and every name in [[...]].
# Without these wrappers the italic and plain branches printed the same
# text and no entry could be recognised as an article link.
print FILE "|-\n| colspan=2 width=\"100%\" style=\"padding:1em 1em 1em 1em; border:1px solid #dfdfdf; background-color:#E0EDFA\" valign=\"top\" align=\"center\"|";
if ($new_articles_image) {
    # NOTE(review): this prints an empty string; presumably the image
    # markup built from $new_articles_image was lost -- restore from a
    # pristine copy.
    print FILE "";
}
# NOTE(review): any markup around this caption may also have been lost.
print FILE "\nRecently listed good articles\n\n";
$pre = 0;
for ($i = 0; $i < $new_articles_count; $i++) {
    if ($pre) {
        print FILE " &mdash;\n";
    }
    if ($new_articles[$i]{"quote"}) {
        print FILE "&quot;[[".$new_articles[$i]{"name"}."]]&quot;";
    }
    elsif ($new_articles[$i]{"italic"}) {
        print FILE "''[[".$new_articles[$i]{"name"}."]]''";
    }
    else {
        print FILE "[[".$new_articles[$i]{"name"}."]]";
    }
    $pre = 1;
}
print FILE "\n|}\n\n\n";
# NOTE(review): the three strings below contain only whitespace; they
# probably held markup that was lost.
print FILE "\n";
print FILE "\n";
print FILE " \n";

# Write the body of the list to FILE, heading by heading.
#
# For every heading index $i:
#   * when the heading's major-heading index is greater than $major, a major
#     heading banner built from @major_icon / @major_text is printed first;
#   * for every sublevel $j (0 is the heading itself, 1 and up are its
#     subheadings) the heading frame or a "=====subheading=====" line is
#     printed, followed by the articles that belong there.
#
# An article belongs to ($i, $j) when its "level"/"sublevel" match, or when
# it is marked "multi" and its "sec_level"/"sec_sublevel" match.  Only
# verified articles are printed; an unverified primary listing is reported
# on standard output as "REMOVED ARTICLE" when $REMOVED is set.  The
# articles of a section are sorted with titlesort, joined by " &mdash;" and
# followed by an "(N articles)" count.  $total_count is increased for
# primary listings only, so a dual-listed article is counted once.
#
# NOTE(review): this block shows signs of text-extraction damage and should
# be restored from a pristine copy before it is run:
#   - the statements sit on lines that begin with "#", so as laid out here
#     Perl treats them as comments;
#   - both branches of the $SECTIONCOMMENTS test print the same string;
#   - the italic and plain branches print the name identically, and no
#     branch wraps the name in link brackets;
#   - the "comment" branch prints a single space;
#   - several strings contain only whitespace where closing markup would be
#     expected.
$article_count = 0; $major = -1; for ($i = 0; $i < $headings_len; $i++) {
 * 1) Go through each heading and subheading

# Print out major heading if ($headings_major[$i] > $major) { $major = $headings_major[$i]; if ($major > 0) { print FILE " \n \n"; }		print FILE "\n"; print FILE "\n"; print FILE "<div style=\"padding:5px 5px 8px 5px; background-color:#CCCCFF; text-align:left; font-size:larger;\">$major_icon[$major]$major_text[$major] \n"; print FILE "<div style=\"text-align:left;\">\n"; }

for ($j = 0; $j < $subheadings_len[$i] + 1; $j++) { # Write the heading or subheading if ($j == 0) { if ($i != 0) { print FILE " \n"; print FILE " \n"; print FILE "\n"; }			print FILE "<div style=\"clear:both;\" class=\"NavFrame\">\n"; print FILE "<div class=\"NavHead\" style=\"padding:2px 2px 2px 30px; background-color:#FFFAF0; text-align:left; font-size:larger;\">$headings_icon[$i]$headings[$i][$j] \n"; print FILE "<div class=\"NavContent\" style=\"text-align:left;\">\n"; if ($SECTIONCOMMENTS) { print FILE "==&shy; ==\n"; }			else { print FILE "==&shy; ==\n"; }		}		else { print FILE "\n=====".$headings[$i][$j]."=====\n"; }		# Run through the articles adding them if they belong to the current level undef @cur_articles; $cur_articles_len = 0; $article_count = 0; for ($k = 0; $k < $articles_len; $k++) { if ($articles[$k]{"level"} == $i && $articles[$k]{"sublevel"} == $j) { if ($articles[$k]{"verified"}) { $cur_articles[$article_count] = $articles[$k]; $article_count++; $total_count++; }				else { if ($REMOVED) { print "REMOVED ARTICLE: ".$articles[$k]{"name"}."\n"; }				}			}			elsif ($articles[$k]{"multi"} == 1 && $articles[$k]{"sec_level"} == $i && $articles[$k]{"sec_sublevel"} == $j) { if ($articles[$k]{"verified"}) { $cur_articles[$article_count] = $articles[$k]; $article_count++; }			}		}		# Then sort and print the articles if ($article_count > 0) { @cur_articles = sort titlesort @cur_articles; $pre = 0; for ($k = 0; $k < $article_count; $k++) { if ($pre) { print FILE " &mdash;\n"; } if ($cur_articles[$k]{"quote"}) { print FILE "&quot;".$cur_articles[$k]{"name"}."&quot;"; } elsif ($cur_articles[$k]{"italic"}) { print FILE "".$cur_articles[$k]{"name"}.""; } else { print FILE "".$cur_articles[$k]{"name"}.""; } if ($cur_articles[$k]{"comment"}) { print FILE " "; } $pre = 1; }			if ($article_count == 1) { print FILE "\n \x{2014} (1 article) \n"; }			else { print FILE "\n \x{2014} (".$article_count." articles) \n"; }		}	} }

# Close the output file: finish the list, append the language links saved
# from the existing page, and close FILE.
# NOTE(review): the strings that contain only whitespace probably held
# closing markup that was lost; they are reproduced as found.
print FILE " \n";
print FILE " \n\n";
for ($i = 0; $i < $lang_len; $i++) {
    print FILE $lang[$i];
}
print FILE "\n";
print FILE "\n";
# Buffered write errors only surface when the handle is closed.
if (!close(FILE)) {
    print "Warning: could not finish writing \"output.txt\": $!\n";
}

# Reopen the output file and reprint with correct number of articles.
# The total is only known once the whole list has been written, so the file
# is read back and the number in the statistics link and in the
# NUMBEROFARTICLES expression is replaced by $total_count.
$total_count = 0 unless defined $total_count;    # no verified article at all
if (!open(FILE, "<", "output.txt")) {
    print "Cannot read \"output.txt\": $!\n";
    exit(1);
}
@input = <FILE>;
close(FILE);
$input_len = scalar @input;

if (!open(FILE, ">", "output.txt")) {
    print "Cannot write \"output.txt\": $!\n";
    exit(1);
}
for ($i = 0; $i < $input_len; $i++) {
    $input[$i] =~ s/\[\[Wikipedia\:Good articles\/Statistics\|[0-9]*\]\]/\[\[Wikipedia\:Good articles\/Statistics\|$total_count\]\]/;
    $input[$i] =~ s/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ [0-9]*/expr: \{\{NUMBEROFARTICLES\:R\}\} \/ $total_count/;
    print FILE $input[$i];
}
close(FILE);

# Print out total number of articles
print "Number of articles: ".$total_count."\n";

# Open for editing.
# In advanced mode, offer to open the good articles edit page with
# $WEBBROWSER and the generated "output.txt" with $TEXTEDITOR.
if ($ADVANCED) {
    print "Do you want me to open your browser for editing? (y/n)\n";
    $in = <STDIN>;
    $in = "" unless defined $in;    # input closed: treat as "no"
    chomp($in);
    $in = lc($in);
    if ($in eq "y") {
        # system() rather than backticks: the commands' output is not used.
        # The configured command is split on whitespace so that it may carry
        # its own options.
        system(split(' ', $WEBBROWSER), "http://en.wikipedia.org/w/index.php?title=Wikipedia:Good_articles&action=edit");
        system(split(' ', $TEXTEDITOR), "output.txt");
    }
}