User:Carnildo/wiki-regex-tester.pl

Common usages: wiki-regex-tester.pl titles.txt < blacklist.txt Will test every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line. wiki-regex-tester.pl 'Title of a Wikipedia article' < blacklist.txt Will test to see if 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt" wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' |perl wiki-regex-tester.pl ns_0.txt|wc -l Will fetch the latest version of the English Wikipedia blacklist, test it against the list of titles in "ns_0.txt", and count the number of titles matched.

#!/usr/bin/perl

use warnings; use strict; use utf8;

use Time::HiRes;

binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; binmode STDERR, ":utf8";

my @regexes;

while() {       my $regex = $_; my $ignorecase = 1; my $moveonly = 0; my $newaccount = 0;

$regex =~ s/#.*$//g;   # Strip comments $ignorecase = 0 if($regex =~ /casesensitive/); $moveonly = 1 if($regex =~ /moveonly/); $newaccount = 1 if($regex =~ /newaccountonly/); $regex =~ s/<(moveonly|newaccountonly|casesensitive|\||errmsg=[^|>]*| )+>//g; # Strip modifiers $regex =~ s/\s*$//g;   # Strip trailing space $regex =~ s/^\s*//g;   # Strip leading space

if($regex !~ /^\s*$/ and !$newaccount) {               push @regexes, [$regex, $ignorecase, $moveonly]; } }

print STDERR "Testing ". scalar(@regexes). " regexes\n";

my $lines = 0; my $lines2 = 0; my $regex_count = 0; foreach my $regex_entry (@regexes) {       my $start_time = Time::HiRes::time; my $u_start_time = Time::HiRes::clock; my $maxtime = 0; my ($regex, $ignorecase, $moveonly) = @{$regex_entry};

if(-e $ARGV[0]) {               open INFILE, "<", $ARGV[0]; binmode INFILE, ":utf8"; }       else {               open INFILE, "<", \$ARGV[0]; binmode INFILE, ":utf8"; }

while() {               my $target = $_; chomp $target; $target =~ s/_/ /g;

if($ignorecase) {                       if($target =~ /^$regex$/i) {                               print "* $target :: $regex\n"; }               }                else {                       if($target =~ /^$regex$/) {                               print "* $target :: $regex\n"; }               }

$lines = $lines + 1; if($lines >= 10000) {                       my $newtime = Time::HiRes::clock; my $diff = $newtime - $u_start_time; $u_start_time = $newtime;

$maxtime = $diff if($diff > $maxtime);

$lines = 0; $lines2 += 10000; print STDERR "$diff $lines2\r"; }       }        $regex_count += 1;

my $stop_time = Time::HiRes::time; print STDERR "Regex $regex took ". ($stop_time - $start_time). " seconds\n"; print STDERR "Slowest batch took $maxtime seconds\n"; close INFILE; }