User:Carnildo/wiki-regex-tester.c

Common usages:

./wiki-regex-tester titles.txt < blacklist.txt
    Tests every regex in "blacklist.txt" to see if it matches any titles in "titles.txt". "blacklist.txt" contains one blacklist regex per line; "titles.txt" contains one title per line.

./wiki-regex-tester 'Title of a Wikipedia article' < blacklist.txt
    Tests whether 'Title of a Wikipedia article' would be blocked by any entry in "blacklist.txt".

wget -O - 'http://en.wikipedia.org/w/index.php?title=MediaWiki:Titleblacklist&action=raw' | wiki-regex-tester ns_0.txt | wc -l
    Fetches the latest version of the English Wikipedia blacklist, tests it against the list of titles in "ns_0.txt", and counts the number of titles matched.

/* wiki-regex-tester.c * * A program to test regular expressions for the Wikipedia title blacklist. Assumes UTF-8. */

/* Compile using gcc -o wiki-regex-tester wiki-regex-tester.c `pcre-config --libs` */
/* NOTE(review): the original header names were lost when this page was
 * extracted (the "<...>" parts were stripped). The list below is
 * reconstructed from the identifiers the code uses -- confirm against the
 * original file. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <pcre.h>

/* Size of the buffer that callers pass to preprocess_regex(). The anchored
 * result ("^" + entry + "$" + NUL) must fit in this many bytes. */
enum { WIKI_REGEX_BUF_SIZE = 4096 };

/* Strip trailing blanks and line terminators from the first `len` bytes of
 * `s`, NUL-terminate, and return the new length. */
static size_t trim_trailing_blanks(char *s, size_t len)
{
	while (len > 0 && (s[len - 1] == ' ' || s[len - 1] == '\t' ||
	                   s[len - 1] == '\n' || s[len - 1] == '\r')) {
		len--;
	}
	s[len] = '\0';
	return len;
}

/* Return nonzero if `name` occurs anywhere in the first `len` bytes of `opts`. */
static int options_contain(const char *opts, size_t len, const char *name)
{
	size_t name_len = strlen(name);
	size_t i;

	for (i = 0; i + name_len <= len; i++) {
		if (memcmp(opts + i, name, name_len) == 0) {
			return 1;
		}
	}
	return 0;
}

/*
 * Turn one raw title-blacklist line into an anchored regex, in place.
 *
 * regex          - in: one line as read by fgets(); out: "^<pattern>$", or the
 *                  empty string if the line holds no pattern (blank line,
 *                  comment only, or too long to anchor). The buffer must be
 *                  at least WIKI_REGEX_BUF_SIZE bytes.
 * casesensitive  - set to 1 if the entry's option block names "casesensitive";
 *                  otherwise left untouched.
 * newaccountonly - set to 1 if the entry's option block names
 *                  "newaccountonly"; otherwise left untouched.
 *
 * A line has the shape   pattern <option | option ...> # comment
 * where both the option block and the comment are optional.
 */
void preprocess_regex(char *regex, int *casesensitive, int *newaccountonly)
{
	size_t len;
	size_t lead = 0;
	char *comment;

	/* Drop the comment. '#' is assumed never to appear inside a pattern. */
	comment = strchr(regex, '#');
	if (comment != NULL) {
		*comment = '\0';
	}

	/* Drop the trailing newline and any blanks before it. */
	len = trim_trailing_blanks(regex, strlen(regex));

	/* An option block is a final "<...>" with no '<' or '>' inside it.
	 * Looking only at the end of the line keeps constructs such as "(?<!x)"
	 * in the middle of a pattern intact, and handles every modifier
	 * (moveonly, errmsg, ...) rather than a fixed list of names. */
	if (len > 0 && regex[len - 1] == '>') {
		size_t open = len - 1;
		int found = 0;

		while (open > 0) {
			open--;
			if (regex[open] == '<') {
				found = 1;
				break;
			}
			if (regex[open] == '>') {
				break;
			}
		}
		if (found) {
			const char *opts = regex + open + 1;
			size_t opts_len = len - open - 2;

			/* Only the option block is searched, so a comment or pattern
			 * that happens to contain these words does not set the flags. */
			if (options_contain(opts, opts_len, "casesensitive")) {
				*casesensitive = 1;
			}
			if (options_contain(opts, opts_len, "newaccountonly")) {
				*newaccountonly = 1;
			}
			len = trim_trailing_blanks(regex, open);
		}
	}

	/* Trim leading blanks. */
	while (regex[lead] == ' ' || regex[lead] == '\t') {
		lead++;
	}
	if (lead > 0) {
		memmove(regex, regex + lead, len - lead + 1);
		len -= lead;
	}

	/* Add anchors, shifting in place instead of going through a scratch
	 * buffer. Nothing is added to an empty entry so the caller can skip it. */
	if (len > 0) {
		if (len + 3 > WIKI_REGEX_BUF_SIZE) {
			fprintf(stderr, "Entry too long to anchor, skipping\n");
			regex[0] = '\0';
			return;
		}
		memmove(regex + 1, regex, len);
		regex[0] = '^';
		regex[len + 1] = '$';
		regex[len + 2] = '\0';
	}
}

/*
 * Normalise one title line in place: remove the line terminator left by
 * fgets() and turn every underscore into a space.
 *
 * The terminator is removed only if it is actually there, so an empty string
 * is left alone and a final line without a newline keeps its last character.
 */
void fixup_line(char *line)
{
	size_t len = strlen(line);
	char *p;

	if (len > 0 && line[len - 1] == '\n') {
		line[--len] = '\0';
	}
	/* Tolerate CRLF input as well. */
	if (len > 0 && line[len - 1] == '\r') {
		line[--len] = '\0';
	}

	/* Single pass; no need to rescan from the start for every underscore. */
	for (p = line; *p != '\0'; p++) {
		if (*p == '_') {
			*p = ' ';
		}
	}
}

int main(int argc, char *argv[]) {	int i;	char regex[4096]; char line[1024]; int ovector[300]; FILE *infile; struct stat dummy; pcre *comp_regex; int result; int matches; int lines = 0; const char * errptr; int offset; /* Read the regexes in from stdin */ while(!feof(stdin)) {		int casesensitive = 0; int newaccountonly = 0; fgets(regex, 4096, stdin); /* For each regex */ /* Preprocess */ preprocess_regex(regex, &casesensitive, &newaccountonly); if(strlen(regex) > 0) {			matches = 0; fprintf(stderr, "Testing /%s/%c now\n", regex, casesensitive?' ':'i'); comp_regex = pcre_compile(regex, PCRE_UTF8|(casesensitive?0:PCRE_CASELESS), &errptr, &offset, NULL); if(NULL == comp_regex) {				fprintf(stderr, "Compile failed: %d %s\n", offset, errptr); }			else {				if(!newaccountonly) {					/* Test */ for(i = 1; i < argc; i++) {						/* If it's a file */ if(!stat(argv[i], &dummy)) {							infile = fopen(argv[i], "r"); while(!feof(infile)) {								lines += 1; fgets(line, 1024, infile); fixup_line(line); result = pcre_exec(comp_regex, NULL, line, strlen(line), 0, 0, ovector, 300); if(result >= 0) {									printf("* %s :: %s\n", line, regex); matches += 1; }								else if(result == PCRE_ERROR_NOMATCH) { //									printf("* Nomatch\n"); }								else {									fprintf(stderr, "Error: %d\n", result); }								if((lines % 100000) == 0) {									fprintf(stderr, "Lines: %d           \r", lines); }							}							fclose(infile); }						else {							lines += 1; /* Otherwise, test as a literal */ result = pcre_exec(comp_regex, NULL, argv[i], strlen(argv[i]), 0, 0, ovector, 300); if(result >= 0) {								matches += 1; printf("* %s :: %s\n", argv[i], regex); }							else if(result == PCRE_ERROR_NOMATCH) { //								printf("* No match\n"); }							else {								fprintf(stderr, "Error: %d\n", result); }						}					}				}			}			fprintf(stderr, "Matches: %d\n", matches); }	} }