User:Jediarchives11/Wikification

The code below is for the Automatic Wikification Extension. This extension searches articles when they are saved for words or phrases that have an article and then links them to that article. Requested by and edited by Nicholas Anderson.

This extension is a hook for MediaWiki that examines an article before it is committed to the database, looks for possible wiki topics in the article that are not yet marked as links, and converts them into links.

Changelog:
* 2005-07-25: Work started
* 2006-01-06: Fixed bug: when adding links, spaces would be removed
* 2006-01-07: $excludelist array added

To Do:
* Fix bug: Commas and periods aren't removed when finding things to link
* Fix bug: Last word in an article never links

// When searching for phrases (like "History of Greece"), up to how many words
// should we search? The higher this number, the slower the extension.
$wikifiPhraseWordLimit = 4;

// When searching for a single-word term, the minimum number of characters
// allowed for a word. This value is ignored for phrases.
$wikifiMinWordLength = 3;

// Namespaces to search for matches.
// Should have insignificant performance impact.
$wikifiSearchNamespaces = array(NS_MAIN);

// When true, the only single words that will be searched for matches are
// capitalized words.
$wikifiOnlyCheckProper = false;

// Queue the setup callback; it installs the save hook when it is run
// (presumably by MediaWiki while it initialises extensions).
$wgExtensionFunctions[] = "Wikification_Wikify";

/**
 * Setup callback: registers Wikification_Save as a handler for the
 * 'ArticleSave' hook by appending its name to $wgHooks.
 */
function Wikification_Wikify() {
    global $wgHooks;
    $wgHooks['ArticleSave'][] = 'Wikification_Save';
}

/**
 * ArticleSave hook handler: turns unlinked words and phrases that match the
 * title of an existing page into wiki links.
 *
 * The text is scanned for candidate terms (single words and phrases of up to
 * $wikifiPhraseWordLimit words), the candidates are looked up in the page
 * table, and every occurrence of a matching title that is not already inside
 * a link or a section heading is wrapped in [[...]].
 *
 * @param mixed  $article Not used by this handler.
 * @param mixed  $user    Not used by this handler.
 * @param string $text    Wikitext about to be saved. Taken by reference so
 *                        that the added links actually reach the caller.
 * @return bool Always true, so the save proceeds.
 */
function Wikification_Save($article, $user, &$text) {
    global $wikifiPhraseWordLimit, $wikifiMinWordLength, $wikifiSearchNamespaces;
    global $wikifiOnlyCheckProper;

    // Terms that must never be turned into links (compared case-insensitively
    // against whole candidate terms, not against substrings of other words).
    $excludelist = array("about", "test", "spam blacklist test");

    $terms = Wikification_CollectTerms(
        $text,
        $wikifiPhraseWordLimit,
        $wikifiMinWordLength,
        $wikifiOnlyCheckProper,
        $excludelist
    );

    // Convert the terms to title form. Both "History_of_Greece" (only the
    // first letter forced to upper case) and "History_Of_Greece" (every word
    // capitalised, the form the original code searched for) are tried.
    $titles = array();
    foreach ($terms as $term) {
        $titles[] = str_replace(' ', '_', ucfirst($term));
        $titles[] = str_replace(' ', '_', ucwords($term));
    }
    $titles = array_values(array_unique($titles));

    $namespaces = array_map('intval', (array)$wikifiSearchNamespaces);

    // Nothing to look up: bail out rather than send "IN ()" to the database.
    if (count($titles) == 0 || count($namespaces) == 0) {
        return true;
    }

    // Grab the database reference.
    $db = wfGetDB(DB_MASTER);

    $quoted = array();
    foreach ($titles as $title) {
        $quoted[] = $db->addQuotes($title);
    }

    // Assemble what could be a massive SQL query. tableName() is used so the
    // configured table prefix is applied instead of a hard-coded one.
    $sql = 'SELECT page_namespace, page_title FROM ' . $db->tableName('page')
        . ' WHERE page_namespace IN (' . implode(',', $namespaces) . ')'
        . ' AND page_title IN (' . implode(', ', $quoted) . ')';
    $result = $db->query($sql, 'Wikification_Save');
    if (!$result) {
        return true;
    }

    // Collect every matching page together with the text to search for and
    // the link target to emit.
    $hits = array();
    while ($row = $db->fetchRow($result)) {
        $namespace = (int)$row['page_namespace'];
        $find = str_replace('_', ' ', $row['page_title']);
        if ($namespace == NS_MAIN) {
            $target = $find;
        } else {
            // Leading colon so category/file namespaces produce a plain link.
            $titleObj = Title::makeTitle($namespace, $row['page_title']);
            $target = ':' . $titleObj->getPrefixedText();
        }
        $hits[] = array('find' => $find, 'target' => $target);
    }

    // Link the longest titles first so "History of Greece" wins over "Greece";
    // once linked, the phrase is protected from the shorter title.
    usort($hits, 'Wikification_CompareHits');

    foreach ($hits as $hit) {
        $text = Wikification_LinkTerm($text, $hit['find'], $hit['target']);
    }

    return true;
}

/**
 * Extracts the candidate search terms (single words and multi-word phrases)
 * from a piece of wikitext, ignoring existing links and section headings.
 *
 * @param string $text        Wikitext to scan.
 * @param int    $phraseLimit Maximum number of words in a phrase.
 * @param int    $minLength   Minimum length of a single-word term.
 * @param bool   $onlyProper  When true, single words must start with an
 *                            upper-case letter.
 * @param array  $exclude     Terms that must not be returned.
 * @return array List of unique terms in order of first appearance.
 */
function Wikification_CollectTerms($text, $phraseLimit, $minLength, $onlyProper, $exclude) {
    // Strip out things that should never be links. Replacing with a space
    // (not '') keeps the neighbouring words from being glued together.
    $s = (string)$text;
    $s = preg_replace('/\[\[.*?\]\]/', ' ', $s);                  // [[wiki links]]
    $s = preg_replace('/\[.*?\]/', ' ', (string)$s);              // [external links]
    $s = preg_replace('/^={1,6}.*?={1,6}[ \t]*$/m', ' ', (string)$s); // == headings ==
    $s = preg_replace('/[.,]/', '', (string)$s);                  // other junk

    // Separate the text into words on any run of whitespace, dropping empties.
    $words = preg_split('/\s+/', (string)$s, -1, PREG_SPLIT_NO_EMPTY);
    if (!is_array($words)) {
        return array();
    }

    $terms = array();
    $count = count($words);
    for ($i = 0; $i < $count; $i++) {
        $word = $words[$i];

        // Add an individual word if it is long enough.
        if (strlen($word) >= $minLength && (!$onlyProper || ctype_upper($word[0]))) {
            $terms[] = $word;
        }

        // Add every phrase of 2..$phraseLimit words starting at this word,
        // including the ones that end on the very last word of the text.
        $phrase = $word;
        for ($j = 1; $j < $phraseLimit && ($i + $j) < $count; $j++) {
            $phrase .= ' ' . $words[$i + $j];
            $terms[] = $phrase;
        }
    }

    // Drop excluded terms.
    $excluded = array_map('strtolower', $exclude);
    $kept = array();
    foreach ($terms as $term) {
        if (!in_array(strtolower($term), $excluded, true)) {
            $kept[] = $term;
        }
    }

    return array_values(array_unique($kept));
}

/**
 * Wraps every unlinked occurrence of $find in $text in a wiki link to $target.
 *
 * Existing links ([[...]] and [...]) and section headings are left untouched.
 * Matching is case-insensitive and limited to whole words; when the matched
 * spelling differs from the target a piped link ([[target|match]]) is written.
 *
 * @param string $text   Wikitext to modify.
 * @param string $find   Plain text to look for (title with spaces).
 * @param string $target Link target to emit.
 * @return string The wikitext with the links added.
 */
function Wikification_LinkTerm($text, $find, $target) {
    if ($find === '') {
        return $text;
    }

    // Odd-numbered parts are the protected segments captured by the group.
    $parts = preg_split(
        '/(\[\[.*?\]\]|\[.*?\]|^={1,6}.*?={1,6}[ \t]*$)/m',
        $text,
        -1,
        PREG_SPLIT_DELIM_CAPTURE
    );
    if (!is_array($parts)) {
        return $text;
    }

    // Lookarounds keep the neighbouring characters out of the match, so
    // nothing around the term is consumed or rewritten.
    $pattern = '/(?<![\w\[])(' . preg_quote($find, '/') . ')(?![\w\]])/i';

    $partCount = count($parts);
    for ($idx = 0; $idx < $partCount; $idx += 2) {
        $pieces = preg_split($pattern, $parts[$idx], -1, PREG_SPLIT_DELIM_CAPTURE);
        if (!is_array($pieces)) {
            continue;
        }
        // Odd-numbered pieces are the matched spellings.
        $pieceCount = count($pieces);
        for ($p = 1; $p < $pieceCount; $p += 2) {
            $match = $pieces[$p];
            if ($match === $target) {
                $pieces[$p] = '[[' . $match . ']]';
            } else {
                $pieces[$p] = '[[' . $target . '|' . $match . ']]';
            }
        }
        $parts[$idx] = implode('', $pieces);
    }

    return implode('', $parts);
}

/**
 * usort() callback: orders hits by the length of their search text, longest
 * first, falling back to a byte-wise comparison for a deterministic order.
 *
 * @param array $a Hit with a 'find' entry.
 * @param array $b Hit with a 'find' entry.
 * @return int Negative, zero or positive, as usort() expects.
 */
function Wikification_CompareHits($a, $b) {
    $diff = strlen($b['find']) - strlen($a['find']);
    if ($diff != 0) {
        return $diff;
    }
    return strcmp($a['find'], $b['find']);
}

?>