// User:Selket/Parsers.cs

/* WikiFunctions Copyright (C) 2006 Martin Richards

This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */

using System; using System.Collections.Generic; using System.Text; using System.Text.RegularExpressions; using System.Configuration; using System.Collections; using System.Web;

[assembly: CLSCompliant(true)]
namespace WikiFunctions.Parse
{
/// <summary>
/// Provides functions for editing wiki text, such as formatting and re-categorisation.
/// </summary>
public class Parsers
{
#region constructor etc.

/// <summary>
/// Default constructor: creates the metadata sorter for this parser and builds the regex tables.
/// </summary>
public Parsers()
{
    metaDataSorter = new MetaDataSorter(this);
    MakeRegexes();
}

/// <summary>
/// Creates a parser with an explicit stub word limit and category-key setting,
/// then creates the metadata sorter and builds the regex tables.
/// </summary>
/// <param name="StubWordCount">The maximum number of words for a stub.</param>
/// <param name="AddHumanKey">Initial value for <see cref="addCatKey"/>.</param>
public Parsers(int StubWordCount, bool AddHumanKey)
{
    metaDataSorter = new MetaDataSorter(this);
    StubMaxWordCount = StubWordCount;
    addCatKey = AddHumanKey;
    MakeRegexes();
}

/// <summary>
/// Fills the RegexUnicode, RegexTagger and RegexConversion tables with
/// compiled patterns and their replacement strings.
/// </summary>
private void MakeRegexes()
{
    //look bad if changed
    RegexUnicode.Add(new Regex("&(ndash|mdash|minus|times|lt|gt|nbsp|thinsp|shy|lrm|rlm|[Pp]rime);", RegexOptions.Compiled), "&amp;$1;");
    //IE6 does like these
    RegexUnicode.Add(new Regex("&#(705|803|596|620|699|700|8652|9408|9848|12288|160|61|x27|39);", RegexOptions.Compiled), "&amp;#$1;");
    //Decoder doesn't like these
    RegexUnicode.Add(new Regex("&#(x109[0-9A-Z]{2});", RegexOptions.Compiled), "&amp;#$1;");
    RegexUnicode.Add(new Regex("&#((?:277|119|84|x1D|x100)[A-Z0-9a-z]{2,3});", RegexOptions.Compiled), "&amp;#$1;");
    RegexUnicode.Add(new Regex("&#(x12[A-Za-z0-9]{3});", RegexOptions.Compiled), "&amp;#$1;");
    //interfere with wiki syntax
    RegexUnicode.Add(new Regex("&#(126|x5D|x5B|x7b|x7c|x7d|0?9[13]|0?12[345]|0?0?3[92]);", RegexOptions.Compiled | RegexOptions.IgnoreCase), "&amp;#$1;");
    //not entity, but still wrong
    // NOTE(review): these two patterns match a bare digit after the unit; the original markup
    // around the digit may have been lost from this copy of the file - confirm against upstream.
    RegexUnicode.Add(new Regex("(cm| m|mm|km|mi)2", RegexOptions.Compiled), "$1²");
    RegexUnicode.Add(new Regex("(cm| m|mm|km|mi)3", RegexOptions.Compiled), "$1³");

    // NOTE(review): every RegexTagger replacement below is the empty string, so a match is deleted.
    // The replacement templates were presumably lost from this copy of the file - confirm against upstream.
    RegexTagger.Add(new Regex("\\{\\{(template:)?(wikify|wikify-date|wfy|wiki)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Clean ?up|CU|Clean|Tidy)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Linkless|Orphan)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Uncategori[sz]ed|Uncat|Classify|Category needed|Catneeded|categori[zs]e|nocats?)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexTagger.Add(new Regex("\\{\\{(template:)?(Unreferenced|add references|cite[ -]sources?|cleanup-sources?|needs? references|no sources|no references?|not referenced|references|sources|unref|Unreferencedsect|unsourced)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");

    // Disambiguation template aliases.
    // NOTE(review): the first four replacements are empty (only "{{Hndis" survived); the canonical
    // template names were presumably lost from this copy of the file - confirm against upstream.
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Dab|Disamb|Disambiguation)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(2cc|2LAdisambig|2LCdisambig|2LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(3cc|3LW|Tla|Tla-dab|TLA-disambig|TLAdisambig|3LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(4cc|4LW|4LA|4LC)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "");
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Bio-dab|Hndisambig)", RegexOptions.IgnoreCase | RegexOptions.Compiled), "{{Hndis");

    // The inner quotes must be escaped; unescaped they terminate the string literal.
    RegexConversion.Add(new Regex("\\{\\{(?:Template:)?(Prettytable|Prettytable100|Pt)\\}\\}", RegexOptions.IgnoreCase | RegexOptions.Compiled), "class=\"wikitable\"");
    RegexConversion.Add(new Regex("\\{\\{(?:[Tt]emplate:)?(PAGENAMEE?\\}\\}|[Ll]ived\\||[Bb]io-cats\\|)", RegexOptions.Compiled), "{{subst:$1");

    // Lifetime/Lifespan templates (both years, death year only, birth year only).
    // NOTE(review): all three replacements are just "\r\n" and ignore the captured groups; the
    // generated text was presumably lost from this copy of the file - confirm against upstream.
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|([0-9]{4})\|([0-9]{4})\|(.*?)\}\}", RegexOptions.Compiled), "\r\n");
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|\|([0-9]{4})\|(.*?)\}\}", RegexOptions.Compiled), "\r\n");
    RegexConversion.Add(new Regex(@"\{\{[Ll]ife(?:time|span)\|([0-9]{4})\|\|(.*?)\}\}", RegexOptions.Compiled), "\r\n");
}

// Pattern -> replacement tables, filled by MakeRegexes().
Dictionary<Regex, string> RegexUnicode = new Dictionary<Regex, string>();
Dictionary<Regex, string> RegexConversion = new Dictionary<Regex, string>();
Dictionary<Regex, string> RegexTagger = new Dictionary<Regex, string>();

HideText hider = new HideText();
MetaDataSorter metaDataSorter;
// Scratch copy of the input text, used by the "out bool NoChange" overloads to detect changes.
string testText = "";
int StubMaxWordCount = 500;

/// <summary>
/// Sort interwiki link order
/// </summary>
public bool sortInterwikiOrder
{
    get { return boolInterwikiOrder; }
    set { boolInterwikiOrder = value; }
}
private bool boolInterwikiOrder = true;

/// <summary>
/// The interwiki link order to use. Reads and writes the value held by the metadata sorter.
/// </summary>
public InterWikiOrderEnum InterWikiOrder
{
    set { metaDataSorter.InterWikiOrder = value; }
    get { return metaDataSorter.InterWikiOrder; }
}

/// <summary>
/// When set to true, adds key to categories (for people only) when parsed
/// </summary>
public bool addCatKey
{
    get { return boolAddCatKey; }
    set { boolAddCatKey = value; }
}
private bool boolAddCatKey = false;

#endregion

#region General Parse

/// <summary>
/// Re-organises the Person Data, stub/disambig templates, categories and interwikis.
/// The work is delegated to the metadata sorter.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The article title.</param>
/// <returns>The re-organised text.</returns>
public string SortMetaData(string ArticleText, string ArticleTitle)
{
    return metaDataSorter.Sort(ArticleText, ArticleTitle);
}

// Date patterns used by FixDates: decade with apostrophe ("1990's"), then ordinal day
// after and before a month name.
private readonly Regex regexFixDates0 = new Regex(
    "([12][0-9][0-9]0)'s",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexFixDates1 = new Regex(
    "(January|February|March|April|May|June|July|August|September|October|November|December) ([1-9][0-9]?)(?:st|nd|rd|th)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexFixDates2 = new Regex(
    "([1-9][0-9]?)(?:st|nd|rd|th) (January|February|March|April|May|June|July|August|September|October|November|December)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Heading patterns used by FixHeadings. Group 1 is the opening "==", group 3 the closing "==".
private readonly Regex regexHeadings0 = new Regex(
    "(== ?)(see also:?|related topics:?|related articles:?|internal links:?|also see:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings1 = new Regex(
    "(== ?)(external links:?|external sites:?|outside links|web ?links:?|exterior links:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings2 = new Regex(
    "(== ?)(external link:?|external site:?|web ?link:?|exterior link:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings3 = new Regex(
    "(== ?)(reference:?)(s? ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings4 = new Regex(
    "(== ?)(source:?)(s? ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings5 = new Regex(
    "(== ?)(further readings?:?)( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings6 = new Regex(
    "(== ?)(Early|Personal|Adult|Later) Life( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadings7 = new Regex(
    "(== ?)(Current|Past|Prior) Members( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
private readonly Regex regexHeadingsCareer = new Regex(
    "(== ?)([a-zA-Z]+) Career( ?==)",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

// Generic introductory heading at the very start of the text (no Multiline option, so "^"
// only matches at position 0).
private readonly Regex RegexBadHeader = new Regex(
    "^(={1,4} ?(about|description|overview|definition|profile|(?:general )?information|background|intro(?:duction)?|summary|bio(?:graphy)?) ?={1,4})",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Fix ==See also== and similar section common errors.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text, trimmed.</returns>
public string FixHeadings(string ArticleText, string ArticleTitle, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = FixHeadings(ArticleText, ArticleTitle);

    // Compared before the final Trim(), so trimming alone is not reported as a change.
    NoChange = (testText == ArticleText);

    return ArticleText.Trim();
}

/// <summary>
/// Fix ==See also== and similar section common errors. Removes unnecessary introductory headings.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <returns>The modified article text.</returns>
public string FixHeadings(string ArticleText, string ArticleTitle)
{
    // Remove a heading at the very start of the text that just repeats the article title,
    // then a generic introductory heading ("Overview", "Introduction", ...) in the same position.
    ArticleText = Regex.Replace(ArticleText, "^={1,4} ?" + Regex.Escape(ArticleTitle) + " ?={1,4}", "", RegexOptions.IgnoreCase);
    ArticleText = RegexBadHeader.Replace(ArticleText, "");

    // Only rename "Related topics" etc. when the article has no "See also" heading already.
    if (!Regex.IsMatch(ArticleText, "= ?See also ?="))
    {
        ArticleText = regexHeadings0.Replace(ArticleText, "$1See also$3");
    }

    ArticleText = regexHeadings1.Replace(ArticleText, "$1External links$3");
    ArticleText = regexHeadings2.Replace(ArticleText, "$1External link$3");
    ArticleText = regexHeadings3.Replace(ArticleText, "$1Reference$3");
    ArticleText = regexHeadings4.Replace(ArticleText, "$1Source$3");
    ArticleText = regexHeadings5.Replace(ArticleText, "$1Further reading$3");
    ArticleText = regexHeadings6.Replace(ArticleText, "$1$2 life$3");
    ArticleText = regexHeadings7.Replace(ArticleText, "$1$2 members$3");
    ArticleText = regexHeadingsCareer.Replace(ArticleText, "$1$2 career$3");

    return ArticleText;
}

/// <summary>
/// Fix date and decade formatting errors.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string FixDates(string ArticleText)
{
    // "1990's" -> "1990s"
    ArticleText = regexFixDates0.Replace(ArticleText, "$1s");
    // "January 5th" -> "January 5"
    ArticleText = regexFixDates1.Replace(ArticleText, "$1 $2");
    // "5th January" -> "5 January"
    ArticleText = regexFixDates2.Replace(ArticleText, "$1 $2");

    return ArticleText;
}

/// <summary>
/// Footnote formatting errors per WP:FN: removes whitespace before inline fact/ref tags and
/// moves punctuation that follows such a tag in front of it.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text.</returns>
public string FixFootnotes(string ArticleText)
{
    // NOTE(review): this copy of the file had the ref-tag text stripped out of several string
    // literals (empty RefTag1, missing closing tag in RefTag2, an unbalanced "]" in the
    // whitespace patterns). They are rebuilt here from the group numbers used by the
    // replacement strings - confirm against the upstream source.

    string factTag = "({{[ ]*fact[ ]*}}|{{[ ]*fact[ ]*[\\|][^}]*}}|{{[ ]*facts[ ]*}}|{{[ ]*citequote[ ]*}}|{{[ ]*citation needed[ ]*}}|{{[ ]*cn[ ]*}}|{{[ ]*verification needed[ ]*}}|{{[ ]*verify source[ ]*}}|{{[ ]*verify credibility[ ]*}}|{{[ ]*who[ ]*}}|{{[ ]*failed verification[ ]*}}|{{[ ]*nonspecific[ ]*}}|{{[ ]*dubious[ ]*}}|{{[ ]*or[ ]*}}|{{[ ]*lopsided[ ]*}}|{{[ ]*GR[ ]*[\\|][ ]*[^ ]+[ ]*}}|{{[ ]*[c]?r[e]?f[ ]*[\\|][^}]*}}|{{[ ]*ref[ _]label[ ]*[\\|][^}]*}}|{{[ ]*ref[ _]num[ ]*[\\|][^}]*}})";

    // No whitespace before an inline fact/ref template
    ArticleText = Regex.Replace(ArticleText, "[\\n\\r\\f\\t ]+?" + factTag, "$1");

    // One space/linefeed
    ArticleText = Regex.Replace(ArticleText, "[\\n\\r\\f\\t ]+?<ref([ >])", "<ref$1");
    // remove trailing spaces from named refs
    // (the last captured character excludes a space, otherwise the greedy group keeps the spaces)
    ArticleText = Regex.Replace(ArticleText, "<ref ([^>]*[^> ])[ ]*>", "<ref $1>");
    // remove punctuation between two adjacent refs
    // NOTE(review): the garbled line contained a second, near-identical replacement whose
    // distinguishing markup is lost; only the recoverable form is applied.
    ArticleText = Regex.Replace(ArticleText, "(</ref>|<ref[^>]*?/>)[ ]*[,;-]?[ ]*<ref", "$1<ref");

    string LacksPunctuation = "([^\\.,;:!\\?\"'’])";
    string QuestionOrExclam = "([!\\?])";
    string MinorPunctuation = "([\\.,;:])";
    string AnyPunctuation = "([\\.,;:!\\?])";
    string MajorPunctuation = "([,;:!\\?])";
    string Period = "([\\.])";
    string Quote = "([\"'’]*)";
    string Space = "[ ]*";

    // RefTag1: plain opening tag with content; RefTag2: opening tag with attributes and content;
    // RefTag3: self-closing tag. RefTag1 and RefTag2 each contain two groups, RefTag3 and
    // factTag one, which is why the trailing punctuation is $5 for the former and $4 for the latter.
    string RefTag1 = "(<ref>([^<]|<[^/]|</[^r]|</r[^e]|</re[^f]|</ref[^>])*?</ref>)";
    string RefTag2 = "(<ref[^>]*?[^/]>([^<]|<[^/]|</[^r]|</r[^e]|</re[^f]|</ref[^>])*?</ref>)";
    string RefTag3 = "(<ref[^>]*?/>)";

    string[] inlineTags = { factTag, RefTag1, RefTag2, RefTag3 };
    string[] trailingPunct = { "$4", "$5", "$5", "$4" };

    // repeat for multiple refs together
    for (int pass = 0; pass < 10; pass++)
    {
        string before = ArticleText;

        for (int t = 0; t < inlineTags.Length; t++)
        {
            string tail = Quote + inlineTags[t] + Space;
            string punct = trailingPunct[t];

            // no punctuation before the tag: move the following punctuation in front of it
            ArticleText = Regex.Replace(ArticleText, LacksPunctuation + tail + AnyPunctuation, "$1$2" + punct + "$3");
            // "?" or "!" before the tag, followed by , ; : ! ?: keep both, tag goes last
            ArticleText = Regex.Replace(ArticleText, QuestionOrExclam + tail + MajorPunctuation, "$1$2" + punct + "$3");
            // . , ; : before the tag: the following punctuation replaces it
            ArticleText = Regex.Replace(ArticleText, MinorPunctuation + tail + AnyPunctuation, "$2" + punct + "$3");
            // "?" or "!" before the tag and a period after it: drop the period
            ArticleText = Regex.Replace(ArticleText, QuestionOrExclam + tail + Period, "$1$2$3");
        }

        // A pass that changes nothing means further passes cannot change anything either.
        if (ArticleText == before)
        {
            break;
        }
    }

    return ArticleText;
}

/// <summary>
/// Removes some excess whitespace from the article.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public static string RemoveWhiteSpace(string ArticleText)
{
    // Collapse runs of blank lines to a single blank line.
    ArticleText = Regex.Replace(ArticleText, "\r\n(\r\n)+", "\r\n\r\n");

    // No blank line between two consecutive headings.
    ArticleText = Regex.Replace(ArticleText, "== ? ?\r\n\r\n==", "==\r\n==");

    // No blank line before a bulleted external link. This is a regex pattern with a "$1"
    // substitution, so it must go through Regex.Replace; string.Replace treated it as a literal
    // and never matched.
    ArticleText = Regex.Replace(ArticleText, "\r\n\r\n(\\* ?\\[?http)", "\r\n$1");

    // Remove a trailing horizontal rule.
    // NOTE(review): this copy of the file had the pattern reduced to "+$", which is not a valid
    // regex; "----+$" is the presumed original - confirm against upstream.
    ArticleText = Regex.Replace(ArticleText.Trim(), "----+$", "");
    // Remove a trailing line-break tag.
    ArticleText = Regex.Replace(ArticleText.Trim(), "<br ?/?>$", "", RegexOptions.IgnoreCase);

    return ArticleText.Trim();
}

/// <summary>
/// Removes all excess whitespace from the article and normalises the spacing of
/// bullet points and dashes.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public string RemoveAllWhiteSpace(string ArticleText)
{
    //removes all whitespace
    ArticleText = ArticleText.Replace("\t", " ");
    ArticleText = RemoveWhiteSpace(ArticleText);

    // no blank line before a bullet
    ArticleText = ArticleText.Replace("\r\n\r\n*", "\r\n*");

    // collapse runs of spaces, drop spaces at end of line
    ArticleText = Regex.Replace(ArticleText, " +", " ");
    ArticleText = Regex.Replace(ArticleText, " \r\n", "\r\n");

    // no blank line after a heading
    ArticleText = Regex.Replace(ArticleText, "==\r\n\r\n", "==\r\n");

    //fix bullet points: strip one space after the markers, then add exactly one back
    ArticleText = Regex.Replace(ArticleText, "^([\\*#]+) ", "$1", RegexOptions.Multiline);
    ArticleText = Regex.Replace(ArticleText, "^([\\*#]+)", "$1 ", RegexOptions.Multiline);

    //fix heading space
    // NOTE(review): "$" with RegexOptions.Multiline matches only before "\n", so on "\r\n" text
    // this only fires for the last line - confirm whether "\r?$" was intended.
    ArticleText = Regex.Replace(ArticleText, "^(={1,4}) ?(.*?) ?(={1,4})$", "$1$2$3", RegexOptions.Multiline);

    //fix dash spacing: strip one space either side, then put exactly one space either side
    ArticleText = Regex.Replace(ArticleText, " ?(–|—|&#15[01];|&[nm]dash;|&#821[12];|&#x201[34];) ?", "$1");
    // NOTE(review): this copy of the file listed the literal en dash twice and had no "&ndash;",
    // although the pattern above strips spaces around "&ndash;"; the entity is restored so that
    // both passes handle the same set of dashes - confirm against upstream.
    ArticleText = Regex.Replace(ArticleText, "(—|&#151;|&mdash;|&#8212;|&#x2014;|–|&#150;|&ndash;|&#8211;|&#x2013;)", " $1 ");

    return ArticleText.Trim();
}

/// <summary>
/// Fixes and improves syntax (such as html markup)
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text.</returns>
public string FixSyntax(string ArticleText, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = FixSyntax(ArticleText);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

// NOTE(review): this copy of the file had wiki/HTML markup stripped out of several string
// literals used by FixSyntax (italic/bold patterns, SyntaxRegex6, the horizontal-rule patterns
// and every "[[...]]" replacement). They are rebuilt below from the surviving fragments and the
// accompanying comments - confirm each against the upstream source.

// [[http://...]  /  [http://...]]  /  [[http://...]]
readonly Regex SyntaxRegex1 = new Regex("\\[\\[http:\\/\\/([^][]*?)\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex2 = new Regex("\\[http:\\/\\/([^][]*?)\\]\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegex3 = new Regex("\\[\\[http:\\/\\/(.*?)\\]\\]", RegexOptions.IgnoreCase | RegexOptions.Compiled);
// [[link]  /  [link]]
readonly Regex SyntaxRegex4 = new Regex("\\[\\[([^][]*?)\\]([^][][^\\]])", RegexOptions.Compiled);
readonly Regex SyntaxRegex5 = new Regex("([^][])\\[([^][]*?)\\]\\]([^\\]])", RegexOptions.Compiled);

// external URL wrapped in image-link syntax
readonly Regex SyntaxRegex6 = new Regex("\\[?\\[image:(http:\\/\\/.*?)\\]\\]?", RegexOptions.IgnoreCase | RegexOptions.Compiled);
// [[ link]]  /  [[link ]]  /  [[link_#section]]
readonly Regex SyntaxRegex7 = new Regex("\\[\\[ (.*)?\\]\\]", RegexOptions.Compiled);
readonly Regex SyntaxRegex8 = new Regex("\\[\\[([A-Za-z]*) \\]\\]", RegexOptions.Compiled);
readonly Regex SyntaxRegex9 = new Regex("\\[\\[(.*)?_#(.*)\\]\\]", RegexOptions.Compiled);

readonly Regex SyntaxRegexTemplate = new Regex("(\\{\\{[\\s]*)[Tt]emplate:(.*?\\}\\})", RegexOptions.Singleline | RegexOptions.Compiled);
readonly Regex SyntaxRegex11 = new Regex("^((#|\\*).*?)<br ?/?>\r\n", RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);

readonly Regex SyntaxRegexItalic = new Regex("<i>(.*?)</i>", RegexOptions.IgnoreCase | RegexOptions.Compiled);
readonly Regex SyntaxRegexBold = new Regex("<b>(.*?)</b>", RegexOptions.IgnoreCase | RegexOptions.Compiled);

/// <summary>
/// Fixes and improves syntax (such as html markup)
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text, trimmed.</returns>
public string FixSyntax(string ArticleText)
{
    //replace html with wiki syntax
    // (skipped when an i/b tag sits directly next to an apostrophe)
    if (!Regex.IsMatch(ArticleText, "'</?[ib]>|</?[ib]>'", RegexOptions.IgnoreCase))
    {
        ArticleText = SyntaxRegexItalic.Replace(ArticleText, "''$1''");
        ArticleText = SyntaxRegexBold.Replace(ArticleText, "'''$1'''");
    }

    // normalise horizontal rules at the start of a line
    ArticleText = Regex.Replace(ArticleText, "^<hr>|^----+", "----", RegexOptions.Multiline);

    //remove appearance of double line break
    ArticleText = Regex.Replace(ArticleText, "(^==?[^=]*==?)\r\n(\r\n)?----+", "$1", RegexOptions.Multiline);

    //remove unnecessary namespace
    ArticleText = SyntaxRegexTemplate.Replace(ArticleText, "$1$2");

    //remove <br> from lists
    ArticleText = SyntaxRegex11.Replace(ArticleText, "$1\r\n");

    //can cause problems
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]2>(.*?)</[Hh]2>", "==$1==", RegexOptions.Multiline);
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]3>(.*?)</[Hh]3>", "===$1===", RegexOptions.Multiline);
    //ArticleText = Regex.Replace(ArticleText, "^<[Hh]4>(.*?)</[Hh]4>", "====$1====", RegexOptions.Multiline);

    //fix uneven bracketing on links
    // (skipped when an image link contains a URL)
    if (!Regex.IsMatch(ArticleText, "\\[\\[[Ii]mage:[^]]*http"))
    {
        ArticleText = SyntaxRegex1.Replace(ArticleText, "[http://$1]");
        ArticleText = SyntaxRegex2.Replace(ArticleText, "[http://$1]");
        ArticleText = SyntaxRegex3.Replace(ArticleText, "[http://$1]");
        ArticleText = SyntaxRegex4.Replace(ArticleText, "[[$1]]$2");
        ArticleText = SyntaxRegex5.Replace(ArticleText, "$1[[$2]]$3");
    }

    //repair bad external links
    ArticleText = SyntaxRegex6.Replace(ArticleText, "[$1]");

    //repair bad internal links
    ArticleText = SyntaxRegex7.Replace(ArticleText, "[[$1]]");
    ArticleText = SyntaxRegex8.Replace(ArticleText, "[[$1]]");
    // NOTE(review): group 2 (the section) is captured by the pattern, so it is kept here; the
    // garbled copy showed only "$1" - confirm.
    ArticleText = SyntaxRegex9.Replace(ArticleText, "[[$1#$2]]");

    ArticleText = Regex.Replace(ArticleText, "ISBN: ?([0-9])", "ISBN $1");

    return ArticleText.Trim();
}

/// <summary>
/// Fixes link syntax: replaces underscores with spaces, removes spaces around the pipe and
/// URL-decodes the link text.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text.</returns>
public string FixLinks(string ArticleText, out bool NoChange)
{
    testText = ArticleText;

    string y = "";
    string cat = "[[" + Variables.Namespaces[14];

    foreach (Match m in WikiRegexes.SimpleWikiLink.Matches(ArticleText))
    {
        // Category and image links, and links whose target or label starts with an underscore,
        // keep their underscores and pipe spacing.
        if (!m.Value.StartsWith(cat) && !m.Value.StartsWith("[[Image:") && !m.Value.StartsWith("[[image:") && !m.Value.StartsWith("[[_") && !m.Value.Contains("|_"))
        {
            y = m.Value.Replace("_", " ");
            y = Regex.Replace(y, " ?\\| ?", "|");
        }
        else
        {
            y = m.Value;
        }

        // UrlDecode turns "+" into a space, so protect literal plus signs first.
        y = y.Replace("+", "%2B");
        y = HttpUtility.UrlDecode(y);

        // Replaces every occurrence of this link text, not just the matched one.
        ArticleText = ArticleText.Replace(m.Value, y);
    }

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Simplifies some links in article wiki text such as changing [[Dog|Dogs]] to [[Dog]]s
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The simplified article text.</returns>
public string LinkSimplifier(string ArticleText, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = LinkSimplifier(ArticleText);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Simplifies some links in article wiki text such as changing [[Dog|Dogs]] to [[Dog]]s
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The simplified article text.</returns>
public string LinkSimplifier(string ArticleText)
{
    // NOTE(review): this copy of the file had the "[[" / "]]" stripped from the replacement
    // strings, which removed the link instead of simplifying it; they are restored here -
    // confirm against upstream.
    string n = "";
    string a = "";
    string b = "";
    string k = "";

    foreach (Match m in WikiRegexes.PipedWikiLink.Matches(ArticleText))
    {
        n = m.Value;
        a = m.Groups[1].Value;   // link target
        b = m.Groups[2].Value;   // displayed text

        if (a == b || Tools.TurnFirstToLower(a) == b)
        {
            // [[Dog|dog]] -> [[dog]]
            k = WikiRegexes.PipedWikiLink.Replace(n, "[[$2]]");
            ArticleText = ArticleText.Replace(n, k);
        }
        else if (a + "s" == b || Tools.TurnFirstToLower(a) + "s" == b)
        {
            // [[Dog|Dogs]] -> [[Dog]]s : take the displayed text, drop its final "s" and put
            // the "s" after the closing brackets.
            k = WikiRegexes.PipedWikiLink.Replace(n, "$2");
            k = "[[" + k.Substring(0, k.Length - 1) + "]]s";
            ArticleText = ArticleText.Replace(n, k);
        }
    }

    return ArticleText;
}

/// <summary>
/// Adds bullet points to external links after "external links" header
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text.</returns>
public string BulletExternalLinks(string ArticleText, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = BulletExternalLinks(ArticleText);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Adds bullet points to external links after "external links" header
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text; unchanged when no such header is found.</returns>
public string BulletExternalLinks(string ArticleText)
{
    // RightToLeft: pick the last "external links" heading in the text.
    Match m = Regex.Match(ArticleText, "= ? ?external links? ? ?=", RegexOptions.IgnoreCase | RegexOptions.RightToLeft);

    if (!m.Success)
    {
        return ArticleText;
    }

    int intStart = m.Index;

    // Only the text from the heading onwards is rewritten.
    string ArticleTextSubstring = ArticleText.Substring(intStart);
    ArticleText = ArticleText.Substring(0, intStart);

    // A line starting with a URL or "[http" gets "* " in front; a preceding blank line is dropped.
    ArticleTextSubstring = Regex.Replace(ArticleTextSubstring, "(\r\n)?(\r\n)(\\[?http)", "$2* $3");
    ArticleText += ArticleTextSubstring;

    return ArticleText;
}

/// <summary>
/// Fix common spacing/capitalisation errors in categories
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The article text with every category link rewritten in canonical form.</returns>
public string FixCategories(string ArticleText)
{
    Regex categoryLink = new Regex("\\[\\[ ?" + Variables.NamespacesCaseInsensitive[14] + " ?(.*?)\\]\\]");
    string canonicalPrefix = "[[" + Variables.Namespaces[14];

    foreach (Match found in categoryLink.Matches(ArticleText))
    {
        // Rebuild the link from the canonical namespace prefix and the captured name,
        // with underscores turned into spaces, and swap it in wherever the old text occurs.
        string rebuilt = canonicalPrefix + found.Groups[1].Value.Replace("_", " ") + "]]";
        ArticleText = ArticleText.Replace(found.Value, rebuilt);
    }

    return ArticleText;
}

#endregion

#region other functions

/// <summary>
/// Converts HTML entities to unicode, with some deliberate exceptions
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text.</returns>
public string Unicodify(string ArticleText, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = Unicodify(ArticleText);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Converts HTML entities to unicode, with some deliberate exceptions
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The modified article text; text containing a math tag is returned unchanged.</returns>
public string Unicodify(string ArticleText)
{
    if (Regex.IsMatch(ArticleText, "<[Mm]ath>"))
    {
        return ArticleText;
    }

    // Numeric dash entities: en dash becomes the character, em dash becomes the named entity.
    ArticleText = Regex.Replace(ArticleText, "&#150;|&#8211;|&#x2013;", "–");
    ArticleText = Regex.Replace(ArticleText, "&#151;|&#8212;|&#x2014;", "&mdash;");
    ArticleText = ArticleText.Replace(" &amp; ", " & ");
    // Double-escape the remaining "&amp;" so that HtmlDecode below leaves "&amp;" in the text.
    ArticleText = ArticleText.Replace("&amp;", "&amp;amp;");

    // Same trick for every entity listed in RegexUnicode: escape its ampersand so the
    // decode step turns it back into the original entity instead of a character.
    foreach (KeyValuePair<Regex, string> k in RegexUnicode)
    {
        ArticleText = k.Key.Replace(ArticleText, k.Value);
    }

    try
    {
        ArticleText = HttpUtility.HtmlDecode(ArticleText);
    }
    catch (Exception ex)
    {
        System.Windows.Forms.MessageBox.Show(ex.ToString());
    }

    return ArticleText;
}

/// <summary>
/// Emboldens the first occurrence of the title, if it isn't already
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The modified article text.</returns>
public string BoldTitle(string ArticleText, string ArticleTitle, out bool NoChange)
{
    //ignore date articles
    if (WikiRegexes.Dates2.IsMatch(ArticleTitle))
    {
        NoChange = true;
        return ArticleText;
    }

    string escTitle = Regex.Escape(ArticleTitle);

    //remove self links first
    Regex tregex = new Regex("\\[\\[(" + Tools.CaseInsensitive(escTitle) + ")\\]\\]");
    if (!ArticleText.Contains("'''"))
    {
        // NOTE(review): as found in this copy of the file; the replacement may originally have
        // wrapped $1 in bold markup - confirm against upstream.
        ArticleText = tregex.Replace(ArticleText, "$1", 1);
    }
    else
    {
        // NOTE(review): this copy of the file had the "[[" / "]]" stripped, which made both
        // calls replace the title with itself; the brackets are restored - confirm.
        ArticleText = ArticleText.Replace("[[" + ArticleTitle + "]]", ArticleTitle);
        ArticleText = ArticleText.Replace("[[" + Tools.TurnFirstToLower(ArticleTitle) + "]]", Tools.TurnFirstToLower(ArticleTitle));
    }

    // Nothing to embolden when the text starts with a link, bullet or indent, or when the
    // title is already bold somewhere.
    if (Regex.IsMatch(ArticleText, "^(\\[\\[|\\*|:)") || Regex.IsMatch(ArticleText, "''' ?" + escTitle + " ?'''", RegexOptions.IgnoreCase))
    {
        NoChange = true;
        return ArticleText;
    }

    ArticleText = hider.HideMore(ArticleText);

    // Match the title without a trailing parenthesised qualifier, e.g. "Foo (band)" -> "Foo".
    escTitle = Regex.Replace(ArticleTitle, " \\(.*?\\)$", "");
    escTitle = Regex.Escape(escTitle);

    Regex regexBold = new Regex("([^\\[]|^)(" + escTitle + ")([ ,.:;])", RegexOptions.IgnoreCase);

    // Only the first 80 characters are searched; the rest is set aside and re-attached.
    string strSecondHalf = "";
    if (ArticleText.Length > 80)
    {
        strSecondHalf = ArticleText.Substring(80);
        ArticleText = ArticleText.Substring(0, 80);
    }

    if (ArticleText.Contains("'''"))
    {
        ArticleText = ArticleText + strSecondHalf;
        ArticleText = hider.AddBackMore(ArticleText);
        NoChange = true;
        return ArticleText;
    }

    if (regexBold.IsMatch(ArticleText))
    {
        NoChange = false;
        // NOTE(review): this copy of the file had "$1$2$3" here, which replaces the match with
        // itself; the bold markup around the title is restored - confirm.
        ArticleText = regexBold.Replace(ArticleText, "$1'''$2'''$3", 1);
    }
    else
    {
        NoChange = true;
    }

    ArticleText = ArticleText + strSecondHalf;
    ArticleText = hider.AddBackMore(ArticleText);
    return ArticleText;
}

/// <summary>
/// Replaces an image in the article.
/// </summary>
/// <param name="OldImage">The old image to replace.</param>
/// <param name="NewImage">The new image.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The new article text.</returns>
public string ReplaceImage(string OldImage, string NewImage, string ArticleText, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = ReplaceImage(OldImage, NewImage, ArticleText);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Replaces an image in the article.
/// </summary>
/// <param name="OldImage">The old image to replace.</param>
/// <param name="NewImage">The new image.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string ReplaceImage(string OldImage, string NewImage, string ArticleText)
{
    //remove image prefix
    OldImage = Regex.Replace(OldImage, "^" + Variables.Namespaces[6], "", RegexOptions.IgnoreCase).Replace("_", " ");
    NewImage = Regex.Replace(NewImage, "^" + Variables.Namespaces[6], "", RegexOptions.IgnoreCase).Replace("_", " ");

    // Spaces in the old name match either a space or an underscore in the article.
    OldImage = Regex.Escape(OldImage).Replace("\\ ", "[ _]");

    OldImage = Variables.NamespacesCaseInsensitive[6] + Tools.CaseInsensitive(OldImage);
    NewImage = Variables.Namespaces[6] + NewImage;

    // The new name is literal text, not a replacement pattern: double any "$" so it is not
    // read as a group substitution.
    ArticleText = Regex.Replace(ArticleText, OldImage, NewImage.Replace("$", "$$"));

    return ArticleText;
}

/// <summary>
/// Removes an image from the article, or comments it out.
/// </summary>
/// <param name="Image">The image to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="CommentOut">True to wrap the image in an HTML comment instead of deleting it.</param>
/// <param name="Comment">Text placed at the start of the HTML comment when commenting out.</param>
/// <returns>The new article text.</returns>
public string RemoveImage(string Image, string ArticleText, bool CommentOut, string Comment)
{
    // NOTE(review): this copy of the file had an empty replacement in the CommentOut branches,
    // leaving CommentOut and Comment without effect; the HTML-comment wrapper is rebuilt from
    // the parameter names - confirm the exact format against upstream.

    //remove image prefix
    Image = Regex.Replace(Image, "^" + Variables.Namespaces[6], "", RegexOptions.IgnoreCase).Replace("_", " ");
    Image = Regex.Escape(Image).Replace("\\ ", "[ _]");
    Image = Tools.CaseInsensitive(Image);

    Regex r = new Regex("\\[\\[" + Variables.NamespacesCaseInsensitive[6] + Image + ".*\\]\\]");
    MatchCollection n = r.Matches(ArticleText);

    if (n.Count > 0)
    {
        // Last match first: each edit then only moves text after the matches still to be
        // processed, so their Index values stay valid.
        for (int k = n.Count - 1; k >= 0; k--)
        {
            Match m = n[k];
            string match = m.Value;

            int i = 0;
            int j = 0;

            // The greedy ".*" may run past the image link; walk the brackets and cut the match
            // where they first balance.
            foreach (char c in match)
            {
                if (c == '[')
                    j++;
                else if (c == ']')
                    j--;

                i++;

                if (j == 0)
                {
                    if (match.Length > i)
                        match = match.Remove(i);

                    Regex t = new Regex(Regex.Escape(match));

                    if (CommentOut)
                        ArticleText = t.Replace(ArticleText, ("<!-- " + Comment + " " + match + " -->").Replace("$", "$$"), 1, m.Index);
                    else
                        ArticleText = t.Replace(ArticleText, "", 1, m.Index);

                    break;
                }
            }
        }
    }
    else
    {
        // No bracketed image link found: fall back to the bare file name, with or without
        // the namespace prefix.
        r = new Regex("(" + Variables.NamespacesCaseInsensitive[6] + ")?" + Image);
        n = r.Matches(ArticleText);

        for (int k = n.Count - 1; k >= 0; k--)
        {
            Match m = n[k];
            Regex t = new Regex(Regex.Escape(m.Value));

            if (CommentOut)
                ArticleText = t.Replace(ArticleText, ("<!-- " + Comment + " " + m.Value + " -->").Replace("$", "$$"), 1, m.Index);
            else
                ArticleText = t.Replace(ArticleText, "", 1, m.Index);
        }
    }

    return ArticleText;
}

/// <summary>
/// Removes an image from the article, or comments it out.
/// </summary>
/// <param name="Image">The image to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="CommentOut">Passed through to the four-argument overload.</param>
/// <param name="Comment">Passed through to the four-argument overload.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The new article text.</returns>
public string RemoveImage(string Image, string ArticleText, bool CommentOut, string Comment, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = RemoveImage(Image, ArticleText, CommentOut, Comment);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Adds the category to the article.
/// </summary>
/// <param name="NewCategory">The new category.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">Value that indicates whether no change was made.</param>
/// <returns>The article text.</returns>
public string AddCategory(string NewCategory, string ArticleText, string ArticleTitle, out bool NoChange)
{
    testText = ArticleText;
    ArticleText = AddCategory(NewCategory, ArticleText, ArticleTitle);

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Appends a category link to the article unless the article already carries that category.
/// </summary>
/// <param name="NewCategory">The new category, without the namespace prefix.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article; passed to Tools.ApplyKeyWords and used to detect pages in the template namespace.</param>
/// <returns>The article text, with the category link appended when it was not already present.</returns>
public string AddCategory(string NewCategory, string ArticleText, string ArticleTitle)
{
    // The name must be followed by "|" or "]]"; a bare prefix test would treat
    // "Foo" as already present when the article only has "Foobar".
    string alreadyPresent = "\\[\\[ ?[Cc]ategory ?: ?" + Regex.Escape(NewCategory) + " ?(\\||\\]\\])";
    if (Regex.IsMatch(ArticleText, alreadyPresent))
    {
        return ArticleText;
    }

    // The link needs its [[ ]] delimiters, otherwise plain text is appended
    // and the presence check above could never recognise it on a later run.
    string cat = "\r\n[[" + Variables.Namespaces[14] + NewCategory + "]]";
    cat = Tools.ApplyKeyWords(ArticleTitle, cat);

    if (ArticleTitle.StartsWith(Variables.Namespaces[10], StringComparison.Ordinal))
    {
        // On a template page the category is wrapped in <noinclude> so that it
        // applies to the template itself and not to every page transcluding it.
        ArticleText += "<noinclude>" + cat + "\r\n</noinclude>";
    }
    else
    {
        ArticleText += cat;
    }

    return ArticleText;
}

/// <summary>
/// Re-categorises the article: replaces the old category with the new one, or simply
/// removes the old category when the new one is already present.
/// </summary>
/// <param name="OldCategory">The old category to replace; a leading namespace prefix is stripped.</param>
/// <param name="NewCategory">The new category; a leading namespace prefix is stripped.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">
/// Set to true when the re-categorisation step left the text unchanged. The comparison
/// baseline is taken after FixCategories has run, so changes made by FixCategories alone
/// are not reported here.
/// </param>
/// <returns>The re-categorised article text.</returns>
public string ReCategoriser(string OldCategory, string NewCategory, string ArticleText, out bool NoChange)
{
    // Remove the category prefix. The prefix is escaped because it is used as a regex.
    string prefixPattern = "^" + Regex.Escape(Variables.Namespaces[14]);
    OldCategory = Regex.Replace(OldCategory, prefixPattern, "", RegexOptions.IgnoreCase);
    NewCategory = Regex.Replace(NewCategory, prefixPattern, "", RegexOptions.IgnoreCase);

    // Format categories properly before matching against them.
    ArticleText = FixCategories(ArticleText);

    testText = ArticleText;

    string newCategoryLink = "\\[\\[" + Variables.NamespacesCaseInsensitive[14]
        + Tools.CaseInsensitive(Regex.Escape(NewCategory)) + "( ?\\|| ?\\]\\])";

    if (Regex.IsMatch(ArticleText, newCategoryLink))
    {
        // The new category is already there; only the old one has to go.
        ArticleText = RemoveCategory(OldCategory, ArticleText);
    }
    else
    {
        string oldPattern = Regex.Escape(Variables.Namespaces[14])
            + Tools.CaseInsensitive(Regex.Escape(OldCategory)) + "( ?\\|| ?\\]\\])";

        // "$" is a substitution marker in a regex replacement string, so a literal
        // "$" in the category name must be doubled. "$1" re-emits the captured
        // terminator ("|" or "]]") so any sort key after it is preserved.
        string replacement = (Variables.Namespaces[14] + NewCategory).Replace("$", "$$") + "$1";

        ArticleText = Regex.Replace(ArticleText, oldPattern, replacement);
    }

    NoChange = (testText == ArticleText);

    return ArticleText;
}

/// <summary>
/// Removes a category from an article and reports whether the text changed.
/// </summary>
/// <param name="strOldCat">The old category to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to true when the returned text equals the text passed in.</param>
/// <returns>The article text without the old category.</returns>
public string RemoveCategory(string strOldCat, string ArticleText, out bool NoChange)
{
    // Keep a snapshot of the input in the shared scratch field for the comparison below.
    testText = ArticleText;

    string updated = RemoveCategory(strOldCat, ArticleText);
    NoChange = (testText == updated);

    return updated;
}

/// <summary>
/// Removes a category from an article.
/// </summary>
/// <param name="strOldCat">The old category to remove.</param>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The article text without the old category.</returns>
public string RemoveCategory(string strOldCat, string ArticleText)
{
    // Format categories properly before matching against them.
    ArticleText = FixCategories(ArticleText);

    string categoryName = Tools.CaseInsensitive(Regex.Escape(strOldCat));

    // Matches the whole category link, with or without a "|sort key" part.
    string categoryLink = "\\[\\[" + Variables.NamespacesCaseInsensitive[14] + " ?" + categoryName
        + "( ?\\]\\]| ?\\|[^\\|]*?\\]\\])";

    // Done in two passes to avoid removing a newline when that is not desirable:
    // first the link together with a line break that is directly followed by "[",
    // then any remaining occurrence of the link on its own.
    string withoutLeadingLinks = Regex.Replace(ArticleText, categoryLink + "\r\n\\[", "[");

    return Regex.Replace(withoutLeadingLinks, categoryLink, "");
}

/// <summary>
/// Appends "Category:Living people" to an article that has a "YYYY births" category
/// with a year of 1910 or later and none of the death/living/missing categories
/// listed in the exclusion pattern below.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to false only when the category was appended.</param>
/// <returns>The article text, with the category appended when the conditions hold.</returns>
public string LivingPeople(string ArticleText, out bool NoChange)
{
    const string excludingCategories = "\\[\\[ ?Category ?:[ _]?([0-9]{1,2}[ _]century[ _]deaths|[0-9s]{4,5}[ _]deaths|Disappeared[ _]people|Living[ _]people|Year[ _]of[ _]death[ _]missing|Possibly[ _]living[ _]people)";
    const string birthCategory = "\\[\\[ ?Category ?:[ _]?([0-9]{4})[ _]births(\\|.*?)?\\]\\]";

    NoChange = true;
    testText = ArticleText;

    // Nothing to do when the article is already sorted into one of these categories.
    if (Regex.IsMatch(ArticleText, excludingCategories, RegexOptions.IgnoreCase))
    {
        return ArticleText;
    }

    Match birth = Regex.Match(ArticleText, birthCategory, RegexOptions.IgnoreCase);

    // Requires a birth-year category, and the year must not be earlier than 1910.
    if (!birth.Success || int.Parse(birth.Groups[1].Value) < 1910)
    {
        return ArticleText;
    }

    // Group 2 holds the "|sort key" part of the birth category when there is one
    // (empty otherwise), so the new category reuses the same sort key.
    string catKey = birth.Groups[2].Value + "]]";

    ArticleText += "[[Category:Living people" + catKey;

    // Text was appended, so the result necessarily differs from the input.
    NoChange = false;

    return ArticleText;
}

/// <summary>
/// Converts/substs some deprecated templates and reports whether the text changed.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="NoChange">Set to true when the returned text equals the text passed in.</param>
/// <returns>The new article text.</returns>
public string Conversions(string ArticleText, out bool NoChange)
{
    // Keep a snapshot of the input in the shared scratch field for the comparison below.
    testText = ArticleText;

    string updated = Conversions(ArticleText);
    NoChange = (testText == updated);

    return updated;
}

/// <summary>
/// Converts/substs some deprecated templates: rewrites a few interwiki prefixes,
/// drops the "msg:" template prefix, then applies every RegexConversion entry.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string Conversions(string ArticleText)
{
    // Use proper codes, then strip the "msg:" prefix from template calls.
    string result = ArticleText
        .Replace("[[zh-tw:", "[[zh:")
        .Replace("[[nb:", "[[no:")
        .Replace("[[dk:", "[[da:")
        .Replace("{{msg:", "{{");

    // Each entry maps a regex to its replacement string; applied in enumeration order.
    foreach (KeyValuePair<Regex, string> conversion in RegexConversion)
    {
        result = conversion.Key.Replace(result, conversion.Value);
    }

    return result;
}

/// <summary>
/// Substs some user talk templates (test, 3RR, spam and welcome families).
/// </summary>
/// <param name="TalkPageText">The wiki text of the talk page.</param>
/// <returns>The new text.</returns>
public string SubstUserTemplates(string TalkPageText)
{
    // Applied in this order. In every pattern group 2 is the template call
    // without the optional "template:" prefix.
    string[] templatePatterns = new string[]
    {
        "\\{\\{(template:)?(test[n0-6]?[ab]?)\\}\\}",
        "\\{\\{(template:)?(test[n0-6]?[ab]?-n\\|.*?)\\}\\}",
        "\\{\\{(template:)?(3RR[0-5]?)\\}\\}",
        "\\{\\{(template:)?(spam[0-5][ab]?)\\}\\}",
        "\\{\\{(template:)?(spam[0-5]?-n\\|.*?)\\}\\}",
        "\\{\\{(template:)?(welcome[0-6]|welcomeip|anon|welcome-anon)\\}\\}"
    };

    string result = TalkPageText;

    foreach (string pattern in templatePatterns)
    {
        result = Regex.Replace(result, pattern, "{{subst:$2}}", RegexOptions.IgnoreCase);
    }

    return result;
}

/// <summary>
/// If necessary, adds/removes wikify or stub tag, and reports whether the text changed.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article.</param>
/// <param name="NoChange">Set to true when the returned text equals the text passed in.</param>
/// <param name="Summary">Edit summary; forwarded by reference to the three-argument overload.</param>
/// <returns>The new article text.</returns>
public string Tagger(string ArticleText, string ArticleTitle, out bool NoChange, ref string Summary)
{
    // Keep a snapshot of the input in the shared scratch field for the comparison below.
    testText = ArticleText;

    string updated = Tagger(ArticleText, ArticleTitle, ref Summary);
    NoChange = (testText == updated);

    return updated;
}

/// <summary>
/// Adds an uncategorised, wikify or stub tag where the article seems to need one, and
/// removes stub tags from articles longer than StubMaxWordCount words.
/// Redirects and pages outside the main namespace are returned untouched.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <param name="ArticleTitle">The title of the article; used to test for the main namespace.</param>
/// <param name="Summary">Edit summary; a short note is appended for every tag that is added.</param>
/// <returns>The new article text.</returns>
public string Tagger(string ArticleText, string ArticleTitle, ref string Summary)
{
    // Date parameter for the maintenance tags. The magic words are substituted by
    // MediaWiki when the page is saved, so the tag always carries the month and year
    // of the edit instead of a date frozen into this source file.
    const string tagDate = "{{subst:CURRENTMONTHNAME}} {{subst:CURRENTYEAR}}";

    if (Tools.IsRedirect(ArticleText) || !Tools.IsMainSpace(ArticleTitle))
    {
        return ArticleText;
    }

    // All measurements are taken on the text as it was passed in.
    double Length = ArticleText.Length + 1;

    string CommentsStripped = WikiRegexes.Comments.Replace(ArticleText, "");
    int words = Tools.WordCount(CommentsStripped);

    // Update by-date tags.
    foreach (KeyValuePair<Regex, string> k in RegexTagger)
    {
        ArticleText = k.Key.Replace(ArticleText, k.Value);
    }

    // Remove stub tags from long articles; stubChecker decides which matches are kept.
    if (words > StubMaxWordCount && WikiRegexes.Stub.IsMatch(CommentsStripped))
    {
        ArticleText = WikiRegexes.Stub.Replace(ArticleText, new MatchEvaluator(stubChecker));

        // Trim must be invoked; a bare "Trim" is a method group, not a call.
        ArticleText = ArticleText.Trim();
    }

    // Any template other than a stub template stops further tagging. The name test
    // ignores case because MediaWiki treats {{Stub}} and {{stub}} as the same template.
    foreach (Match m in WikiRegexes.Template.Matches(ArticleText))
    {
        if (m.Value.IndexOf("stub", StringComparison.OrdinalIgnoreCase) < 0)
        {
            return ArticleText;
        }
    }

    double LinkCount = Tools.LinkCount(CommentsStripped);
    double Ratio = LinkCount / Length;

    if (words > 6 && !WikiRegexes.Category.IsMatch(CommentsStripped) && !Regex.IsMatch(ArticleText, @"\{\{[Uu]ncategori[zs]ed"))
    {
        // No category at all: append the tag matching the article's stub status.
        if (WikiRegexes.Stub.IsMatch(CommentsStripped))
        {
            ArticleText += "\r\n\r\n{{Uncategorizedstub|" + tagDate + "}}";
        }
        else
        {
            ArticleText += "\r\n\r\n{{Uncategorized|" + tagDate + "}}";
        }

        Summary += ", added uncategorised tag";
    }
    else if (LinkCount < 3 && (Ratio < 0.0025))
    {
        // Very few links relative to the text length: put a wikify tag at the top.
        ArticleText = "{{Wikify|" + tagDate + "}}\r\n\r\n" + ArticleText;
        Summary += ", added wikify tag";
    }
    else if (CommentsStripped.Length <= 300 && !WikiRegexes.Stub.IsMatch(CommentsStripped))
    {
        // Short article without a stub tag.
        ArticleText = ArticleText + "\r\n\r\n\r\n{{stub}}";
        Summary += ", added stub tag";
    }

    return ArticleText;
}

/// <summary>
/// Match evaluator used when stripping stub tags: a match that also matches
/// Variables.SectStub is kept as it is, any other match is replaced by an empty string.
/// </summary>
/// <param name="m">The stub-tag match being evaluated.</param>
/// <returns>The original match text, or an empty string when the tag is to be removed.</returns>
private string stubChecker(Match m)
{
    bool keep = Regex.IsMatch(m.Value, Variables.SectStub);

    return keep ? m.Value : "";
}

#endregion

#region unused

/// <summary>
/// Bypasses all redirects in the article: every plain wiki link whose target page is a
/// redirect is rewritten as a piped link that points at the redirect's target while
/// still displaying the original link text. Disambiguation pages are not handled (TODO).
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The article text with redirecting links rewritten.</returns>
public string BypassRedirects(string ArticleText)
{
    // The matches are collected once, from the text as it was passed in.
    foreach (Match m in WikiRegexes.WikiLinksOnly.Matches(ArticleText))
    {
        string link = m.Value;
        string article = m.Groups[1].Value;

        // Fetch the linked page's text. This is best-effort: a page that cannot be
        // fetched is skipped so one failure does not abort the whole pass.
        string text;
        try
        {
            text = Tools.GetArticleText(article);
        }
        catch
        {
            continue;
        }

        if (Tools.IsRedirect(text))
        {
            string target = Tools.RedirectTarget(text).Replace("_", " ");

            // Link to the target but keep the original wording visible. The target
            // must go into the link; otherwise the brackets are simply dropped
            // and the link degrades to plain text.
            string directLink = "[[" + target + "|" + article + "]]";

            ArticleText = ArticleText.Replace(link, directLink);
        }
    }

    return ArticleText;
}

/// <summary>
/// Fixes minor problems, such as abbreviations and miscapitalisations: expands
/// "a.k.a.", "e.g." and "i.e.", and capitalises lower-case day and month names.
/// </summary>
/// <param name="ArticleText">The wiki text of the article.</param>
/// <returns>The new article text.</returns>
public string MinorThings(string ArticleText)
{
    ArticleText = Regex.Replace(ArticleText, "[Aa]\\.[Kk]\\.[Aa]\\.?", "also known as");

    ArticleText = ArticleText.Replace("e.g.", "for example");
    ArticleText = ArticleText.Replace("i.e.", "that is");

    // "march" and "may" are not in the list. The leading \b restricts the match to the
    // start of a word, so a name buried inside a longer token (for example
    // "blackfriday") is left alone, while plurals such as "sundays" are still
    // capitalised. Each match is rewritten in place in a single pass.
    ArticleText = Regex.Replace(
        ArticleText,
        "\\b(monday|tuesday|wednesday|thursday|friday|saturday|sunday|january|february|april|june|july|august|september|october|november|december)",
        delegate(Match m) { return Tools.TurnFirstToUpper(m.Value); });

    return ArticleText;
}

// to Dog //private string ExtToInternalLinks(string ArticleText) //{       //    foreach (Match m in Regex.Matches(ArticleText, "\\")) //   {        //        string a = HttpUtility.UrlDecode(m.ToString);

//       if (a.Contains(" ")) //       {        //            int intP; //           //string a = n;        //            intP = a.IndexOf(" ");

//           string b = a.Substring(intP); //           a = a.Remove(intP); //           b = b.TrimStart; //           a = a.Replace("_", " ");

//           ArticleText = ArticleText.Replace(m.ToString, a); //       }        //    }

//   ArticleText = Regex.Replace(ArticleText, "\\", "$1"); //   return ArticleText; //}

#endregion } }