Wikipedia:Bots/Requests for approval/BHGbot 9/Step5 checker

// AWB custom module to remove the {{Cleanup bare URLs}} banner when the page has no remaining bare URLs
// v0.07, 18 October 2021
// -- BHG

// NOTE: this version is hacked for testing purposes.
// It skips all pages except those which reach Step 5 and then fail there.

/// <summary>
/// Bot name-and-version label. It is used as the prefix of edit summaries
/// and inside the debug messages built by this module.
/// </summary>
/// <remarks>
/// Declared as a read-only property (with an explicit <c>get</c> accessor) because
/// the rest of the module reads it as <c>botNV</c>, without call parentheses.
/// </remarks>
public string botNV {
    get {
        string botName = "WP:BHGbot 9";
        string botVersion = "0.07 checker";
        string botTrial = " Trial";
        // string botTrial = "";   // alternative value, disabled in this build

        // NOTE(review): name and version are joined by a bare "v", which yields
        // "WP:BHGbot 9v0.07 checker Trial" -- confirm whether " v" was intended.
        return botName + "v" + botVersion + botTrial;
    }
}

/// <summary>
/// Checks whether a page carrying a "Cleanup bare URLs" banner (or one of its aliases)
/// still contains bare URLs.
/// </summary>
/// <remarks>
/// This is the "Step 5 checker" build: pages that pass every test are skipped rather than
/// edited, and only pages that get as far as Step 5 and then fail are returned (unskipped),
/// with a "STEP 5 FAIL" marker prepended to the unmodified article text.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Page title. Not used by this module.</param>
/// <param name="wikiNamespace">Namespace number of the page. Not used by this module.</param>
/// <param name="Summary">Edit summary to use if the returned text is saved.</param>
/// <param name="Skip">Set to true when the page should be skipped.</param>
/// <returns>
/// The original text when the page is skipped; the original text prefixed with
/// "STEP 5 FAIL" when URLs remain after Step 5; or a debug message when the local
/// <c>debugging</c> flag is switched on.
/// </returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip) {
    Skip = false;
    Summary = botNV + ": ";

    // ---- DECLARATIONS ----
    bool debugging = false;
    string debuggingEditSummary = "This is a test to debug " + botNV + ". This edit should not have been saved, so please revert it";
    // NOTE(review): the text after "Removed" looks as if a template name has been lost -- confirm the intended wording.
    string successEditSummary = "Removed . This page currently has no bare URLs";

    // ---- REGEXES ----

    // The banner template and its aliases, with optional "Template:" prefix and optional parameters.
    string CleanupBareURLsTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Cc]leanup[_ ]+bare[_ ]+URLs|[Bb]are[_ ]+|[Bb]are|[Bb]are[_ ]+link|[Bb]are[_ ]+linkname|[Bb]are[_ ]+links|[Bb]are[_ ]+references|[Bb]are[_ ]+refs|[Bb]are[_ ]+URL|[Bb]are[_ ]+URLs|[Bb]are-URLs|[Bb]arelinks|[Bb]areURL|[Bb]areURLs|[Cc]leanup[_ ]+bare-URLs|[Cc]leanup[_ ]+link[_ ]+rot|[Cc]leanup[_ ]+link-rot|[Cc]leanup-Bare[_ ]+URLs|[Cc]leanup-barelinks|[Cc]leanup-link[_ ]+rot|[Cc]leanup-link-rot|[Cc]leanup-linkrot|[Cc]UBURL|[Ll]ink[_ ]+rot|[Ll]INKROT|[Ll]R) *(\|[^\}]*)?\}\}";

    // The inline "bare URL" tag and its aliases.
    string bareURLinlineTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Bb]are[_ ]+URL[\- ]inline|[Ll]inkrot-inline|[Bb]are-inline|[Bb]are[_ ]+inline|[Bb]are[_ ]+url[_ ]+inline|[Bb]are-url[_ ]+inline|[Bb]are[_ ]+link[_ ]+inline|[Bb]are-link-inline|[Bb]are-url-inline|[Bb]are[_ ]+url) *(\|[^\}]*)?\}\}";

    // A <ref> whose whole content is a URL, optionally wrapped in single square brackets.
    // The opening "<ref[^>" was missing (the pattern began with a dangling "]*?>") and has been restored.
    string bareURLinlineRefMatcher = @"<ref[^>]*?>\s*\[?\s*https?:[^>< \|\[\]]+\s*\]?\s*<\s*/\s*ref\s*>";

    // Any <ref>...</ref> pair whose content holds no further angle brackets (same restoration as above).
    string completeRefTagMatcher = @"<ref[^>]*?>[^<>]*<\s*/\s*ref\s*>";

    // Cite/Citation templates. Crude: the body may not contain "]" or "{",
    // so cites with nested templates or bracketed text are missed.
    string citeTemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Cc](ite|itation))[^\]\{]*\}\}";

    // The URL template and its aliases.
    string URLtemplateMatcher = @"{\{ *([tT]emplate *: *)?(URL|Websites|URLWww|URLUrlw|URLUrl|URLUR|URLSite|URLWebsite|URLپیوند وب)\s*(\|[^\}]*)?\}\}";

    // The "official website" / "official URL" templates and their aliases.
    string OfficialWebsiteOrOfficialURLtemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Oo]fficial[_ ]+URL|[Oo]fficial[_ ]+website|[Cc]onditionalURL|[Cc]onditional[_ ]+URL|[Gg]et[_ ]+URL[_ ]+from[_ ]+WikiData|[Oo]fficialURL|[Oo]fficial[_ ]+url|[Oo]fficialSite|[Oo]fficial|[Cc]ompany[_ ]+Website|[Oo]fficial[_ ]+site|[Oo]fficial[_ ]+Website|[Oo]ffficial[_ ]+website|[Oo]fficial[_ ]+web[_ ]+site|[Oo]fficial[_ ]+homepage|[Hh]omepage|[Hh]ome[_ ]+page|[Oo]fficialwebsite|[Mm]ain[_ ]+website|[Oo]fficialsite|[Oo]fficial[_ ]+webpage|[Oo]fficial[_ ]+Site|[Oo]web)\s*(\|[^\}]*)?\}\}";

    // A "|website=" or "|url=" template parameter whose value starts with a URL.
    string URLparameterMatcher = @"\|\s*(website|url)\s*=\s*https?:[^\|\}]*";

    // A bracketed external link that has link text, e.g. "[https://example.com foo]". A bit crude.
    string nonBareURLMatcher = @"\[\s*https?://[^>< \|\[\]]+\s+[^\]]+\]";

    // Any remaining URL.
    // NOTE(review): "(?!<\w)" is a negative LOOKAHEAD for "<" plus a word character, which can
    // never fail immediately before "http", so it has no effect. A negative lookbehind "(?<!\w)"
    // may have been intended. Left unchanged because the current form is the stricter test.
    string anyURLMatcher = @"(?!<\w)https?://\w";

    // ---- STEP 1: the page must contain the banner template (or an alias); otherwise skip it ----
    int CleanupBareURLsTagCount = Regex.Matches(ArticleText, CleanupBareURLsTagMatcher, RegexOptions.Singleline).Count;
    if (CleanupBareURLsTagCount == 0) {
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(1, false, "Page contains no CleanupBareURLsTagMatcher tag.", false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // Copy of the page with the banner removed. In this checker build it is only
    // ever handed to the debug message at Step 6; it is never returned for saving.
    string nuArticleText = Regex.Replace(ArticleText, CleanupBareURLsTagMatcher, "", RegexOptions.Singleline);

    // ---- STEP 2: count the inline bare-URL tags (including aliases) ----
    int bareURLinlineTagCount = Regex.Matches(ArticleText, bareURLinlineTagMatcher, RegexOptions.Singleline).Count;

    // ---- STEP 3: count the <ref> tags whose content is a bare URL ----
    // Case-insensitive so that it agrees with Step 5.A, which strips refs case-insensitively;
    // otherwise an upper-case <REF>url</REF> would be removed there without being counted here.
    int bareURLrefCount = Regex.Matches(ArticleText, bareURLinlineRefMatcher, RegexOptions.Singleline | RegexOptions.IgnoreCase).Count;

    // ---- STEP 4: if Step 2 or Step 3 found anything, the page still has bare URL refs: skip it ----
    if ((bareURLinlineTagCount + bareURLrefCount) > 0) {
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(4, false, "Page still has some bare URL refs.\n* bareURLinlineTagCount=" + bareURLinlineTagCount + "\n* bareURLrefCount=" + bareURLrefCount, false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // ---- STEP 5: look for bare URLs outside ref tags ----
    // Work on a scratch copy of the article, strip every construct in which a URL is
    // acceptable, and then see whether any URL is left over.
    RegexOptions scratchOptions = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    string testArticleText = ArticleText;

    // 5.A: all ref tags and their contents (bare URLs inside refs were already checked in Step 3).
    testArticleText = Regex.Replace(testArticleText, completeRefTagMatcher, "", scratchOptions);

    // 5.B: all cite/citation templates.
    testArticleText = Regex.Replace(testArticleText, citeTemplateMatcher, "", scratchOptions);

    // 5.C: all URL templates.
    testArticleText = Regex.Replace(testArticleText, URLtemplateMatcher, "", scratchOptions);

    // 5.D: all official-website / official-URL templates.
    testArticleText = Regex.Replace(testArticleText, OfficialWebsiteOrOfficialURLtemplateMatcher, "", scratchOptions);

    // 5.E: URLs given as a template parameter value,
    //      e.g. "|website=https://example.com" or "|url=https://example.com".
    testArticleText = Regex.Replace(testArticleText, URLparameterMatcher, "", scratchOptions);

    // 5.F: bracketed links that have link text, e.g. "[https://example.com foo]".
    testArticleText = Regex.Replace(testArticleText, nonBareURLMatcher, "", scratchOptions);

    // ---- STEP 6: does the scratch copy still contain any URL? ----
    int URLsremainingAfterRemovingNonBareURlsCount = Regex.Matches(testArticleText, anyURLMatcher, scratchOptions).Count;
    if (URLsremainingAfterRemovingNonBareURlsCount == 0) {
        // No bare URLs were found.
        if (debugging) {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(6, true, "Page contains no WP:Bare URLs.", true, nuArticleText);
        }
        // Checker build: a fully successful page is skipped and returned unmodified.
        Skip = true;
        Summary = botNV + ": " + successEditSummary;
        return ArticleText;
    }

    // FAILURE: URLs remain after Step 5. Return the page, marked, and do not skip it.
    Skip = false;
    return "STEP 5 FAIL \n\n\n" + ArticleText;
}

/// <summary>
/// Builds the diagnostic text that replaces the page content when debugging is switched on.
/// </summary>
/// <param name="stepNum">Number of the processing step that produced this message.</param>
/// <param name="testsOK">True to report "Success", false to report "Fail".</param>
/// <param name="debugMessage">Free-text notes to include in the report.</param>
/// <param name="textChanged">Whether <paramref name="pageText"/> differs from the original page text.</param>
/// <param name="pageText">Article text to show below the separator line.</param>
/// <returns>The report header followed by a separator line and the article text.</returns>
public string MakeDebugMsg(int stepNum, bool testsOK, string debugMessage, bool textChanged, string pageText) {
    string retval = "DEBUGGING " + botNV + ". --- This edit should NOT have been saved. Please revert.\n";

    retval = retval + "\nSTEP: " + stepNum;
    retval = retval + "\nSTATUS: " + (testsOK ? "Success" : "Fail");
    retval = retval + "\nNOTES: " + debugMessage;
    retval = retval + "\n\nArticle text follows below the line. ";
    retval = retval + (textChanged ? "This text has been modified" : "This is the original text, unmodified\n");
    retval = retval + "\n\n____________________________________";

    // The header announces that the article text follows, but the text itself was never
    // appended and pageText went unused. Append it below the separator line.
    retval = retval + "\n\n" + pageText;

    return retval;
}