User:CapitalBot

CapitalBot moves through all cities/towns/villages/etc in the United States and adds or updates Infobox Settlement using information from government databases. It is a manually-assisted bot, meaning that all edits are checked by its operator before saving. It is written in C# as a module for WP:AWB and operates here and there for a few hours at a time. Please post any problems or questions with this bot at the bot operator's talk page (User talk:CapitalR).

I would be happy to share the database with anyone interested (but you will need to be able to give me a location to upload it to, as it's about 50MB and I don't have a site that I can host it on).

FAQ

 * Q: What is a FIPS code and why is it being added to the infoboxes?
 * A: The FIPS code is a number assigned by the US federal government to identify a location. For example, FIPS codes are used by the Census Bureau to identify every location. In the infoboxes it has the form 00-00000, where the first two digits identify the state and the last five identify the location within the state. The FIPS codes are being added to the infoboxes so that future bots can easily map a Wikipedia article to its corresponding US census record and periodically check the accuracy of the data in the article.


 * Q: What is a GNIS feature ID and why is it being added to the infoboxes?
 * A: The feature ID is another identification number assigned by the US federal government, as part of the GNIS. This database contains geographic information on millions of places (cities, towns, CDPs, churches, schools, parks, etc.), including coordinates and elevation. The feature ID is being added to the infobox so that future bots can easily map a Wikipedia article to its corresponding geographic record and periodically check the accuracy of the data.

CapitalBot progress
Note: the status field is mostly for the bot operator's reference. A few percent of all articles were skipped for various reasons and will be completed at the end.

Source code
This is the source code, written in C# as an AWB module. Of course, it requires the Excel spreadsheet with all of the data, and a few minor modifications to AWB itself, but you can get an idea of how it works.

 using System; using System.Collections.Generic; using System.ComponentModel; using System.Drawing; using System.Text; using System.Windows.Forms; using System.Text.RegularExpressions; using Excel = Microsoft.Office.Interop.Excel; using Office = Microsoft.Office.Core;

namespace WindowsApplication1 {
class Module1 : WikiFunctions.Plugin.IModule {
/// <summary>
/// Module entry point: adds an infobox to the article, or updates the existing one,
/// from the spreadsheet data.
/// </summary>
/// <param name="ArticleText">Current wikitext of the article.</param>
/// <param name="ArticleTitle">Article title, used to look the place up in the database.</param>
/// <param name="wikiNamespace">Namespace of the article (not used here).</param>
/// <param name="Summary">Receives the edit summary.</param>
/// <param name="Skip">Set to true when the article must not be edited.</param>
/// <returns>The new wikitext, or the unmodified text when <paramref name="Skip"/> is true.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip) {
    Skip = false;
    Summary = "Infobox updates";
    bool strict = false;
    string originalText = ArticleText;

    // Nothing can be filled in without the spreadsheet.
    if (!LoadExcelDatabase()) {
        Skip = true;
        return ArticleText;
    }

    if (IsSkip(ArticleText, strict)) {
        Skip = true;
        return ArticleText;
    }

    if (Regex.IsMatch(ArticleText, "[{][{]\\s*" + infoboxaliases, RegexOptions.IgnoreCase)) {
        Summary = "Updating infobox parameters";
    } else {
        Summary = "Adding infobox";
        ArticleText = infoboxtemplate + ArticleText;
    }

    ArticleText = FillInInfobox(ArticleText, ArticleTitle);

    // FillInInfobox reports failure by returning a short message instead of an article.
    // Hand back the untouched text in that case so the message can never be saved.
    if (ArticleText.Length < 100) {
        Skip = true;
        return originalText;
    }

    return PerformFinalEdits(ArticleText);
}

/// <summary>
/// Decides whether an article should be left alone.
/// </summary>
/// <param name="article">Wikitext of the article.</param>
/// <param name="strict">When true, articles containing wiki tables or HTML table/div tags are skipped as well.</param>
/// <returns>True when the article must be skipped.</returns>
private bool IsSkip(string article, bool strict) {
    // Templates whose name starts with geodis, disambig or geobox are never touched.
    if (Regex.IsMatch(article, "[{][{]\\s*(geodis|disambig|geobox)", RegexOptions.IgnoreCase)) {
        return true;
    }

    if (!strict) {
        return false;
    }

    // Strict mode: table delimiters at the start of a line, or table/div tags.
    bool hasWikiTable = Regex.IsMatch(article, "[\r\n]([{][|]|[|][}])");
    bool hasHtmlBlock = Regex.IsMatch(article, "[<]\\s*(table|div)", RegexOptions.IgnoreCase);
    return hasWikiTable || hasHtmlBlock;
}

/// <summary>
/// Applies the last clean-up pass to the whole article text: rearranges the text around the
/// image_map / mapsize / map_caption parameters, normalizes a few section headings and sets
/// the map size.
/// </summary>
/// <param name="article">Full article wikitext, with the infobox already in place.</param>
/// <returns>The edited wikitext.</returns>
// NOTE(review): the pattern below defines five capture groups but the replacement also uses $6.
// The bracket-heavy tail looks like link markup that was damaged in transcription - confirm
// against the original source before relying on it.
private string PerformFinalEdits(string article) {           article = Regex.Replace(article,                "([|]image_map\\s*= )(\\s*[|]mapsize\\s*= )(\\s*[|]map_caption\\s*= )(\\s*[|][^`]*)*[|][^]|]*[|][^]|]*[|](Location[^][]*][[][^*[]][]][^][]*][[][^*[]][]][^]]*)[]][]]\\s*",                "$1$5$2%%mapsize%%$3$6$4");

// Four progressively shorter variants of the same rewrite, each leaving a %%mapsize%% placeholder
// after the mapsize parameter.
// NOTE(review): these replacements reference $7, $8 and $9 although the patterns define at most
// six groups, and the fourth pattern appears to contain one more ')' than '(' - if so, the Regex
// constructor would reject it at run time. Presumably text was lost here; verify.
article = Regex.Replace(article,               "([|]image_map\\s*= )(\\s*[|]mapsize\\s*= )(\\s*[|]map_caption\\s*= )(\\s*[|])([^`]*)*[|][^]|]*[|][^]|]*[|]([^]|]*)[]][]]\\s*",                "$1$6$7$8$2%%mapsize%%$3$9$4$5", RegexOptions.IgnoreCase); article = Regex.Replace(article,               "([|]image_map\\s*= )(\\s*[|]mapsize\\s*= )(\\s*[|]map_caption\\s*= )(\\s*[|])([^`]*)*[|][^]|]*[|]([^]|]*)[]][]]\\s*",                "$1$6$7$8$2%%mapsize%%$3$9$4$5", RegexOptions.IgnoreCase); article = Regex.Replace(article,               "([|]image_map\\s*= )(\\s*[|]mapsize\\s*= )(\\s*[|]map_caption\\s*= )(\\s*[|])([^`]*)*[|]([^]|]*)[]][]]\\s*",                "$1$6$7$8$2%%mapsize%%$3$9$4$5", RegexOptions.IgnoreCase); article = Regex.Replace(article,               "([|]image_map\\s*= )(\\s*[|]mapsize\\s*= )(\\s*[|]map_caption\\s*= )(\\s*[|])([^`]*)*)[]][]]\\s*", "$1$6$7$8$2%%mapsize%%$3$9$4$5", RegexOptions.IgnoreCase);

// Resolve the placeholder to 250x250px, then normalize headings:
// "External Links" -> "External links", "External link" -> "External links", "See Also" -> "See also".
article = Regex.Replace(article, "%%mapsize%%", "250x250px"); article = Regex.Replace(article, "(==\\s*External )L(inks\\s*==)", "$1l$2"); article = Regex.Replace(article, "(==\\s*External link)(\\s*==)", "$1s$2"); article = Regex.Replace(article, "(==\\s*See )A(lso\\s*==)", "$1a$2");

// Overwrite the value of the padded "|mapsize                 = " parameter with 250x200px, then
// blank that value again when image_map is empty (image_map is directly followed by mapsize,
// or by map_caption and then mapsize).
// The arrow is inserted after $1 and removed afterwards - presumably so the digits of the size
// are not read as part of the group number.
string mapsize = "250x200px"; article = Regex.Replace(article, "([|]mapsize                 = )[^\\s|]*(\\s*[|])", "$1→" + mapsize + "$2").Replace("→",""); article = Regex.Replace(article, "([|]image_map\\s*=\\s*[|]mapsize\\s*=\\s*)" + mapsize + "(\\s*[|])", "$1$2"); article = Regex.Replace(article, "([|]image_map\\s*=\\s*[|]map_caption\\s*=\\s*[|]mapsize\\s*=\\s*)" + mapsize + "(\\s*[|])", "$1$2"); return article; }

/// <summary>
/// Moves the first [[Image:...]] of the article into the infobox (image_skyline and
/// image_caption) when the infobox has no skyline image yet.
/// </summary>
/// <param name="article">Full article wikitext.</param>
/// <returns>
/// The edited wikitext, or the unchanged text when there is no image, the skyline is already
/// set, the image markup is unbalanced, or the image sits inside an HTML comment or a template.
/// </returns>
private string AddFirstImageToInfobox(string article) {
    if (!article.Contains("[[Image:") || Regex.IsMatch(article, "[|]\\s*image_skyline\\s*=\\s*[^\\s]+\\s*[|]")) {
        return article;
    }

    // Work on the text after "[[Image:" with brackets turned into single marker characters,
    // so nested links inside the caption can be balanced one character at a time.
    string article2 = article.Substring(article.IndexOf("[[Image:") + 8).Replace("[[", "→").Replace("]]", "←");
    string strmatch = "[[Image:";
    Regex imagereg1 = new Regex("^([^→←]*←)");
    Regex imagereg2 = new Regex("^([^→←]*→)");
    int k = 1;

    // Consume text up to the next marker until the bracket that opened the image is closed.
    while (k > 0) {
        if (imagereg1.IsMatch(article2)) {
            strmatch += imagereg1.Match(article2).Groups[1].Value;
            article2 = imagereg1.Replace(article2, "");
            k--;
        } else if (imagereg2.IsMatch(article2)) {
            strmatch += imagereg2.Match(article2).Groups[1].Value;
            article2 = imagereg2.Replace(article2, "");
            k++;
        } else {
            return article;
        }
    }
    if (!strmatch.EndsWith("←")) {
        return article;
    }

    // Exact image markup as it appears in the article. The markers must be turned back into
    // brackets here; stripping them instead would make the text impossible to find below and
    // would leave stray brackets behind.
    string strmatch0 = strmatch.Replace("→", "[[").Replace("←", "]]");

    Regex extractimage1 = new Regex("[[][[]Image:([^←|]*)[←|]");
    if (!extractimage1.IsMatch(strmatch)) {
        return article;
    }
    string i1 = extractimage1.Match(strmatch).Groups[1].Value;
    strmatch = extractimage1.Replace(strmatch, "|");

    // The caption is the last pipe-separated field; scan backwards, ignoring pipes that sit
    // inside nested links.
    string i2 = "";
    k = 0;
    int j = strmatch.Length - 2;
    while (j >= 0) {
        if (k == 0 && strmatch[j].Equals('|')) {
            break;
        }
        if (strmatch[j].Equals('←')) {
            k++;
        } else if (strmatch[j].Equals('→')) {
            k--;
        }
        i2 = strmatch[j] + i2;
        j--;
    }

    // NOTE(review): link markers are dropped from the caption rather than restored - confirm
    // that plain-text captions are intended.
    i2 = i2.Replace("→", "").Replace("←", "").Trim();
    // A "caption" that is only a layout keyword is not a caption.
    for (int i = 0; i < 5; i++) {
        i2 = Regex.Replace(i2, "^(right|left|center|none|thumb|thumbnail|border|frame|[0-9]*px|[0-9]*x[0-9]*px)$","");
    }

    if (strmatch.Length > 0 &&
        !Regex.IsMatch(article, "[<][!][-][-][^>]*" + Regex.Escape(strmatch0) + "[^>]*[-][-][>]") &&
        !Regex.IsMatch(article, "[{][{][^}]*" + Regex.Escape(strmatch0) + "[^}]*[}][}]")) {
        article = Regex.Replace(article, Regex.Escape(strmatch0) + "\\s*", "");
        article = Regex.Replace(article, "[|]\\s*imagesize\\s*=\\s*[^|]*[|]", "|");
        article = Regex.Replace(article, "([|]image_skyline\\s*= )(\\s*[|]image_caption\\s*= )(\\s*[|])",
            "$1→" + i1 + "$2→" + i2 + "\r\n|imagesize                = 250px$3").Replace("→", "");
    }

    return article;
}

/// <summary>
/// Opens the spreadsheet and builds the lookup tables the first time it is called;
/// later calls return immediately.
/// </summary>
/// <returns>True when the database is available, false when the workbook could not be opened.</returns>
private bool LoadExcelDatabase() {
    if (loadedExcel) {
        return true;
    }

    string filename = "C:\\Users\\Administrator\\Documents\\Census Place Names Test.xlsx";
    loadedExcel = OpenExcelSheet(filename);
    if (!loadedExcel) {
        return false;
    }

    parameter2idxmap = GenerateParameter2IdxMap();
    articlemap = GenerateArticleMap("BG");
    GenerateMaps();
    return true;
}

/// <summary>
/// Splits the article into the Infobox Settlement template and everything else.
/// </summary>
/// <param name="ArticleText">Full article wikitext.</param>
/// <param name="article">
/// Receives the article with the infobox replaced by the placeholder %%%Infobox%%%.
/// </param>
/// <param name="infobox">Receives the infobox template, including its braces.</param>
/// <remarks>
/// When "{{Infobox Settlement" is not found, extraction starts at the first character of the
/// text, as before.
/// </remarks>
private void ExtractInfobox(string ArticleText, out string article, out string infobox) {
    // Single-character markers make brace matching a one-character comparison.
    string text = ArticleText.Replace("}}", "←").Replace("{{", "→");

    int start = Math.Max(text.IndexOf("→Infobox Settlement", StringComparison.Ordinal), 0);

    // Walk forward from just after the opening marker until the template is balanced.
    int pos = Math.Min(start + 1, text.Length);
    int depth = 1;
    while (depth > 0 && pos < text.Length) {
        char c = text[pos];
        if (c == '→') {
            depth++;
        } else if (c == '←') {
            depth--;
        }
        pos++;
    }

    string header = text.Substring(0, start);
    string box = text.Substring(start, pos - start);
    string rest = text.Substring(pos);

    // Turn both kinds of marker back into braces in both outputs, so no template in the
    // article loses its opening braces and the infobox keeps its closing braces.
    article = (header + "%%%Infobox%%%" + rest).Replace("←", "}}").Replace("→", "{{");
    infobox = box.Replace("←", "}}").Replace("→", "{{");
}

/// <summary>
/// Normalizes the article's infobox to Infobox Settlement, adds placeholders for empty
/// parameters, fills them from the database and puts the infobox back into the article.
/// </summary>
/// <param name="ArticleText">Full article wikitext; must already contain an infobox.</param>
/// <param name="ArticleTitle">Article title used for the database lookup.</param>
/// <returns>
/// The article with the filled-in infobox, or the short message "Could not generate infobox"
/// when generation failed.
/// </returns>
private string FillInInfobox(string ArticleText, string ArticleTitle) {
    ArticleText = Regex.Replace(ArticleText, "[{][{]\\s*"+infoboxaliases, "{{Infobox Settlement");
    // Blank out generic settlement types so the database value is used instead.
    ArticleText = Regex.Replace(ArticleText, "([|]\\s*settlement_type\\s*=\\s*)(town|city|cdp|CDP|village|borough)(\\s*[|])", "$1$3");

    string article, infobox;
    ExtractInfobox(ArticleText, out article, out infobox);
    infobox = RenameDeprecatedParameters(infobox);
    infobox = AddMissingParameters(infobox);

    // Decide which parameters are empty on a copy without HTML comments, so that a
    // commented-out hint does not count as a value.
    string infoboxnocomments = Regex.Replace(infobox.Replace("<!--", "→").Replace("-->", "←"), "→[^←]*←", "");
    for (int i = 0; i < parameterlist.Count; i++) {
        string name = parameterlist[i].Trim();
        if (Regex.IsMatch(infoboxnocomments, "[|]\\s*" + name + "\\s*=\\s*[|]")) {
            infobox = Regex.Replace(infobox, "([|]\\s*" + name + "\\s*=)", "$1 %%" + parameterlist[i] + "%%");
        }
    }

    // Remove whitespace between the pipe and the parameter name, and spaces before the pipe.
    for (int i = 0; i < fullparamlist.Count; i++) {
        string escaped = Regex.Escape(fullparamlist[i].Trim());
        infobox = Regex.Replace(infobox, "[|]\\s*(" + escaped + "\\s*=)", "|$1");
        infobox = Regex.Replace(infobox, " *([|]\\s*" + escaped + "\\s*=)", "$1");
    }

    infobox = SpaceEqualSigns(infobox);

    // Empty hemisphere parameters default to N and W.
    infobox = Regex.Replace(infobox, "([|]\\s*latNS\\s*= )(\\s*[|])", "$1N$2");
    infobox = Regex.Replace(infobox, "([|]\\s*longEW\\s*= )(\\s*[|])", "$1W$2");
    infobox = Regex.Replace(infobox, "([|]\\s*latNS\\s*=)(\\s*[|])", "$1 N$2");
    infobox = Regex.Replace(infobox, "([|]\\s*longEW\\s*=)(\\s*[|])", "$1 W$2");
    infobox = Regex.Replace(infobox, "\\s*([|]\\s*mapsize\\s*=)", "\r\n$1");
    infobox = Regex.Replace(infobox, "\\s*([|]\\s*map_caption\\s*=)", "\r\n$1");

    // Only rewrite the subdivision parameters when subdivision_name3 is empty.
    if (!Regex.IsMatch(infobox, "[|]\\s*subdivision_name3\\s*=\\s*[^\\s|]+\\s*[|]")) {
        infobox = Regex.Replace(infobox, "([|]\\s*subdivision_type)([12]*)(\\s*=)\\s*[^={]*([|][^={]*[=])", "$1$2$3 %%subdivision_type$2%%\r\n$4");
        infobox = Regex.Replace(infobox, "([|]\\s*subdivision_name)([12]*)(\\s*=)\\s*[^={]*([|][^={]*[=])", "$1$2$3 %%subdivision_name$2%%\r\n$4");

        infobox = Regex.Replace(infobox, "%%subdivision_type%%", "Country");
        infobox = Regex.Replace(infobox, "%%subdivision_name%%", "United States");
        infobox = Regex.Replace(infobox, "%%subdivision_type1%%", "State");
    }

    // Exactly one space after every equal sign.
    for (int i = 0; i < fullparamlist.Count; i++) {
        infobox = Regex.Replace(infobox, "([|]\\s*" + fullparamlist[i] + "\\s*=) *", "$1 ");
    }

    infobox = GenerateInfobox(ArticleTitle, infobox);
    // GenerateInfobox returns a short message when the title is not in the database.
    if (infobox.Length < 100) {
        return "Could not generate infobox";
    }

    infobox = Regex.Replace(infobox, " *([\r\n])", "$1");
    infobox = Regex.Replace(infobox, "= *([\r\n])", "= $1");
    article = PlaceInfobox(infobox, article);
    return article;
}

/// <summary>
/// Renames every deprecated parameter found in the infobox to its current name.
/// </summary>
/// <param name="infobox">Infobox wikitext.</param>
/// <returns>The infobox with deprecated parameter names replaced.</returns>
private string RenameDeprecatedParameters(string infobox) {
    // deprecatedmap maps the old parameter name (key) to the current name (value).
    foreach (KeyValuePair<string, string> p in deprecatedmap) {
        infobox = Regex.Replace(infobox, "([|]\\s*)" + p.Key + "(\\s*=)", "$1" + p.Value + "$2");
    }
    return infobox;
}

/// <summary>
/// Makes sure the infobox contains a website parameter and every parameter marked as
/// required, inserting empty entries where they are missing.
/// </summary>
/// <param name="infobox">Infobox wikitext.</param>
/// <returns>The infobox with the missing parameters added.</returns>
private string AddMissingParameters(string infobox) {
    // Put the template name on its own line and drop two boilerplate hint comments.
    infobox = Regex.Replace(infobox, "([{][{]Infobox Settlement)\\s*", "$1\r\n");
    infobox = Regex.Replace(infobox, "[<][!][-][-]\\s*enter ZIP code, Postcode, Post code, Postal code...\\s*[-][-][>]", "");
    infobox = Regex.Replace(infobox, "[<][!][-][-]\\s*For Town or Village [(]Leave blank for the default City[)]\\s*[-][-][>]", "");

    bool hasWebsite = Regex.IsMatch(infobox,"[|]\\s*website\\s*=");
    if (!hasWebsite) {
        infobox = Regex.Replace(infobox, "([^}]*)([}][}]$)", "$1|website = %%website%% \r\n$2");
    }

    // Go through the parameters from last to first so each missing one can be inserted
    // in front of the nearest later parameter that is present.
    int total = fullparamlist.Count;
    for (int i = total - 1; i >= 0; i--) {
        string name = fullparamlist[i];
        if (!requiredmap[name]) {
            continue;
        }
        if (Regex.IsMatch(infobox, "[|]\\s*" + name + "\\s*=")) {
            continue;
        }

        // Index 0 is the fallback anchor when no later parameter is present.
        int anchor = 0;
        int j = i + 1;
        while (j < total) {
            if (Regex.IsMatch(infobox, "[|]\\s*" + fullparamlist[j] + "\\s*=")) {
                anchor = j;
                break;
            }
            j++;
        }

        infobox = Regex.Replace(infobox, "(\\s*[|]\\s*" + fullparamlist[anchor] + "\\s*=)", "\r\n|" + name + "= $1");
    }

    return infobox;
}

/// <summary>
/// Aligns the equal signs of the infobox parameters and puts every parameter on its own line.
/// </summary>
/// <param name="infobox">Infobox wikitext.</param>
/// <returns>The reformatted infobox.</returns>
private string SpaceEqualSigns(string infobox) {
    // equalsignmap gives, per parameter name, the number of spaces placed before "=".
    foreach (KeyValuePair<string, int> p in equalsignmap) {
        infobox = Regex.Replace(infobox, "([|]\\s*" + p.Key + ")\\s*=", "$1" + new string(' ', p.Value) + "=");
    }

    for (int i = 0; i < fullparamlist.Count; i++) {
        // One space after "=", and a line break before any parameter that shares a line.
        infobox = Regex.Replace(infobox, "([|]\\s*" + fullparamlist[i] + "\\s*=) *", "$1 ");
        infobox = Regex.Replace(infobox, "([^\r\n])([|]\\s*" + fullparamlist[i] + "\\s*=)", "$1\r\n$2");
    }
    return infobox;
}

/// <summary>
/// Puts the finished infobox back at the %%%Infobox%%% placeholder. Simple templates and
/// indented (colon-prefixed) lines that directly follow the placeholder are moved in front
/// of it first, so they end up above the infobox.
/// </summary>
/// <param name="infobox">Finished infobox wikitext.</param>
/// <param name="article">Article text containing the placeholder.</param>
/// <returns>The article with the placeholder replaced by the infobox.</returns>
public string PlaceInfobox(string infobox, string article) {
    Regex templateextract = new Regex("\\s*([%][%][%]Infobox[%][%][%])\\s*([{][{][^{}]*[}][}])\\s*");
    Regex indentextract  = new Regex("\\s*([%][%][%]Infobox[%][%][%])\\s*([:][^\n]*)\n\\s*");

    article = article.Trim();
    while (true) {
        if (templateextract.IsMatch(article)) {
            article = templateextract.Replace(article, "\r\n$2\r\n$1");
        } else if (indentextract.IsMatch(article)) {
            article = indentextract.Replace(article, "\r\n$2\r\n$1");
        } else {
            break;
        }
        article = article.Trim();
    }

    // The infobox is data, not a substitution pattern: double every "$" so that text such
    // as "$0" or "$1" in a value is inserted literally.
    string replacement = infobox.Trim().Replace("$", "$$") + "\r\n";
    return Regex.Replace(article, "[%][%][%]Infobox[%][%][%]\\s*", replacement);
}

/// <summary>
/// Starts Excel, opens the workbook read-only and remembers the data sheet (sheet 1) and
/// the parameter sheet (sheet 4).
/// </summary>
/// <param name="filename">Full path of the workbook.</param>
/// <returns>True when the workbook and both sheets are available, false otherwise.</returns>
private bool OpenExcelSheet(string filename) {
    excelapp = new Excel.Application();
    if (excelapp == null) {
        MessageBox.Show("ERROR: EXCEL couldn't be started!");
        return false;
    }

    try {
        workbook = excelapp.Workbooks.Open(filename, 0, true, 5,
            "", "", true, Excel.XlPlatform.xlWindows, "\t", false, false, 0, true, false, 0);
        sheets = workbook.Worksheets;
        worksheet = (Excel.Worksheet)sheets.get_Item(1);
        paramworksheet = (Excel.Worksheet)sheets.get_Item(4);
    } catch (System.Runtime.InteropServices.COMException ex) {
        // A missing file or sheet is reported to the operator instead of taking the host
        // application down; the caller treats false as "skip this article".
        MessageBox.Show("ERROR: could not open '" + filename + "': " + ex.Message);
        // Do not leave a hidden Excel process running.
        excelapp.Quit();
        excelapp = null;
        return false;
    }

    return true;
}

/// <summary>
/// Reads the header row (A3:BZ3) of the data sheet and builds the map from parameter name
/// to column index, together with the ordered parameter list.
/// </summary>
/// <returns>The map from parameter name to zero-based column index.</returns>
private Dictionary<string, int> GenerateParameter2IdxMap() {
    parameter2idxmap = new Dictionary<string, int>();
    parameterlist = new List<string>();

    Excel.Range range = worksheet.get_Range("A3", "BZ3");
    Array values = (Array)range.Cells.Value2;
    string[] strArray = ConvertToStringArray(values);

    for (int i = 0; i < strArray.Length; i++) {
        string header = strArray[i];
        // Blank cells, "N/A" columns and repeated names are ignored; the first column wins.
        if (header.Trim().Length > 0 && !header.Equals("N/A") && !parameter2idxmap.ContainsKey(header)) {
            parameter2idxmap.Add(header, i);
            parameterlist.Add(header);
        }
    }

    return parameter2idxmap;
}

/// <summary>
/// Builds the map from normalized article name to spreadsheet row, reading rows 4 to 2000
/// of the given column.
/// </summary>
/// <param name="articlecol">Letter(s) of the column that holds the article names.</param>
/// <returns>The map from normalized article name to row number.</returns>
private Dictionary<string, int> GenerateArticleMap(string articlecol) {
    articlemap = new Dictionary<string, int>();

    Excel.Range range = worksheet.get_Range(articlecol + "4", articlecol + "2000");
    Array values = (Array)range.Cells.Value2;
    string[] strArray = ConvertToStringArray(values);

    for (int i = 0; i < strArray.Length; i++) {
        if (strArray[i].Length <= 3) {
            continue;
        }
        // Same normalization as the lookup: no spaces, underscores, commas or apostrophes,
        // lower case.
        string key = Regex.Replace(strArray[i], "[ _,']","").ToLower();
        // Test the normalized key, which is what gets stored; otherwise two names that
        // normalize alike would make Add throw. The first row wins.
        if (!articlemap.ContainsKey(key)) {
            // The range starts at row 4.
            articlemap.Add(key, i + 4);
        }
    }
    return articlemap;
}

/// <summary>
/// Reads the parameter sheet (rows 2 to 136) and builds the parameter list plus the
/// required, equal-sign-spacing and deprecated-name maps.
/// </summary>
private void GenerateMaps() {
    equalsignmap = new Dictionary<string, int>();
    fullparamlist = new List<string>();
    requiredmap = new Dictionary<string, bool>();
    deprecatedmap = new Dictionary<string, string>();

    // Column A: parameter name, B: non-empty when required, C: spaces before "=",
    // D: deprecated name that should be renamed to the name in column A.
    Excel.Range range1 = paramworksheet.get_Range("A2","A136");
    Excel.Range range2 = paramworksheet.get_Range("B2","B136");
    Excel.Range range3 = paramworksheet.get_Range("C2","C136");
    Excel.Range range4 = paramworksheet.get_Range("D2","D136");
    string[] strArray1 = ConvertToStringArray((Array)range1.Cells.Value2);
    string[] strArray2 = ConvertToStringArray((Array)range2.Cells.Value2);
    string[] strArray3 = ConvertToStringArray((Array)range3.Cells.Value2);
    string[] strArray4 = ConvertToStringArray((Array)range4.Cells.Value2);

    for (int i = 0; i < strArray1.Length; i++) {
        string name = strArray1[i].Trim();
        fullparamlist.Add(name);
        requiredmap.Add(name, strArray2[i].Length > 0);
        equalsignmap.Add(name, Convert.ToInt32(strArray3[i].Trim()));
        if (strArray4[i].Length > 0) {
            deprecatedmap.Add(strArray4[i].Trim(), name);
        }
    }
}

/// <summary>
/// Replaces every %%name%% placeholder in the infobox with the value stored for the
/// article in the spreadsheet.
/// </summary>
/// <param name="articlename">Article title; normalized before the lookup.</param>
/// <param name="blankinfobox">Infobox wikitext containing %%name%% placeholders.</param>
/// <returns>
/// The filled-in infobox, or the short message "Article name not found in database" when
/// the title is unknown.
/// </returns>
private string GenerateInfobox(string articlename, string blankinfobox) {
    string result = blankinfobox;

    string articlename2 = Regex.Replace(articlename, "[ _,']", "").ToLower();
    int row;
    if (!articlemap.TryGetValue(articlename2, out row)) {
        return "Article name not found in database";
    }

    Excel.Range range = worksheet.get_Range("A" + row.ToString(), "BZ" + row.ToString());
    Array values = (Array)range.Cells.Value2;
    string[] strArray = ConvertToStringArray(values);

    Dictionary<string, string> parametervalues = new Dictionary<string, string>();
    for (int i = 0; i < parameterlist.Count; i++) {
        parametervalues[parameterlist[i]] = strArray[parameter2idxmap[parameterlist[i]]];
    }

    Regex param = new Regex("[%][%]([a-zA-Z0-9_]*)[%][%]", RegexOptions.Compiled);
    Match match = param.Match(result);
    while (match.Success) {
        string paramname = match.Groups[1].Value;

        // Placeholders without a database value are replaced by nothing.
        string paramvalue;
        if (!parametervalues.TryGetValue(paramname, out paramvalue)) {
            paramvalue = "";
        }

        // The value is data: double every "$" so it is not read as a group reference.
        string replacement = paramvalue.Trim().Replace("$", "$$");
        result = Regex.Replace(result, "[%][%]" + paramname + "[%][%]", replacement);
        match = param.Match(result);
    }

    // Keep the minutes/seconds/hemisphere coordinate parameters on the line of the
    // parameter before them.
    result = Regex.Replace(result, "\\s*[|](latm|lats|latNS|longm|longs|longEW)", " |$1");
    result = Regex.Replace(result, "[|](latd|latm|lats|latNS|longd|longm|longs|longEW)\\s*=", "|$1 =");
    result = Regex.Replace(result, " *[}][}]\\s*$", "}}");
    return result;
}

/// <summary>
/// Flattens a two-dimensional array of cell values into a string array, row by row.
/// </summary>
/// <param name="values">Two-dimensional array; its lower bounds may be non-zero.</param>
/// <returns>One string per cell; empty (null) cells become the empty string.</returns>
private string[] ConvertToStringArray(System.Array values) {
    string[] newArray = new string[values.Length];
    int index = 0;

    // Use the array's own bounds rather than assuming they start at zero.
    for (int i = values.GetLowerBound(0); i <= values.GetUpperBound(0); i++) {
        for (int j = values.GetLowerBound(1); j <= values.GetUpperBound(1); j++) {
            object cell = values.GetValue(i, j);
            newArray[index] = (cell == null) ? "" : cell.ToString();
            index++;
        }
    }
    return newArray;
}

// True once the workbook has been opened and the lookup tables have been built.
bool loadedExcel = false;

// Excel interop handles.
Excel.Workbook workbook;
Excel.Application excelapp;
Excel.Sheets sheets;
Excel.Worksheet worksheet;       // sheet 1: one row of data per place
Excel.Worksheet paramworksheet;  // sheet 4: infobox parameter definitions

// Data-sheet header name -> zero-based column index.
Dictionary<string, int> parameter2idxmap;
// Normalized article title -> row number on the data sheet.
Dictionary<string, int> articlemap;
// Data-sheet header names, in column order.
List<string> parameterlist;
// Infobox parameter name -> number of spaces placed before "=".
Dictionary<string, int> equalsignmap;
// Infobox parameter name -> whether it must be present in every infobox.
Dictionary<string, bool> requiredmap;
// All infobox parameter names, in template order.
List<string> fullparamlist;
// Deprecated parameter name -> current parameter name.
Dictionary<string, string> deprecatedmap;

// Template names that are treated as (and renamed to) Infobox Settlement.
string infoboxaliases = "(Infobox City|Infobox Settlement|Infobox Town|Infobox CDP|Infobox SmallCity|Infobox City[-]NoFlag|Infobox City NH|US City infobox|Infobox Village)";

// Text prepended to articles that have no infobox yet.
// NOTE(review): this is a single space here; presumably the blank infobox template was lost
// when the source was published - restore it before using the "Adding infobox" path.
string infoboxtemplate = @" ";
}
}