Wikipedia:Bots/Requests for approval/BareRefBot/Code2

/**
 * Insert a processed URL and its extracted metadata into the "web" table.
 * Uses a parameterized query ($1..$5) so values are escaped by the driver.
 *
 * @param {string} url        The original URL encountered in the bare ref.
 * @param {string} title      Page <title> text extracted by getAndTagTitle().
 * @param {string} [work]     Hostname of the URL (the CS1 |work= value).
 * @param {string} [metatitle] Title as stored for |title= upgrade (1/28/2022,
 *                             see bot page discussion).
 * @param {boolean} [isdead]  True when the page returned HTTP 404/410.
 * @returns the driver's query result/promise, so callers may await it.
 */
function uploadentry(url, title, work = "", metatitle = "", isdead = false) {
  const inquery =
    "INSERT INTO web (url, title, work, metatitle, isdead) VALUES ($1,$2,$3,$4,$5)";
  return sql.query(inquery, [url, title, work, metatitle, isdead]);
}

/**
 * Fetch `geturl` in a headless browser, read its <title>, and record the
 * result in the database via uploadentry(). Marks the entry dead on 404/410.
 *
 * Fixes vs. the original: the function is now `async` (it used `await`
 * without being async), launch()/newPage()/title() are actually invoked,
 * the goto() response is captured into `result`, the `=>`-instead-of->=
 * comparisons are corrected, the browser is always closed, and a 404/410
 * (dead link) is now uploaded instead of being discarded by the 4xx
 * early-return that previously ran first.
 *
 * NOTE(review): assumes a Playwright-style API (webkit.launch() -> browser,
 * page.goto() -> response with .status()) — confirm against the bot's deps.
 *
 * @param {string} geturl URL to visit and catalogue.
 */
async function getAndTagTitle(geturl) {
  let isDead = false;
  // Webkit (Safari engine) is relatively fast, light on memory / processing
  // power, and works on all of the major operating systems.
  const browser = await webkit.launch();
  try {
    const page = await browser.newPage(); // Open up a new page.
    const result = await page.goto(geturl); // Go to the page; capture response.
    const statusnum = result.status(); // HTTP status code.
    // https://developer.mozilla.org/en-US/docs/Web/HTTP/Status for more info.
    if (statusnum === 404 || statusnum === 410) {
      isDead = true; // Gone / not found: record the link as dead below.
    } else if (statusnum >= 400 && statusnum < 600) {
      // Some other 4xx/5xx error. Do nothing; when coming across the URL
      // later there will be no match and the url will be left alone.
      return;
    }
    const title = await page.title();
    // If the website is "www.website.notexist/ffsdf", |work= will be
    // "www.website.notexist".
    const work = new URL(geturl).hostname;
    // metatitle upgrade 1/28/2022, see bot page discussion.
    await uploadentry(geturl, title, work, title, isDead);
  } finally {
    await browser.close(); // Always release the browser (original leaked it).
  }
}

/**
 * Walk the parsed token list of a single <ref>...</ref> element looking for
 * bare external URLs, and kick off a title lookup for each fixable one.
 *
 * Fixes vs. the original: loop variables are properly declared (they were
 * implicit globals), the un-invoked method references (`toString`, `trim`,
 * `toLowerCase`) are actually called — `refobj2.trim != ""` was always true,
 * which made every ref be skipped — and the undefined `obj` in the recursion
 * branch is read as the evident `refobj`.
 *
 * @param {Array|object} refitem CeL wikitext parser token list (array-like,
 *                               may contain nested token lists).
 */
function traverse(refitem) {
  let traversedcount = -1;
  for (const refobj of refitem) {
    // Count of objects traversed (index of the current child token).
    traversedcount += 1;
    if (typeof refobj === "string") {
      // This is a recursive function, so sometimes it is called on a string.
      // A string cannot be iterated further; we have gone one level too
      // deep, so step back out.
      return;
    }
    if (refobj.type === "url" && refobj.is_bare === true) {
      let usethisurl = refobj[0].toString();
      // Skip archives/social media: these should either be in archive-url
      // (out of scope) or the fixes for them are not integrated yet.
      if (
        usethisurl.includes("archive.") || // archive services (note the dot)
        usethisurl.includes("webcit") || // WebCite
        usethisurl.includes("youtube.com") ||
        usethisurl.includes("twitter.com") ||
        usethisurl.includes("facebook.com") ||
        usethisurl.includes("instagram.com")
      ) {
        continue;
      }
      // Iterate through the whole ref again to check for undesirables.
      let shoulddo = true;
      for (const refobj2 of refitem) {
        if (typeof refobj2 === "string" && refobj2.trim() !== "") {
          // Non-whitespace text mixed with the URL: not a plain bare ref,
          // so it is not something that should be filled. Leave it alone.
          shoulddo = false;
          break;
        }
        if (
          refobj2.type === "transclusion" &&
          refobj2.name.toLowerCase() !== "bare url inline"
        ) {
          // Some unrecognized template inside the ref — might be out of
          // scope, so skip the whole ref.
          shoulddo = false;
          break;
        }
      }
      if (!shoulddo) {
        continue;
      }
      usethisurl = usethisurl.replaceAll("|", "%7C"); // escape pipes for CS1
      // NOTE(review): `parsethis` is not declared in this chunk — it is
      // presumably a global defined elsewhere in the bot; confirm.
      parsethis = parsethis + " |url=" + usethisurl;
      if (usethisurl.includes(".pdf")) {
        continue; // PDFs have no usable <title>; record the URL only.
      }
      getAndTagTitle(usethisurl); // fire-and-forget lookup + DB insert
    }
    if (refobj.type === "tag_inner") {
      // Recurse to deal with nested refs and other parser strangeness.
      traverse(refobj[traversedcount]);
    }
  }
}

/**
 * Entry point: read a wikitext dump from `filename`, parse it with CeL, and
 * process every <ref>'s inner contents for bare URLs.
 *
 * Fixes vs. the original: `toString()` and `parse()` are actually invoked,
 * `parsed_data` is no longer an implicit global, and the undefined second
 * argument (`datetype`) passed to one-parameter traverse() is removed.
 *
 * @param {string} filename Path of the file containing the page wikitext.
 */
function main(filename) {
  const wikitxt = fs.readFileSync(filename).toString();
  const page_data = CeL.net.wiki.parser(wikitxt);
  const parsed_data = page_data.parse();
  parsed_data.each("tag_inner", function refprocess(token, index, parent) {
    if (!parent || parent.tag !== "ref") {
      // We don't want to convert non-ref bares (e.g. stray URLs and
      // external-link sections).
      return;
    }
    traverse(token);
  });
}