// User:Dumelow/generatestats.js

// This script generates a list of the ten shortest and ten longest articles
// which transclude a given template, calculates some statistics and plots a
// histogram of article sizes.
// To use this function add User:Dumelow/generatestats.js to your monobook.js
// then go to http://en.wikipedia.org/w/index.php?title=User:Dumelow/generatestats&action=edit
// See the talk page for documentation.
//
// Optional switches are read from the page URL: 'usetalkcategory',
// 'usetemplatecategory', 'specifynamespace', 'prosesize' and '&list'.

// ---------------------------------------------------------------------------
// Shared state. Declared explicitly (the script previously relied on implicit
// globals) and re-initialised on every run by generateStatistics().
// ---------------------------------------------------------------------------
var pagesList = [];           // keyValuePair(title, size) for every page found
var index = 0;                // number of entries stored in pagesList
var proseList = [];           // keyValuePair(title, prose size); prose mode only
var proseIndex = 0;           // number of entries stored in proseList
var articleList = [];         // page ids (talk-category mode) or template titles (template-category mode)
var template = '';
var queryURL = '';
var talkQueryURL = '';
var templateQueryURL = '';
var jobsLeft = 0;             // outstanding size queries in the two category modes
var namespace = '0';
var useTalkCategory = false;
var useTemplateCategory = false;
var specifyNamespace = false;

// Simple (key, value) record: key is a page title, value is its size.
function keyValuePair(key, value) {
  this.key = key;
  this.value = value;
}

// Comparator for Array.prototype.sort: ascending by value. Values may be
// numeric strings (XML attributes); subtraction coerces them to numbers.
function sortByValue(a, b) {
  return a.value - b.value;
}

// Returns the smallest "round" step that splits the range [min, max] into at
// most 15 intervals, or 5000 when no listed step is large enough.
function getBestScale(min, max) {
  var scales = [0.2, 0.5, 1, 2, 5, 10, 20, 25, 50, 100, 200, 250, 500, 1000, 2000, 5000];
  var val = (max - min) / 15;
  for (var i = 0; i < scales.length; i++) {
    if (scales[i] - val >= 0) {
      return scales[i];
    }
  }
  return 5000;
}

// Issues an asynchronous GET for url. On every readyState change the handler
// is called as handler(req, template, url); the third argument is the URL
// that was requested, so a handler can build continuation requests from it.
function loadXMLDocPassingTemplate(url, handler, template) {
  var req = null;
  if (window.XMLHttpRequest) {
    // branch for native XMLHttpRequest object
    req = new XMLHttpRequest();
  } else if (window.ActiveXObject) {
    // branch for IE/Windows ActiveX version
    req = new ActiveXObject("Microsoft.XMLHTTP");
  }
  if (req) {
    req.onreadystatechange = function () {
      handler(req, template, url);
    };
    req.open("GET", url, true);
    req.send("");
  }
}

// Returns true once the request has finished loading with HTTP status 200.
// A finished request with any other status is reported with an alert.
function responseIsReady(req) {
  if (req.readyState != 4) {
    return false;
  }
  if (req.status == 200) {
    return true;
  }
  alert("There was a problem retrieving the XML data:\n" + req.statusText);
  return false;
}

// Called when every size query has completed. In wiki-text mode the report is
// produced immediately; with 'prosesize' in the URL every page is fetched
// first. WARNING: that loads every page which transcludes the template --
// could be thousands of pages!
function startProcessing() {
  if (document.location.href.indexOf('prosesize') == -1 || pagesList.length === 0) {
    sortAndMakeChart();
    return;
  }
  for (var i = 0; i < pagesList.length; i++) {
    var titleURL = encodeURIComponent(pagesList[i].key.replace(/ /g, '_'));
    loadXMLDocPassingTemplate('/w/index.php?action=render&title=' + titleURL, getProseSizeFromPage, pagesList[i].key);
  }
}

// Handler for the size queries: records (title, length) for every <page> in
// the response, follows the 'geicontinue' continuation if one is present, and
// starts processing once no jobs are outstanding.
//
// NOTE(review): the body of the page loop and the lookup of the continuation
// element were lost from the source text; they are reconstructed from the
// attributes the rest of the script uses -- confirm against the API output.
// NOTE(review): continuation is read from an <embeddedin geicontinue="...">
// element; confirm the API still returns that element for these queries.
function getSizeFromAPI(req, template, requestURL) {
  if (!responseIsReady(req)) {
    return;
  }
  if (useTalkCategory || useTemplateCategory) {
    jobsLeft--;
  }
  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  for (var i = 0; i < pages.length; i++) {
    pagesList[index++] = new keyValuePair(pages[i].getAttribute('title'), pages[i].getAttribute('length'));
  }

  // Check for more pages.
  var embeddedin = response.getElementsByTagName('embeddedin');
  if (embeddedin.length > 0) {
    var geicontinue = embeddedin[0].getAttribute('geicontinue');
    if (useTalkCategory || useTemplateCategory) {
      jobsLeft++;
    }
    // Continue the request that was actually made. In template-category mode
    // queryURL alone ends in an empty 'geititle=', so it cannot be reused.
    var baseURL = (requestURL || queryURL).replace(/&geicontinue=[^&]*/, '');
    loadXMLDocPassingTemplate(baseURL + '&geicontinue=' + encodeURIComponent(geicontinue), getSizeFromAPI, template);
    return;
  }

  // If last page retrieved then start processing. This also runs when the
  // final response contained no pages, so the script cannot stall.
  if (jobsLeft == 0) {
    startProcessing();
  }
}

// Handler for the talk-page category query: collects the 'subjectid' (id of
// the associated article) of every talk page, follows 'gcmcontinue', and then
// requests the article sizes in batches.
//
// NOTE(review): the loop body was lost from the source text; reading the
// 'subjectid' attribute is inferred from 'inprop=subjectid' and the
// 'pageids=' query built in generateStatistics().
function getArticlePageFromTalkPage(req, template) {
  if (!responseIsReady(req)) {
    return;
  }
  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  for (var i = 0; i < pages.length; i++) {
    var subjectId = pages[i].getAttribute('subjectid');
    if (subjectId) {
      // Talk pages without an existing article carry no subject id.
      articleList.push(subjectId);
    }
  }

  // Check for more pages.
  var categorymembers = response.getElementsByTagName('categorymembers');
  if (categorymembers.length > 0) {
    var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
    loadXMLDocPassingTemplate(talkQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getArticlePageFromTalkPage, template);
    return;
  }

  // All pages retrieved.
  if (articleList.length === 0) {
    sortAndMakeChart();
    return;
  }
  // API limited to 50 ids per query. Every job is counted before any
  // response can arrive, because responses are delivered asynchronously.
  for (var start = 0; start < articleList.length; start += 50) {
    jobsLeft++;
    loadXMLDocPassingTemplate(queryURL + articleList.slice(start, start + 50).join('|'), getSizeFromAPI, template);
  }
}

// Handler for the template category query: collects the title of every
// template in the category, follows 'gcmcontinue', and then issues one
// embeddedin query per template.
//
// NOTE(review): the loop body was lost from the source text; reading the
// 'title' attribute is inferred from the 'geititle=' query it feeds.
function getPagesFromTemplateCategory(req, template) {
  if (!responseIsReady(req)) {
    return;
  }
  var response = req.responseXML.documentElement;
  var pages = response.getElementsByTagName('page');
  for (var i = 0; i < pages.length; i++) {
    articleList.push(pages[i].getAttribute('title'));
  }

  // Check for more pages.
  var categorymembers = response.getElementsByTagName('categorymembers');
  if (categorymembers.length > 0) {
    var gcmcontinue = categorymembers[0].getAttribute('gcmcontinue');
    loadXMLDocPassingTemplate(templateQueryURL + '&gcmcontinue=' + encodeURIComponent(gcmcontinue), getPagesFromTemplateCategory, template);
    return;
  }

  // All pages retrieved.
  if (articleList.length === 0) {
    sortAndMakeChart();
    return;
  }
  // API embeddedin query can only take one title.
  for (var j = 0; j < articleList.length; j++) {
    jobsLeft++;
    loadXMLDocPassingTemplate(queryURL + encodeURIComponent(articleList[j]), getSizeFromAPI, template);
  }
}

// Handler for a rendered page (prose mode): adds up the text length of every
// <p>...</p> paragraph after removing numeric reference markers such as [12],
// the text 'citation needed' and any HTML tags. Once all pages are measured
// the prose sizes replace the wiki-text sizes and the report is produced.
function getProseSizeFromPage(req, title) {
  if (!responseIsReady(req)) {
    return;
  }
  var response = req.responseText;
  var proseSize = 0;
  var start = response.indexOf('<p>');
  while (start > -1) {
    var stop = response.indexOf('</p>', start);
    if (stop == -1) {
      // Unterminated paragraph: count to the end of the text and stop,
      // otherwise the search would restart from the top and never finish.
      stop = response.length;
    }
    var para = response.substring(start + 3, stop);
    para = para.replace(/\[\d{1,3}\]/g, '');
    para = para.replace(/citation needed/g, '');
    para = para.replace(/(<([^>]+)>)/ig, '');
    proseSize += para.length;
    start = response.indexOf('<p>', stop);
  }
  proseList[proseIndex++] = new keyValuePair(title, proseSize);
  document.getElementById('wpTextbox1').value = 'Retrieved prose size for ' + proseIndex + ' out of ' + index + ' articles.\n To abort click the back button in your browser.';

  // If last page retrieved then start processing.
  if (proseIndex == index) {
    pagesList = proseList;
    sortAndMakeChart();
  }
}

// Formats one numbered wikitext list entry: '# Title (N kB)'.
function formatListEntry(page) {
  return '# ' + page.key + ' (' + Math.round(page.value / 1024) + ' kB)\n';
}

// Formats a bin boundary for display, removing floating-point noise such as
// 1.2000000000000002 (the smallest step used is 0.2, so one decimal suffices).
function formatBinLabel(value) {
  return String(Math.round(value * 10) / 10);
}

// Builds the histogram as EasyTimeline markup: one vertical bar per bin,
// labelled with the lower edge of the bin in kB.
//
// NOTE(review): the original timeline markup was lost from the source text
// (only '===Chart===' and the ScaleMajor/ScaleMinor fragment survived). This
// is a reconstruction -- check the rendered preview before relying on it.
function buildTimelineChart(bins, min, xScale, yMax, verticalScale) {
  // The tag is assembled from two pieces so that the wiki never sees a
  // literal timeline tag inside this script page and tries to render it.
  var openTag = '<' + 'timeline>';
  var closeTag = '<' + '/timeline>';
  var lines = [
    openTag,
    'Colors=',
    '  id:lightgrey value:gray(0.9)',
    '  id:darkgrey  value:gray(0.7)',
    '  id:sfondo value:rgb(1,1,1)',
    '  id:barra value:rgb(0.6,0.7,0.8)',
    '',
    'ImageSize  = width:' + Math.min(1600, 100 + 30 * bins.length) + ' height:300',
    'PlotArea   = left:50 bottom:50 top:30 right:30',
    'DateFormat = x.y',
    'Period     = from:0 till:' + yMax,
    'TimeAxis   = orientation:vertical',
    // verticalScale starts with a newline and carries the ScaleMajor
    // (and optionally ScaleMinor) lines.
    'AlignBars  = justify' + verticalScale,
    'BackgroundColors = canvas:sfondo',
    '',
    'BarData='
  ];
  var i;
  for (i = 0; i < bins.length; i++) {
    lines.push('  bar:b' + i + ' text:' + formatBinLabel(min + i * xScale));
  }
  lines.push('');
  lines.push('PlotData=');
  lines.push('  color:barra width:20 align:left');
  for (i = 0; i < bins.length; i++) {
    lines.push('  bar:b' + i + ' from:0 till:' + bins[i]);
  }
  lines.push(closeTag);
  return lines.join('\n');
}

// Sorts pagesList by size, writes the report (ten longest, ten shortest,
// statistics, histogram and -- with '&list' in the URL -- the full list) into
// the edit box and triggers the preview button.
function sortAndMakeChart() {
  var textbox = document.getElementById('wpTextbox1');
  var count = pagesList.length;
  if (count === 0) {
    textbox.value = 'No pages found.';
    return;
  }
  var i;

  pagesList.sort(sortByValue);

  // Get top ten and bottom ten (or fewer when fewer pages were found).
  var shown = Math.min(10, count);
  var bottomTen = '===Ten shortest articles===\n';
  for (i = 0; i < shown; i++) {
    bottomTen += formatListEntry(pagesList[i]);
  }
  pagesList.reverse();
  var topTen = '===Ten longest articles===\n';
  for (i = 0; i < shown; i++) {
    topTen += formatListEntry(pagesList[i]);
  }

  var wantList = document.location.href.indexOf('&list') != -1;
  var list = '===List of articles by size===\n';
  if (wantList) {
    for (i = 0; i < count; i++) {
      list += formatListEntry(pagesList[i]);
    }
  }

  // Get range in kB; pagesList is now in descending order.
  var max = Math.ceil(pagesList[0].value / 1024);
  var min = Math.floor(pagesList[count - 1].value / 1024);
  var xScale = getBestScale(min, max);
  max = Math.ceil(max / xScale) * xScale;
  min = Math.floor(min / xScale) * xScale;
  // Rounded because fractional steps (0.2, 0.5) can leave floating-point
  // noise; at least one bin so identical sizes still produce a chart.
  var numBins = Math.max(1, Math.round((max - min) / xScale));

  // Calculate statistics.
  var sum = 0;
  var bins = [];
  for (i = 0; i < numBins; i++) {
    bins.push(0);
  }
  for (i = 0; i < count; i++) {
    var size = Number(pagesList[i].value);
    sum += size;
    // Clamp: a size exactly on the upper edge (or float noise at the lower
    // edge) would otherwise land outside the array and produce NaN.
    var bin = Math.floor((size / 1024 - min) / xScale);
    bin = Math.min(numBins - 1, Math.max(0, bin));
    bins[bin]++;
  }
  var mean = (sum / (count * 1024)).toFixed(3);
  var middle = Math.floor(count / 2);
  var medianBytes = (count % 2 === 1)
    ? Number(pagesList[middle].value)
    : (Number(pagesList[middle - 1].value) + Number(pagesList[middle].value)) / 2;
  var median = (medianBytes / 1024).toFixed(3);
  var statistics = '===Statistics===\n*Number of articles: ' + count + '\n*Mean: ' + mean + ' kB\n*Median: ' + median + ' kB\n';

  // Calculate best vertical scale.
  var yMax = Math.max.apply(Math, bins);
  var yScale = Math.max(1, getBestScale(0, yMax));
  yMax = Math.ceil(yMax / yScale) * yScale;
  var verticalScale = '\nScaleMajor = gridcolor:darkgrey increment:' + yScale + ' start:0';
  if (Math.floor(yScale / 2) == yScale / 2) {
    // Minor grid lines only when half a major step is a whole number.
    verticalScale += '\nScaleMinor = gridcolor:lightgrey increment:' + yScale / 2 + ' start:0';
  }

  // Draw chart.
  var chart = '===Chart===\n' + buildTimelineChart(bins, min, xScale, yMax, verticalScale);

  var report = topTen + '\n' + bottomTen + '\n' + statistics + '\n' + chart;
  if (wantList) {
    report += '\n' + list;
  }
  textbox.value = report;
  document.getElementById('wpPreview').click();
}

// Upper-cases the first character of a page name.
function capitaliseFirst(name) {
  return name.substr(0, 1).toUpperCase() + name.substr(1);
}

// Entry point: resets all state, reads the mode switches from the URL,
// prompts for the template/category (and optionally the namespace) and
// issues the first API request. Does nothing if the main prompt is cancelled.
function generateStatistics() {
  pagesList = [];
  index = 0;
  proseList = [];
  proseIndex = 0;
  articleList = [];
  template = '';
  queryURL = '';
  talkQueryURL = '';
  templateQueryURL = '';
  jobsLeft = 0;
  namespace = '0';

  var href = document.location.href;
  useTalkCategory = href.indexOf('usetalkcategory') != -1;
  useTemplateCategory = href.indexOf('usetemplatecategory') != -1;
  specifyNamespace = href.indexOf('specifynamespace') != -1;

  if (specifyNamespace) {
    namespace = prompt("Enter the number of the namespace the pages are in\n (0=article, 2=User, 4=Wikipedia etc)", "");
    if (namespace === null) {
      // Prompt cancelled: fall back to the article namespace.
      namespace = '0';
    }
  }

  if (useTalkCategory) {
    template = prompt("Enter the talk page category you want to check for\n (Don't include Category:)", "");
    if (template === null) {
      return;
    }
    template = "Category:" + capitaliseFirst(template);
    talkQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(template) + '&gcmlimit=5000&gcmnamespace=1&prop=info&inprop=subjectid&format=xml';
    queryURL = '/w/api.php?action=query&prop=info&format=xml&pageids=';
    loadXMLDocPassingTemplate(talkQueryURL, getArticlePageFromTalkPage, template);
  } else if (useTemplateCategory) {
    template = prompt("Enter the template category you want to check\n (Don't include Category:)", "");
    if (template === null) {
      return;
    }
    template = "Category:" + capitaliseFirst(template);
    templateQueryURL = '/w/api.php?action=query&generator=categorymembers&gcmtitle=' + encodeURIComponent(template) + '&gcmlimit=5000&gcmnamespace=10&prop=info&format=xml';
    queryURL = '/w/api.php?action=query&generator=embeddedin&geilimit=5000&geinamespace=0&prop=info&format=xml&geititle=';
    loadXMLDocPassingTemplate(templateQueryURL, getPagesFromTemplateCategory, template);
  } else {
    template = prompt("Enter the template you want to check for\n (Don't include Template:)", "");
    if (template === null) {
      return;
    }
    template = "Template:" + capitaliseFirst(template);
    queryURL = '/w/api.php?action=query&generator=embeddedin&geititle=' + encodeURIComponent(template) + '&geilimit=5000&geinamespace=' + encodeURIComponent(namespace) + '&prop=info&format=xml';
    loadXMLDocPassingTemplate(queryURL, getSizeFromAPI, template);
  }
  document.getElementById('wpTextbox1').value = 'Started.';
}

// Runs the generator only on the dedicated page opened for editing.
function runOnGenerateStatsPage() {
  if (document.location.href.indexOf('User:Dumelow/generatestats&action=edit') != -1) {
    generateStatistics();
  }
}

// Register the start-up hook. addOnloadHook is the legacy MediaWiki helper the
// script was written for; jQuery's ready handler is used when it is absent.
if (typeof addOnloadHook === 'function') {
  addOnloadHook(runOnGenerateStatsPage);
} else if (typeof jQuery === 'function') {
  jQuery(runOnGenerateStatsPage);
}