// User:Harej/citation-watchlist-staging.js

/* Per-wiki configuration */

// Which wiki the script talks to. Both values are interpolated into the
// Action API host name below.
const LANGUAGE = 'en';
const FAMILY = 'wikipedia';

// API endpoints.
const actionApiEndpoint = `https://${LANGUAGE}.${FAMILY}.org/w/api.php`;
const restApiEndpoint = 'https://api.wikimedia.org/core/v1';

// On-wiki pages holding the Public Suffix List and the index of domain lists.
const publicSuffixList = 'Wikipedia:Citation_Watchlist/Public_Suffix_List';
const listOfLists = 'Wikipedia:Citation_Watchlist/Lists';

// Tooltip labels, one per severity level.
const msgWarning = 'Warning';
const msgCaution = 'Caution';
const msgInspect = 'Inspect';

// Indicator emoji, one per severity level.
const warnEmoji = '\u2757';
const cautionEmoji = '\u270B';
const inspectEmoji = '\uD83D\uDD0E';

// Section headings that open each severity level inside a domain list page.
const warnSectionHeader = '==Warn==';
const cautionSectionHeader = '==Caution==';
const inspectSectionHeader = '==Inspect==';

// Request pacing: a delay in milliseconds and an hourly request ceiling.
const delayMs = 50;
const maxRequestsPerHour = 400;

/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added

author: Hacks/Hackers
license: GPL 4.0
*/

// Script-wide mutable state.

// Public suffixes used to reduce a hostname to its root domain.
let publicSuffixSet = new Set();

// Domains to flag, one set per severity level.
let warnList = new Set();
let cautionList = new Set();
let inspectList = new Set();

// Timestamp (ms since epoch) of the most recent throttled request; 0 = none yet.
let lastRequestTime = 0;

// The Wikimedia REST API has a hard request limit of 500 per hour, and no clear
// way to batch these requests. As such, we need to track our requests, and to do
// so globally across the whole session (not just a single instantiation of the
// script).

// Seed the request counter the first time the script runs. The value is kept
// in localStorage (as a decimal string) rather than in a variable so that it
// outlives this particular instantiation of the script.
if (!localStorage.getItem('citationWatchlistRestApiRequestCount')) {
  localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
}

// Zero the counter every hour (3,600,000 ms) for as long as this page stays open.
setInterval(() => {
  localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
  console.log("Request count reset");
}, 3600000);

/**
 * Reads the REST API request counter stored in localStorage.
 *
 * @returns {number} The stored count, or 0 when the entry is missing or does
 *   not parse as a base-10 integer.
 */
function getRequestCount() {
  const storedValue = localStorage.getItem('citationWatchlistRestApiRequestCount');
  const count = Number.parseInt(storedValue, 10);
  return Number.isNaN(count) ? 0 : count;
}

/**
 * Adds one to the REST API request counter stored in localStorage and logs
 * the new value.
 */
function incrementRequestCount() {
  const nextCount = getRequestCount() + 1;
  // localStorage only stores strings, so persist the decimal representation.
  localStorage.setItem('citationWatchlistRestApiRequestCount', nextCount.toString());
  console.log(`Request count incremented to ${nextCount}`);
}

/**
 * Inserts an emoji marker immediately before `element`, with a tooltip that
 * names the matched domains. Each severity level is applied at most once per
 * element; a `data-processed-<level>` attribute on the element records that.
 *
 * @param {Element} element - Node the marker is placed in front of.
 * @param {string} emoji - One of warnEmoji, cautionEmoji or inspectEmoji.
 * @param {string[]} domains - Matched domains to list in the tooltip.
 * @param {string} tooltipText - Label shown before the domain list.
 */
function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  let level;
  switch (emoji) {
    case warnEmoji:
      level = 'warn';
      break;
    case cautionEmoji:
      level = 'caution';
      break;
    case inspectEmoji:
      level = 'inspect';
      break;
    default:
      console.error('Unsupported emoji type');
      return;
  }

  // Bail out if this level's marker was already added to the element.
  const markerAttribute = `data-processed-${level}`;
  if (element.getAttribute(markerAttribute) === 'true') {
    return;
  }

  const badge = document.createElement('span');
  badge.textContent = emoji + ' ';
  badge.title = tooltipText + ': ' + domains.join(', ');

  element.parentNode.insertBefore(badge, element);
  element.setAttribute(markerAttribute, 'true');
}

/**
 * Returns the query-string parameters of a link's target.
 *
 * The href is parsed as a URL first: handing a full URL straight to
 * `URLSearchParams` would fold everything up to the first `=` into the first
 * key, so whichever parameter happens to come first could not be looked up.
 *
 * @param {HTMLAnchorElement} link - Anchor whose `href` is read.
 * @returns {URLSearchParams} Parameters of the link target.
 */
function getLinkSearchParams(link) {
  return new URL(link.href, document.baseURI).searchParams;
}

/**
 * Scans the current page for change-list entries and works out which
 * revision(s) each entry refers to.
 *
 * Handles, in order of precedence per entry: a "diff" link, a "hist" link,
 * a "prev" link and a "cur" link. If no entry has any of those links, the
 * first revision of the current page is used instead.
 *
 * Note on naming: for "diff" links `oldrevision` receives the `diff` URL
 * parameter and `newrevision` the `oldid` parameter; for "prev" links
 * `oldrevision` is the row's own revision and `newrevision` its parent.
 * fetchDiffAndProcess() compares from `oldrevision` to `newrevision`.
 *
 * @returns {Promise<Array<{oldrevision: *, newrevision?: *, element: *}>>}
 *   One record per entry; `newrevision` is absent when the whole revision
 *   should be treated as new content.
 */
async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];
  const revisionIds = [];

  let linkCounter = 0;

  // Build map of previous revision IDs
  for (const container of entriesContainers) {
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    if (prevLink) {
      revisionIds.push(getLinkSearchParams(prevLink).get('oldid'));
    }
  }
  console.log(revisionIds);
  const previousRevisionMap = await fetchPreviousRevisionIds(revisionIds);

  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');

    if (diffLink) {
      // First we are checking if we are in recent changes / watchlist.
      // If a "diff" link is found, process it
      linkCounter += 1;
      const urlParams = getLinkSearchParams(diffLink);
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      // If no "diff" link is found but a "hist" link is, process the "hist" link
      linkCounter += 1;
      const urlParams = getLinkSearchParams(histLink);
      const pageID = urlParams.get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      revisions.push({
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      });
    } else if (prevLink) {
      // At this point, check if we are on a page history rather than watchlist
      linkCounter += 1;
      const urlParams = getLinkSearchParams(prevLink);
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        newrevision: previousRevisionMap[urlParams.get('oldid')],
        element: prevLink.parentNode.parentNode
      });
    } else if (curLink) {
      // No prev link means we are at the page's first revision
      // We do not actually want to compare to the current revision. We extract
      // the oldid and treat like a new page.
      linkCounter += 1;
      const urlParams = getLinkSearchParams(curLink);
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }

  // Finally, to get to this point, you are on a page history with only
  // one revision, and therefore no links of any kind. Extract first (and
  // only) revision ID from page title.
  // Only do so when there is a container to annotate: with no containers
  // there is nothing to attach an indicator to, so skip the API request.
  if (linkCounter === 0 && entriesContainers.length > 0) {
    const pageID = mw.config.get('wgArticleId');
    const firstID = await fetchFirstRevisionId(pageID);
    revisions.push({
      oldrevision: firstID,
      element: entriesContainers[0]
    });
  }

  return revisions;
}

/**
 * Creates a promise that settles after a timer of `ms` milliseconds fires.
 *
 * @param {number} ms - Timer duration handed to setTimeout.
 * @returns {Promise<void>} Resolves (with no value) when the timer fires.
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Builds an Action API request URL by appending every entry of `params` to
 * the query string of `actionApiEndpoint`.
 *
 * @param {Object} params - Query parameter names mapped to their values.
 * @returns {Promise<URL>} The assembled URL.
 */
async function buildURL(params) {
  const requestUrl = new URL(actionApiEndpoint);
  for (const [name, value] of Object.entries(params)) {
    requestUrl.searchParams.append(name, value);
  }
  return requestUrl;
}

/**
 * Reduces a hostname to its registrable ("root") domain using a set of
 * Public Suffix List rules.
 *
 * Candidates are tried from the full hostname down to the last label:
 *   - an exception rule (`!candidate`) means the candidate itself is the
 *     registrable domain;
 *   - a plain rule (`candidate`) or a wildcard rule (`*.<parent of
 *     candidate>`) means the candidate is a public suffix, so the registrable
 *     domain is the candidate plus one more label to its left.
 *
 * @param {string} hostname - Host to reduce, e.g. "www.example.co.uk".
 * @param {Set<string>} publicSuffixSet - Rules as written in the list
 *   (including any "!" and "*." prefixes).
 * @returns {string} The registrable domain. The hostname is returned
 *   unchanged when no rule matches or when the hostname is itself a public
 *   suffix (there is no label to its left to add).
 */
function getRootDomain(hostname, publicSuffixSet) {
  const labels = hostname.split('.');

  for (let i = 0; i < labels.length; i++) {
    const candidate = labels.slice(i).join('.');

    // Exception rules override wildcard rules and name the registrable
    // domain directly.
    if (publicSuffixSet.has(`!${candidate}`)) {
      return candidate;
    }

    const parent = labels.slice(i + 1).join('.');
    const isPublicSuffix =
      publicSuffixSet.has(candidate) ||
      (parent !== '' && publicSuffixSet.has(`*.${parent}`));

    if (isPublicSuffix) {
      // With i === 0 the original slice(i - 1) became slice(-1) and returned
      // only the last label; the whole hostname is the best answer there.
      return i === 0 ? hostname : labels.slice(i - 1).join('.');
    }
  }

  return hostname;
}

/**
 * Finds every http(s) URL in a piece of wikitext.
 *
 * A URL is taken to end at whitespace, `<`, `>`, `"`, or any of the wikitext
 * delimiters `|`, `[`, `]`, `{`, `}`. Without the delimiter characters a
 * template parameter such as `url=https://example.com|title=...` was captured
 * together with the following parameters, failed URL parsing and was dropped.
 *
 * @param {string} addedParts - Text to scan. Any non-string value (for
 *   example the `null` returned by a failed fetch) yields an empty list.
 * @returns {string[]} Normalised `href` of each parseable URL, in order of
 *   appearance. Candidates that fail URL parsing are logged and skipped.
 */
function extractAddedURLs(addedParts) {
  if (typeof addedParts !== 'string') {
    return [];
  }

  const urlRegex = /https?:\/\/[^\s<>"|\[\]{}]+/g;
  const candidates = addedParts.match(urlRegex) || [];
  const addedURLs = [];

  for (const candidate of candidates) {
    try {
      addedURLs.push(new URL(candidate).href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${candidate}`);
    }
  }

  return addedURLs;
}

/**
 * Sends a GET request to the MediaWiki Action API and returns the parsed
 * JSON body.
 *
 * Requests are spaced out: if fewer than `delayMs` milliseconds have passed
 * since `lastRequestTime`, the call waits for the remainder first.
 *
 * @param {Object} params - Query parameters, passed to buildURL().
 * @returns {Promise<Object>} The decoded JSON response.
 * @throws {Error} When the request fails or the HTTP status is not OK. The
 *   error is logged and then rethrown.
 */
async function fetchFromActionAPI(params) {
  const url = await buildURL(params);
  console.log(`Action API request: ${url}`);

  const now = Date.now();
  const elapsed = now - lastRequestTime;
  if (elapsed < delayMs) {
    await delay(delayMs - elapsed);
  }

  // Stamp after any wait so the next call measures from the actual send time.
  lastRequestTime = Date.now();

  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error fetching data from MediaWiki API:', error);
    throw error;
  }
}

/**
 * Downloads the raw wikitext of the on-wiki Public Suffix List page and turns
 * it into a set of rules.
 *
 * Blank lines and lines starting with `//` are ignored; every other line is
 * trimmed and stored as written.
 *
 * @returns {Promise<Set<string>>} The rules, or an empty set when the page
 *   could not be fetched (network failure or non-OK HTTP status). Failures
 *   are logged, never thrown.
 */
async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  console.log(`Raw page text request: ${pslUrl}`);
  try {
    const response = await fetch(pslUrl);
    // Without this check an HTML error page would be read as a list of suffixes.
    if (!response.ok) {
      throw new Error(`Unexpected HTTP status ${response.status}`);
    }
    const content = await response.text();
    const suffixSet = new Set();
    for (const rawLine of content.split('\n')) {
      const line = rawLine.trim();
      if (line && !line.startsWith('//')) {
        suffixSet.add(line);
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}

/**
 * Fetches one revision or revision comparison from the REST API, counting the
 * request against the hourly budget.
 *
 * If the stored request count has reached `maxRequestsPerHour`, the call
 * first waits one hour before continuing.
 *
 * @param {string} apiUrl - Full REST API URL to request.
 * @returns {Promise<*>} The response's `source` property if truthy, otherwise
 *   its `diff` property. Returns `null` when the request fails, the HTTP
 *   status is not OK, or the body is not valid JSON (the error is logged).
 */
async function fetchDiffFromAPI(apiUrl) {
  if (getRequestCount() >= maxRequestsPerHour) {
    console.warn("Request limit reached, waiting for reset...");
    await delay(3600000); // Wait for an hour if the limit is reached
  }

  incrementRequestCount();
  console.log(`Diff API request: ${apiUrl} (Request count: ${getRequestCount()})`);
  try {
    const response = await fetch(apiUrl);
    if (!response.ok) {
      throw new Error(`REST API responded with HTTP ${response.status}`);
    }
    const data = await response.json();
    return data["source"] || data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}

/**
 * For every revision record, fetches its diff (or full source), extracts the
 * URLs it introduces, and marks the record's element with an indicator for
 * each severity level whose domain list contains one of those URLs' root
 * domains.
 *
 * Each domain is reported under exactly one level, checked in the order
 * warn, caution, inspect. (Previously a repeated occurrence of a domain that
 * had already been recorded under "warn" fell through to the next level.)
 *
 * Records that lack a revision id or an element are skipped, so no REST API
 * request is spent on them.
 *
 * @param {Array<{oldrevision: *, newrevision?: *, element: *}>} revisions -
 *   Records as produced by parseWatchlist().
 * @returns {Promise<void>}
 */
async function fetchDiffAndProcess(revisions) {
  for (const revision of revisions) {
    if (revision.oldrevision === null || revision.oldrevision === undefined || !revision.element) {
      console.warn('Skipping entry without a revision ID or page element');
      continue;
    }

    let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
    if (revision.newrevision !== undefined) {
      apiUrl += `/compare/${revision.newrevision}`;
    }
    const diff = await fetchDiffFromAPI(apiUrl);
    let addedURLs = [];

    if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
      // Types 2 and 4 represent "from".
      // Types 1 and 5 represent "to".
      // Type 3 represents changes within a line. It will be harder to extract URL changes in this case.
      const fromURLs = [];
      const toURLs = [];

      for (const diffLine of diff) {
        const lineURLs = extractAddedURLs(diffLine.text);
        if (diffLine.type === 2 || diffLine.type === 4) {
          fromURLs.push(...lineURLs);
        } else if (diffLine.type === 1 || diffLine.type === 5) {
          toURLs.push(...lineURLs);
        }
      }

      // Keep only URLs on the "from" side that do not also appear on the "to" side.
      const toURLSet = new Set(toURLs);
      addedURLs = fromURLs.filter(url => !toURLSet.has(url));
    } else {
      addedURLs = extractAddedURLs(diff);
    }

    console.log(`Old revision: ${revision.oldrevision}   New revision: ${revision.newrevision}    API URL: ${apiUrl}    Revision element: ${revision.element.innerHTML}    Added URLs: ${addedURLs.join(' ')}    `);

    // Sets de-duplicate domains while preserving first-seen order.
    const matchedWarnDomains = new Set();
    const matchedCautionDomains = new Set();
    const matchedInspectDomains = new Set();

    for (const url of addedURLs) {
      const hostname = new URL(url).hostname;
      const domain = getRootDomain(hostname, publicSuffixSet);

      if (warnList.has(domain)) {
        matchedWarnDomains.add(domain);
      } else if (cautionList.has(domain)) {
        matchedCautionDomains.add(domain);
      } else if (inspectList.has(domain)) {
        matchedInspectDomains.add(domain);
      }
    }

    if (matchedWarnDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, warnEmoji, Array.from(matchedWarnDomains), msgWarning);
    }
    if (matchedCautionDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, cautionEmoji, Array.from(matchedCautionDomains), msgCaution);
    }
    if (matchedInspectDomains.size > 0) {
      prependEmojiWithTooltip(revision.element, inspectEmoji, Array.from(matchedInspectDomains), msgInspect);
    }
  }
}

/**
 * Fetches the wikitext of the given domain list pages in one Action API
 * request and sorts the listed domains into warn / caution / inspect sets.
 *
 * Within a page, a line equal (after trimming) to one of the configured
 * section headers selects the set that following entries go into. An entry
 * is a line starting with `*`; the rest of the line, trimmed, is the domain.
 * Entries before the first recognised header are ignored.
 *
 * Pages that come back without revisions (for example missing pages) are
 * skipped with a warning instead of aborting the whole load.
 *
 * @param {string[]} pageNames - Titles of the list pages.
 * @returns {Promise<{warnList: Set<string>, cautionList: Set<string>, inspectList: Set<string>}>}
 * @throws {Error} When the request fails or the response has an unexpected
 *   shape. The error is logged and then rethrown.
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageNames.join('|'), // Join all page names
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const warnDomains = new Set();
    const cautionDomains = new Set();
    const inspectDomains = new Set();

    for (const page of Object.values(pages)) {
      if (!page.revisions || page.revisions.length === 0) {
        console.warn(`Domain list page has no content, skipping: ${page.title}`);
        continue;
      }

      const content = page.revisions[0].slots.main['*'];
      let currentList = null;

      for (const line of content.split('\n')) {
        const trimmed = line.trim();
        if (trimmed === warnSectionHeader) {
          currentList = warnDomains;
        } else if (trimmed === cautionSectionHeader) {
          currentList = cautionDomains;
        } else if (trimmed === inspectSectionHeader) {
          currentList = inspectDomains;
        }

        if (line.startsWith('*') && currentList) {
          const domain = line.substring(1).trim();
          // A bare "*" carries no domain; do not store an empty string.
          if (domain) {
            currentList.add(domain);
          }
        }
      }
    }

    return {
      warnList: warnDomains,
      cautionList: cautionDomains,
      inspectList: inspectDomains
    };
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}

/**
 * Looks up the parent revision of each given revision via the Action API.
 *
 * The Action API accepts at most 50 values in `revids` for clients without
 * elevated limits, so the IDs are requested in batches of 50. An empty input
 * makes no request at all.
 *
 * @param {Array<string|number|null>} revisionIds - Revision IDs to look up;
 *   null, undefined and empty entries are ignored.
 * @returns {Promise<Object>} Map of revision ID to parent revision ID. On
 *   error the failure is logged and whatever was collected from earlier
 *   batches (an empty object if none) is returned; nothing is thrown.
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const batchSize = 50;
  const ids = revisionIds.filter(id => id !== null && id !== undefined && id !== '');
  const revisionMap = {};

  try {
    for (let start = 0; start < ids.length; start += batchSize) {
      const params = {
        action: 'query',
        prop: 'revisions',
        revids: ids.slice(start, start + batchSize).join('|'), // join all revision IDs
        rvprop: 'ids',
        format: 'json',
        origin: '*'
      };

      const data = await fetchFromActionAPI(params);
      const pages = data.query.pages;
      for (const pageId in pages) {
        const revisions = pages[pageId].revisions;
        if (revisions && revisions.length > 0) {
          for (const revision of revisions) {
            revisionMap[revision.revid] = revision.parentid;
          }
        }
      }
    }
  } catch (error) {
    console.error('Error fetching previous revision IDs:', error);
  }

  return revisionMap;
}

/**
 * Asks the Action API for the oldest revision of a page (`rvdir: 'newer'`,
 * `rvlimit: 1`) and returns that revision's ID.
 *
 * @param {string|number} pageID - Page ID sent as `pageids`.
 * @returns {Promise<*>} The `revid` of the first revision, or `null` if the
 *   request fails or no revision is returned (the error is logged).
 */
async function fetchFirstRevisionId(pageID) {
  const query = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer',
    format: 'json',
    origin: '*'
  };

  try {
    const response = await fetchFromActionAPI(query);
    const pageMap = response.query.pages;
    // Only one page was requested, so take the first (only) entry.
    const [firstKey] = Object.keys(pageMap);
    const history = pageMap[firstKey].revisions;

    if (!history || !(history.length > 0)) {
      throw new Error('No revisions found for this page.');
    }
    return history[0].revid;
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}

/**
 * Returns the titles of the domain list pages linked from the given index
 * page, using a localStorage cache that is considered fresh for four hours.
 *
 * A list entry is a line starting with "* " that contains a wikilink; the
 * link target (the part before any `|label`) is the page title.
 *
 * @param {string} pageName - Title of the index ("list of lists") page.
 * @returns {Promise<string[]>} Titles of the linked list pages.
 * @throws {Error} When the page cannot be fetched or parsed. The error is
 *   logged and then rethrown.
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000; // 4 hours in milliseconds
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);

  if (cachedData && cachedTimestamp && (now - parseInt(cachedTimestamp, 10)) < cacheExpiration) {
    // A cache entry that is not a JSON array is treated as a miss rather
    // than failing the whole script.
    try {
      const cachedTitles = JSON.parse(cachedData);
      if (Array.isArray(cachedTitles)) {
        console.log("Loaded list of lists from cache");
        return cachedTitles;
      }
    } catch (error) {
      console.warn('Ignoring unreadable cached list of lists:', error);
    }
  }

  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };

  try {
    const data = await fetchFromActionAPI(params);
    const page = data.query.pages;
    const pageId = Object.keys(page)[0];
    const content = page[pageId].revisions[0].slots.main['*'];
    const pageTitles = [];

    for (const line of content.split('\n')) {
      if (line.startsWith('* ')) {
        // Matches the first instance of [[Page Title]] or [[Page Title|label]]
        const match = line.match(/\[\[([^\]|]+)(?:\|[^\]]*)?\]\]/);
        if (match) {
          pageTitles.push(match[1].trim());
        }
      }
    }

    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}

/**
 * Entry point: loads the Public Suffix List and the domain lists, then scans
 * the page and annotates its entries.
 *
 * The run stops early (after logging) when either the Public Suffix List or
 * the domain lists cannot be loaded. Without domain lists nothing can match,
 * and every scanned revision would still spend one rate-limited REST API
 * request.
 *
 * @returns {Promise<void>}
 */
async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");

  try {
    const listPages = await fetchDomainListPages(listOfLists);
    const lists = await fetchAndOrganizeDomainLists(listPages);
    // Copy into the shared sets rather than replacing them.
    lists.warnList.forEach(domain => warnList.add(domain));
    lists.cautionList.forEach(domain => cautionList.add(domain));
    lists.inspectList.forEach(domain => inspectList.add(domain));
  } catch (error) {
    console.error('Error fetching domain lists:', error);
    return;
  }

  const watchlistRevisions = await parseWatchlist();
  await fetchDiffAndProcess(watchlistRevisions);
}

// Kick off the script. Failures are logged so that a rejected promise does
// not go unhandled.
runScript()
  .then(() => console.log('Citation Watchlist script finished executing'))
  .catch((error) => console.error('Citation Watchlist script failed:', error));