Wikipedia:WikiProject Interlanguage Links/Scripts

Into a mysql database load the page and langlinks database dump files from two or more wikipedia Wikipedia:Database_download.

Extract interlanguage links into a single table for anaylysis

CREATE TABLE inter (    from_lang    varchar(10),     from_title   varchar(255),     to_lang      varchar(10),     to_title     varchar(255) ); USE en;

INSERT INTO inter.inter SELECT 'en', page.page_title, langlinks.ll_lang, langlinks.ll_title FROM page, langlinks WHERE page.page_id = langlinks.ll_from AND  page.page_namespace = 0;

USE de;

INSERT INTO inter.inter SELECT 'de', page.page_title, langlinks.ll_lang, langlinks.ll_title FROM page, langlinks WHERE page.page_id = langlinks.ll_from AND  page.page_namespace = 0;

... etc

Tidy up this table and index it:

update inter set from_title = replace( from_title, '_', ' ' ); update inter set to_title = replace( to_title, '_', ' ' );

// Clear nonsensical entries -- entries to nonsensical languages -- entries to blank titles -- entries to unlikely titles -- poss entries to nonexistant articles ?

ALTER TABLE inter ADD INDEX ( from_lang, from_title ); ALTER TABLE inter ADD INDEX ( to_lang, to_title );

Extract the information required, ie:

// suggested reciporcal links // a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x

INSERT INTO suggestions SELECT a.to_lang, a.to_title, a.from_lang, a.from_title FROM inter a INNER JOIN en.page p   ON p.page_title = a.to_title AND p.page_namespace = 0 AND p.page_is_redirect = 0 LEFT JOIN inter b   ON   b.from_lang = a.to_lang AND b.from_title = a.to_title AND b.to_lang = a.from_lang WHERE a.from_lang IN ('de','es','fr','it','nl','ja','pl','pt','sv' ) AND  a.to_lang = 'en' AND  b.from_lang IS NULL;

// Interlanguage links to pages that do not exist // en -> fr only INSERT INTO suggestions SELECT a.from_lang, a.from_title, a.to_lang, a.to_title FROM inter a INNER JOIN en.page f   ON f.page_title = a.from_title AND f.page_namespace = 0 AND f.page_is_redirect = 0 LEFT JOIN fr.page t   ON   t.page_title = a.to_title AND t.page_namespace = 0 WHERE a.from_lang = 'en' AND  a.to_lang = 'fr' AND  t.page_title IS NULL

// Interlanguage links to redirects // en -> fr only

SELECT a.from_lang, a.from_title, a.to_lang, a.to_title FROM inter a INNER JOIN en.page f   ON f.page_title = a.from_title AND f.page_namespace = 0 AND f.page_is_redirect = 0 INNER JOIN fr.page t   ON   a.to_title = t.page_title AND t.page_namespace = 0 AND t.page_is_redirect = 1 WHERE a.from_lang = 'en' AND  a.to_lang = 'fr';

Finally, extract these suggestions in a human-readable format

DROP PROCEDURE IF EXISTS report_suggestions; DELIMITER //

CREATE PROCEDURE report_suggestions( group_size INT, flang VARCHAR(10), tlang VARCHAR(10) ) BEGIN DECLARE sug_pos, sug_base, done INT; DECLARE ftitle, ttitle VARCHAR(255); DECLARE sug CURSOR FOR SELECT DISTINCT from_title, to_title FROM suggestions WHERE from_lang = flang AND to_lang = tlang; DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' SET done = 1;

SET sug_pos = 0; SET sug_base = 0; SET done = 0; OPEN sug;

REPEAT

FETCH sug INTO ftitle, ttitle;

IF NOT done THEN IF sug_pos = 0 THEN SELECT concat( '=== ', sug_base, ' - ', sug_base + group_size - 1, ' ===' ); SET sug_base = sug_base + group_size; SET sug_pos = group_size - 1; ELSE SET sug_pos = sug_pos - 1; END IF;

SELECT concat( '*', ftitle, ' &rarr; ', tlang, ':', ttitle, '' ); END IF;

UNTIL done END REPEAT; CLOSE sug; END; //

DELIMITER ;

call report_suggestions( 10, 'en', 'sv' );