User:Philosobot/Source code/phillists/redlinks/extract all links.pl

#!/usr/bin/perl
#
# Write a list of links (red and blue) showing up in phil articles. For each
# link, memorize in how many articles it shows up.
#
# Input:  ../All_philosophy.txt and ../All_philosophers.txt, one article
#         title per line.
# Output: Links.txt, one "[[link]] -- count" line per distinct link, most
#         frequently linked first.

use strict;          # 'strict' insists that all variables be declared
use warnings;
use diagnostics;     # 'diagnostics' expands the cryptic warnings

# Undefine the input record separator: one read then returns a whole file in
# one scalar. This is intentionally left global rather than `local`, because
# the helper files required below are loaded afterwards and may rely on slurp
# mode -- TODO confirm before narrowing its scope.
undef $/;

use lib $ENV{HOME} . '/public_html/wp/modules';    # path to perl modules
use lib $ENV{HOME} . '/public_html/wp/phillists';
require 'bin/perlwikipedia_utils.pl';
require "read_from_write_to_disk.pl";
use open 'utf8';

# Read the file at $path in one go and return its contents split on newlines.
# Dies if the file cannot be opened. Blank lines in the middle of the file are
# returned as-is; the caller is expected to skip them.
sub read_article_list {
    my ($path) = @_;

    open(my $fh, '<', $path)
        or die "Cannot open '$path' for reading: $!";
    my $content = do { local $/; <$fh> };    # slurp regardless of global $/
    close($fh);

    # An empty file yields undef from the slurp; treat it as "no articles".
    return () unless defined $content;
    return split("\n", $content);
}

MAIN: {
    # The parentheses matter: the sub only comes into existence when the
    # `require` above runs, so a bareword call would be rejected by
    # 'strict subs' at compile time.
    my $Editor = wikipedia_login();

    # get the list of phil articles
    my @articles = (
        read_article_list("../All_philosophy.txt"),
        read_article_list("../All_philosophers.txt"),
    );

    my %links;        # link title => number of articles containing the link
    my $count = 0;    # articles processed so far (used by the debug line below)

    foreach my $article (@articles) {

        next if ($article =~ /^\s*$/);    # ignore empty lines
        print "--now in $article\n";

        my $text = read_from_disk_or_wikipedia($Editor, $article);

        # make a local hash containing all links which show up in this article
        my @local_links = ($text =~ /\[\[\s*(.*?)\s*[\#\|\]]/g);
        my %local_hash;
        foreach my $link (@local_links) {

            next if ($link =~ /^\s*$/);    # ignore empty links
            next if ($link =~ /:/);        # look only at links in the article namespace

            # Normalize the title: upper-case the first character and turn
            # underscores into spaces, so variants of one title are merged.
            $link =~ s/^(.)/uc($1)/eg;
            $link =~ s/_/ /g;

            $local_hash{$link}++;
        }

        # adding things to %local_hash first, and to %links later, makes
        # sure that if a link shows up many times in an article it is
        # still counted only once in %links
        foreach my $link (keys %local_hash) {
            $links{$link}++;
        }

        $count++;

        # code very useful for debugging, don't delete
        # last if ($count > 400);
    }

    # write to disk. Links which show up more often come on top; links with
    # equal counts are ordered alphabetically so the output is reproducible.
    open(my $out, '>', "Links.txt")
        or die "Cannot open 'Links.txt' for writing: $!";
    foreach my $link (sort { $links{$b} <=> $links{$a} or $a cmp $b } keys %links) {
        print {$out} "\[\[$link\]\] -- $links{$link}\n";
    }
    # Buffered write errors only surface at close, so check it.
    close($out)
        or die "Cannot close 'Links.txt': $!";
}