#!/usr/bin/perl
# User:Plastikspork/spider
#
# Retrieve a subdivision place hierarchy from Maplandia.
#
# Starting at $PLACEURL, the script walks the place pages via
# spider_maplandia() and writes each place title both to STDOUT and to
# "<BASETITLE>.log", one line per place, prefixed with "*" characters that
# encode the nesting depth.

use warnings;
use strict;
use Benchmark;
use LWP::UserAgent;

# Initialize
my ($mdate);

# Package globals shared with the subs defined further down in this file.
our ($DEBUG, $BASEURL);

# Set the debug level
$DEBUG = 2;

# Get the time
$mdate = localtime;

# Set the base url
$BASEURL = "http://www.maplandia.com/";

# Where should we start?
my $PLACEURL  = "http://www.maplandia.com/burma/";
my $BASETITLE = "Burma";

# OUT deliberately stays a bareword (package) filehandle: spider_maplandia()
# prints to it by name.  Use 3-arg open and fail loudly instead of silently
# spidering with nowhere to write the results.
my $logfile = $BASETITLE . ".log";
open(OUT, '>', $logfile)
    or die "Cannot open $logfile for writing: $!\n";

print "" . $BASETITLE . "\n";
print OUT "" . $BASETITLE . "\n";

spider_maplandia($PLACEURL, "*");

# Buffered write errors only surface at close time, so check it.
close(OUT)
    or die "Cannot close $logfile: $!\n";

# Retrieve a requested html page.
#
# Arguments:
#   $this_url       - URL to fetch with an HTTP GET.
#   $redirects_left - optional; how many in-page redirects may still be
#                     followed (defaults to 5).  Callers that pass only the
#                     URL keep working unchanged.
#
# Returns the response body on success, or "" when the request fails.
# Diagnostics go to STDERR (the LOG handle used previously is never opened
# anywhere in the visible source, so those messages were lost).
sub get_http {
    my ($this_url, $redirects_left) = @_;
    $redirects_left = 5 unless defined $redirects_left;

    # NOTE(review): the published source has an EMPTY redirect pattern here
    # (it looks like HTML that was stripped when the script was posted).  An
    # empty pattern either matches everything or silently reuses the last
    # successful regex, which made every successful fetch "redirect" to an
    # undefined URL.  The meta-refresh pattern below is a reconstruction of
    # the presumed intent -- confirm against the pages being spidered.
    my $EV_REDIR = qr{<meta\b[^<>]*\bhttp-equiv=["']?refresh["']?[^<>]*\bcontent=["']?\s*\d+\s*;\s*url=["']?([^"'<>\s]+)}i;

    my $useragent = LWP::UserAgent->new;
    $useragent->agent("Mozilla/5.0 (compatible; educational project)");
    $useragent->timeout(60);    # Timeout after 60 seconds

    my $http_request     = HTTP::Request->new(GET => $this_url);
    my $useragent_result = $useragent->request($http_request);

    unless ($useragent_result->is_success) {
        print STDERR "Could not get $this_url\n";
        return "";
    }

    my $content = $useragent_result->content;

    if ($content =~ $EV_REDIR) {
        my $reurl = $1;

        # Bound the recursion so a page that refreshes to itself (or a
        # redirect cycle) cannot loop forever; fall back to what we have.
        if ($redirects_left <= 0) {
            print STDERR "Too many redirects, giving up at $this_url\n";
            return $content;
        }

        # XML-escape ampersands for the log line only; the request itself
        # uses the raw URL.
        (my $xmlreurl = $reurl) =~ s/&/&amp;/g;
        print STDERR "\nRedirected to $xmlreurl\n" if ($DEBUG >= 2);

        return get_http($reurl, $redirects_left - 1);
    }

    return $content;
}

# Extract the sub-place links from one page of HTML.
#
# Takes the raw page text and returns a list of [ $href, $title ] pairs, in
# page order.  Returns an empty list when no list block is found.
#
# NOTE(review): in the published source every opening-tag alternative in
# these patterns had been stripped down to "]*>", so they could never match
# real markup.  "<ul", "<li" and "<a" are restored from their surviving
# closing tags; the generic "<div ...>" opener and the void-tag alternative
# (img/br) are guesses -- confirm against the live page markup.
sub _extract_places {
    my ($html) = @_;
    return unless defined $html && length $html;

    # Preprocess the HTML.
    $html =~ s/[\r\n]+/ /g;            # Compress into a single line (map
                                       # line breaks to a space rather than
                                       # deleting "\n", which glued tokens)
    $html =~ s/[ ]+/ /g;               # Remove redundant spacing
    $html =~ s/<\/?span[^<>]*>//gi;    # Drop span wrappers

    # A <div> whose content is nothing but list/anchor markup and text.
    my $list_block_re = qr{
        <div[^<>]*> [ ]*
        (
            (?: <ul[^<>]*> | </ul>
              | <li[^<>]*> | </li>
              | <(?:img|br)[^<>]*>
              | <a[^<>]*>  | </a>
              | [^<>]+
            )*
        )
        </div>
    }xi;

    # Take the first such block that actually contains list items, so a
    # text-only <div> earlier in the page is not mistaken for the list.
    my $blist;
    while ($html =~ /$list_block_re/g) {
        my $candidate = $1;
        if ($candidate =~ /<li/i) {
            $blist = $candidate;
            last;
        }
    }
    return unless defined $blist;

    my @places;
    for my $item ($blist =~ m{((?:<a[^<>]*>|</a>|[^<>]+)*)</li>}gi) {
        next
            unless $item =~ m{<a[^<>]*href="([^" ]*)"[ ]+title="([^"<>]*)"[^<>]*>[^<>]*</a>};
        my ($surl, $title) = ($1, $2);

        $title =~ tr/[]/()/;           # Square brackets become parentheses
        $title =~ s/\&quot;/"/g;

        push @places, [ $surl, $title ];
    }
    return @places;
}

# Recursively print the place hierarchy below a Maplandia page.
#
# Arguments:
#   $topurl - URL of the page whose sub-places should be listed.
#   $indent - prefix printed before each title; one extra "*" is prepended
#             for every level of recursion.
#   $seen   - optional hash ref of URLs already expanded; created on the
#             first call and threaded through the recursion.
#
# Each sub-place title is printed to STDOUT and to the OUT filehandle (which
# the caller must have opened), followed by a 2 second pause before the
# sub-place's own page is fetched via get_http().  Returns nothing.
sub spider_maplandia {
    my ($topurl, $indent, $seen) = @_;
    $seen = {} unless ref $seen eq 'HASH';

    # Never expand the same page twice: pages that link to each other would
    # otherwise recurse without end.
    return if $seen->{$topurl}++;

    # Grab the page
    my $in = get_http($topurl);

    # Get the list of subplaces
    for my $place (_extract_places($in)) {
        my ($surl, $title) = @{$place};

        print $indent . " " . $title . "\n";
        print OUT $indent . " " . $title . "\n";
        sleep 2;    # Be polite to the server between requests

        # Absolute links are used as-is; otherwise join onto $BASEURL
        # without doubling the slash for root-relative hrefs.
        my $child_url;
        if ($surl =~ m{\Ahttps?://}i) {
            $child_url = $surl;
        }
        else {
            (my $path = $surl) =~ s{\A/+}{};
            $child_url = $BASEURL . $path;
        }

        spider_maplandia($child_url, "*" . $indent, $seen);
    }

    return;
}