User:Eloquence/Wikinfo import script

Copy this from the source, not from here.

This is the first pre-release of the new import script. It runs as a web server on port 8450 and does some cool stuff:
 * 1) Filters Special:Newpages for new articles that are not "from Wikipedia"
 * 2) Does not require a cookies.txt - uses its own account. Works with Wikinfo's new login requirement
 * 3) Auto-generates diffs of new articles that exist in both Wikinfo and Wikipedia
 * 4) Strips signatures from titles when importing

Note that to do all this it has to load quite a few pages, which takes some time for a full set of 500 new pages.

To do:
 * Cache previous runs in wikinfo.db so we don't have to wait 60 seconds
 * Use Special:Export

If you want to use this in any way, make sure you first install all the modules it uses from CPAN.

# NOTE(review): this page is a wiki rendering of the script and has been
# mangled — the whole preamble was collapsed onto one line, so the
# "# best strategy" comment now swallows the $WKPREFIX.. assignments that
# follow it, and the shebang below was turned into a numbered list item.
# Copy the script from the page source, not from here.
#
# Loads the HTTP client/server stack plus Text::ParagraphDiff (CPAN) and
# URI::Escape, ties %storage to the GDBM file wikinfo.db (the planned
# result cache from the To-do list), installs a SIGINT handler that
# unties the DB cleanly, and defines the key prefixes used in %storage.
use LWP::UserAgent; use HTTP::Cookies; use HTTP::Daemon; use HTTP::Status; use HTTP::Response; use URI::Escape; use Text::ParagraphDiff; use GDBM_File ; tie %storage, 'GDBM_File', "wikinfo.db", &GDBM_WRCREAT, 0640; $SIG{INT} = \&catch_zap; # best strategy $WKPREFIX="WIKINFO_"; $WKSPREFIX="WIKINFOSIZE_"; $LCPREFIX="LASTCHECK_"; $WPPREFIX="WIKIPEDIA_"; $DIPREFIX="DIFF_";
# NOTE(review): the line below is the mangled shebang ("#!/usr/bin/perl"),
# rendered as a wiki list item; it belongs at the very top of the file.
 * 1) !/usr/bin/perl

# Shared HTTP client used for all Wikinfo/Wikipedia requests.
$browser=LWP::UserAgent->new;

# Give the client an in-memory cookie jar (replaces the old cookies.txt
# requirement), set browser-like request headers, then log the bot
# account into both Wikinfo and Wikipedia so edits are attributed to it.
# NOTE(review): the credentials here ("Testuser"/"testpass") are
# placeholders — substitute a real account before running.
$browser->cookie_jar( {} ); @ns_headers = (  'User-Agent' => 'Mozilla/4.76 [en] (Win98; U)',   'Accept' => 'image/gif, image/x-xbitmap, image/jpeg,        image/pjpeg, image/png, */*',   'Accept-Charset' => 'iso-8859-1,*,utf-8',   'Accept-Language' => 'en-US', ); $browser->post("http://www.wikinfo.org/wiki.phtml?title=Special:Userlogin&action=submit",@ns_headers,Content=>[wpName=>"Testuser",wpPassword=>"testpass",wpRemember=>"1",wpLoginAttempt=>"LOG IN"]); $browser->post("http://en.wikipedia.org/w/wiki.phtml?title=Special:Userlogin&action=submit",@ns_headers,Content=>[wpName=>"Testuser",wpPassword=>"testpass",wpRemember=>"1",wpLoginAttempt=>"LOG IN"]);

# Start the local web server on port 8450 (Reuse=>1 allows quick
# restarts) and announce its URL; @ns_headers is redefined identically —
# presumably leftover duplication from an earlier revision.
$d=new HTTP::Daemon(LocalHost=>'localhost', LocalPort => '8450', Reuse=>1); print "Please contact me at: ".$d->url. "\n"; @ns_headers = (  'User-Agent' => 'Mozilla/4.76 [en] (Win98; U)',   'Accept' => 'image/gif, image/x-xbitmap, image/jpeg,        image/pjpeg, image/png, */*',   'Accept-Charset' => 'iso-8859-1,*,utf-8',   'Accept-Language' => 'en-US', );
# NOTE(review): the two list items below are mangled code lines — the
# call that builds the new-pages report, and a (likely commented-out)
# early exit used during testing.
 * 1) get_wikinfo_new;
 * 2) exit 0;

# Main server loop: accept one connection at a time, read the request,
# and build an HTML page into $html.
# NOTE(review): the heredoc that opened the HTML template has been
# destroyed by the wiki rendering — "$html.= <" below is the truncated
# start of something like "$html .= <<HTML;", and the stray
# "Wikinfo Import Script HTML" line further down is leftover heredoc
# content (page title + terminator).  This section does not compile as
# shown; recover it from the page source.
while ($c = $d->accept) { $r = $c->get_request; my $html; $html.= <



Wikinfo Import Script HTML

# GET / : serve the new-pages overview (the table built by
# get_wikinfo_new, via another heredoc lost in rendering).
if ($r) { if ($r->method eq 'GET' and $r->url->path eq "/") {

# NOTE(review): "$html.= <content($html)" is two statements fused by the
# same heredoc loss — append template, then $re->content($html).
my $re=new HTTP::Response; $re->header("content_type"=>"text/html"); $html.= <content($html); $c->send_response($re);

# GET /<title> : treat the path as an article title and import it.
} elsif($r->method eq 'GET' and $r->url->path ne "/") {

my $re=new HTTP::Response; $re->header("content_type"=>"text/html");

# Strip the leading "/" to recover the URL-encoded title, run the
# import, and send the result page.  Anything that is not a GET gets
# 403.  NOTE(review): the final "# close connection }" comment swallows
# the while-loop's closing brace — another collapse artifact.
$page=substr($r->url->path,1); $html.=import_wikinfo($page); $html.=" "; $re->content($html); $c->send_response($re); }	 else { $c->send_error(RC_FORBIDDEN) }     }      $c = undef;  # close connection }

# Fetch Wikinfo's Special:Newpages (up to 500 entries), keep the
# articles whose edit comment is not "from Wikipedia (note changes
# here)", and return a flat list of fields per article (link parts,
# size, comment, Wikipedia title or "N/A", diff or "N/A", import link)
# for the caller to render as a table.  Loads one or two extra pages per
# candidate article, hence the slow full run mentioned at the top.
sub get_wikinfo_new {

# Pull the new-pages listing and split it into entries.
# NOTE(review): the regexes here have had their HTML tags stripped by
# the wiki rendering — m/(.*?)<\/ol>/s presumably opened with <ol...>,
# and split(//i,$1) almost certainly split on /<li>/i.  As written the
# split is a split on the empty pattern (per character) — recover the
# originals from the page source.
my $response = $browser->get(	"http://www.wikinfo.org/wiki.phtml?title=Special:Newpages&limit=500&offset=0",	@ns_headers); $response->content=~m/(.*?)<\/ol>/s; @lines=split(//i,$1); print $#lines; my @checklines;

# Parse each list entry: date, link markup, title, byte count, and the
# optional parenthesised edit comment.  NOTE(review): the empty "()"
# capture was originally the opening <a ...> tag of the article link.
foreach $line(@lines) { if($line=~m/(.*?)()(.*?)(<\/a>).*?\((.*?) bytes\)/i) { $date=$1; $linkopen=$2; $linktitle=$3; $linkclose=$4; $bytes=$5; if($line=~m/ \((.*)\)<\/em>/i) { $comment=$1; } else { $comment="";

# URL forms of the title: as-is for Wikinfo, signature-stripped for the
# Wikipedia lookup (feature 4 in the intro).
}			$wikititle=to_url($linktitle); $pediatitle=to_url(strip_sig($linktitle));

# Skip pages that are themselves imports from Wikipedia (feature 1);
# otherwise start accumulating this entry's output fields.
if(!($comment=~m/from wikipedia \(note changes here\)/i)) { push @checklines,$linkopen; push @checklines,$linktitle; push @checklines,$linkclose; push @checklines,$bytes; push @checklines,$comment;

# Probe Wikipedia's edit page for the same title; a \w match on the
# textarea contents means an article already exists there.
# NOTE(review): the opening <textarea...> of this regex was stripped too
# (compare the intact version in import_wikinfo).
$tryurl="http://en.wikipedia.org/w/wiki.phtml?title=".$pediatitle. "&action=edit"; $response=$browser->get($tryurl,@ns_headers); $response->content=~m/(.*)<\/textarea>/is; $pediasource=$1; if(($pediasource=~m/\w+/)) {

push @checklines,"$pediatitle";

# Article exists on both wikis: fetch the Wikinfo source as well and
# generate an HTML diff (feature 3) via Text::ParagraphDiff, trimming
# the diff's wrapper markup; import is not possible, so the final field
# is "N/A".  The if(1)/else scaffold looks like a debugging switch.
if(1) { $tryurl="http://www.wikinfo.org/wiki.phtml?title=". $wikititle. "&action=edit"; $response=$browser->get($tryurl,@ns_headers); $response->content=~m/(.*)<\/textarea>/is; $wikinfosource=$1; $diff=text_diff($pediasource, $wikinfosource, {string=>1, plain=>1, escape=>1}); $diff=~m/ (.*)<\/p>/si; $diff=$1; $diff=~s/ size="\+1">/>/gi; push @checklines, $diff; } else { push @checklines, "N/A"; }					push @checklines, "N/A"; # exists, no import possible

} else {

# No matching Wikipedia article: no diff, but offer a "Go!" import link
# pointing back at this server with the title as the path.
# NOTE(review): the two inline "# no Wikipedia URL" / "# no diff"
# comments have swallowed the push statements that followed them on the
# collapsed line — more rendering damage.
push @checklines, "N/A"; # no Wikipedia URL push @checklines, "N/A"; # no diff $importurl=$d->url.$wikititle; $importlink="Go!"; push @checklines,$importlink;

}

}

}	}

return @checklines; }

# Import one article from Wikinfo into Wikipedia.  Takes the URL-form
# title, fetches the Wikinfo source, appends an attribution/GFDL line,
# and posts it to Wikipedia — but only if no article exists there yet.
# Returns an HTML status fragment for the browser.
sub import_wikinfo {

my $title=shift; my $editurl="http://www.wikinfo.org/wiki.phtml?title=".$title."&action=edit"; my $viewurl="http://www.wikinfo.org/wiki.phtml?title=".$title; my $response = $browser->get($editurl,@ns_headers); my $rv; $pagetitle=to_wiki($title); $pediaurl=to_url(strip_sig($pagetitle));

# Pull the wikitext out of the edit form's textarea and unescape the
# HTML entities MediaWiki applied to it; bail out with an error page if
# the textarea is empty (page not found).
$response->content=~m/<textarea.*?>(.*)<\/textarea>/is; $source=$1; $source=~s/\&quot;/"/gi; # unescape	$source=~s/\&gt;/>/gi;	$source=~s/\&lt;/</gi;	$source=~s/\&amp;/\&/gi;	if(!($source=~m/\w+/)) {	$rv.= "The page with the specified title was not found: <A HREF='$viewurl'>$viewurl</A> (<a href='$editurl'>edit</a>)";	return $rv;	}
# NOTE(review): the two list items below are mangled debug print
# statements (originally commented out, most likely); they are not valid
# Perl as rendered here.
 * 1) 	print "Full:\n".$response->content;
 * 1) 	print "Source:\n".$source;

# Attribution footer required for a GFDL-compliant cross-wiki copy.
$source.="\n\nAdapted from the Wikinfo article [$viewurl $pagetitle], licensed under the GNU Free Documentation License.";

$rv.="Checking for duplicate of <A HREF='$viewurl'>$viewurl</A>..<P>";

# Duplicate check: Wikipedia's view page contains this fixed phrase only
# when the article does not exist.
$tryurl="http://en.wikipedia.org/wiki/".$pediaurl; $response=$browser->get($tryurl,@ns_headers); if($response->content=~m/There is currently no text in this page/) {

# No duplicate: submit the article through the edit form with an import
# summary.  (The success message is printed before the post is made.)
$rv.="Posted new article to <a href='$tryurl'>$tryurl</A>!<P>"; $wpurl="http://en.wikipedia.org/w/wiki.phtml?title=".$pediaurl."&action=submit"; $browser->post($wpurl,@ns_headers,Content=>		[ 		wpTextbox1=>$source, 		wpSave=>"Save page", 		wpSummary=>"Imported from Wikinfo via User:Eloquence/Wikinfo import script"		]); } else {

# Duplicate found: never overwrite — the user must merge manually.
# NOTE(review): missing trailing semicolon here is another artifact of
# the collapsed rendering.
$rv.="Page already exists on Wikipedia: <a href='$tryurl'>$tryurl</A>! You have to merge by hand. :-("

}

return $rv;

}

sub catch_zap { my $signame = shift; untie %storage; die "Program terminated: Received $signame"; }

sub strip_sig { my $title=shift; @names=( "Levan Urushadze", "Fred Bauder"); while ($name=shift(@names)) { $title=~s/(.*) by $name$/$1/g; }	return $title; }

sub to_url { my $title=shift; $title=~s/ /_/gi; $title=uri_escape($title); $title=~s/\'/\%27/gi; return $title; }

sub to_wiki { my $title=shift; $title=uri_unescape($title); $title=~s/\%27/\'/gi; $title=~s/_/ /gi; return $title; }