# Wikipedia:Duplicated sections/script

# Hot pipes: enable autoflush on the currently selected output handle
# (STDOUT unless select() has changed it) so output is written immediately
# instead of sitting in a buffer.
$| = 1;


# This script expects entries.txt to be a database dump that has been
# pre-processed to put each page on a line by itself.


# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes.  Not using the dupHeaders filter will
# probably cause it to take about 5 hours or more.


# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org.  It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.

use strict;

# Run the scan.  The parentheses are required: sub main is defined further
# down the file, so under "use strict" a bare "main;" is rejected as a
# bareword instead of being compiled as a subroutine call.
main();

# main - report pages that repeat a section header, together with a measure
# of how much of their text is duplicated.
#
# Reads entries.txt one line (one page) at a time.  Each line is evaluated
# as the right-hand side of a Perl list assignment yielding
# (id, namespace, title, text, ...) -- presumably a SQL-style tuple from the
# database dump; TODO confirm the exact dump format.
#
# Pages for which dupHeaders() returns 1 are split into whitespace-separated
# words; every run of three consecutive words ("triplet") is counted, and if
# any triplet occurs more than once a line of the form
#
#   * [[Prefix:Title]] P% repeated - R out of T triplets
#
# is written to todo/duplicate-chunks.txt, where R is the number of distinct
# triplets seen more than once and T is the total number of triplets.
#
# Takes no arguments.  Dies if ./todo cannot be created or if either file
# cannot be opened or the report cannot be closed cleanly.
sub main {
    # Title prefix for each namespace number.  Image and Category carry a
    # leading colon so that the emitted wiki link points at the page rather
    # than embedding the image or categorising the report page.
    my %prefix_for = (
        -2 => 'Media:',
        -1 => 'Special:',
         0 => '',
         1 => 'Talk:',
         2 => 'User:',
         3 => 'User_talk:',
         4 => 'Wikipedia:',
         5 => 'Wikipedia_talk:',
         6 => ':Image:',
         7 => 'Image_talk:',
         8 => 'MediaWiki:',
         9 => 'MediaWiki_talk:',
        10 => 'Template:',
        11 => 'Template_talk:',
        12 => 'Help:',
        13 => 'Help_talk:',
        14 => ':Category:',
        15 => 'Category_talk:',
    );

    unless (-d './todo') {
        mkdir('./todo') or die "Cannot create ./todo: $!";
    }

    open(my $entries, '<', 'entries.txt')
        or die "Cannot read entries.txt: $!";
    open(my $report, '>', 'todo/duplicate-chunks.txt')
        or die "Cannot write todo/duplicate-chunks.txt: $!";

    my $entries_read = 0;

    while (my $line = <$entries>) {
        # Progress counter, rewritten in place on the terminal.
        if (++$entries_read % 100 == 0) {
            print STDERR $entries_read . "\r";
        }

        # NOTE(review): this string eval executes the dump line as Perl
        # code.  Only run the script on dumps you trust; replacing it with
        # a real tuple parser would be safer.
        my @tokens;
        eval("\@tokens = $line");
        if ($@) {
            # Skip the line instead of carrying on with stale or empty
            # fields.
            warn "Skipping entry $entries_read: cannot parse: $@";
            next;
        }

        my ($cur_namespace, $cur_title, $cur_text) = @tokens[1 .. 3];
        next unless defined $cur_title && defined $cur_text;

        # Cheap filter: only pages with a repeated header get the
        # expensive triplet analysis.
        next unless dupHeaders($cur_text) == 1;

        # Unknown namespaces get an empty prefix rather than whatever
        # prefix the previous page happened to use.
        my $prefix = '';
        if (defined $cur_namespace && exists $prefix_for{$cur_namespace}) {
            $prefix = $prefix_for{$cur_namespace};
        }

        # Remove leading and trailing 's.
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;
        # Remove leading and trailing whitespace.
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        # Treat literal backslash-n sequences as word breaks and collapse
        # runs of whitespace.
        $cur_text =~ s/\\n/ /g;
        $cur_text =~ s/\s+/ /g;

        my %count_of;
        my $triplets = 0;
        my @chunks   = split(' ', $cur_text);

        # Walk backwards through the words.  Popping from the rear is far
        # cheaper than shifting from the front.  ">= 3" makes sure the
        # triplet formed by the first three words is counted as well.
        while (@chunks >= 3) {
            my $chain = join(' ', @chunks[-1, -2, -3]);
            $count_of{$chain}++;
            pop(@chunks);
            $triplets++;
        }

        # Debugging aid:
        #   print {$report} $count_of{$_}.": ".$_."\n" for keys %count_of;

        my $repeated = grep { $count_of{$_} > 1 } keys %count_of;
        next unless $repeated > 0;

        my $percent = int(($repeated / $triplets) * 100);
        print {$report} "* [[" . $prefix . $cur_title . "]]",
            " ${percent}% repeated - $repeated out of $triplets triplets\n";
    }

    close($entries);
    # Buffered write errors only surface at close, so check it.
    close($report)
        or die "Cannot finish writing todo/duplicate-chunks.txt: $!";

    return;
}

# dupHeaders($text) - tell whether a page's text repeats a header line.
#
# $text is the page text; line breaks may be real newlines or the literal
# two-character sequence backslash-n, which is converted to a newline first.
# A header is any line whose first non-whitespace character is "=".
# Header lines are compared as whole strings, so "== A ==" and "==A==" are
# considered different.
#
# Returns 1 if some header line occurs more than once, 0 otherwise
# (including for undefined or empty text).  The caller's string is not
# modified.
sub dupHeaders {
    my ($text) = @_;

    # No "=" anywhere means no headers, hence no duplicate headers; this
    # avoids the line-by-line scan for such pages.
    return 0 unless defined $text && $text =~ m/=/;

    $text =~ s/\\n/\n/g;

    my %seen;
    foreach my $line (split(/\n/, $text)) {
        next unless $line =~ m/^\s*\=/;

        # The second sighting of a header is enough; stop scanning.
        return 1 if ++$seen{$line} > 1;
    }

    # Didn't return, so must not have found any duplicate headers.
    return 0;
}

# Sort the report numerically, largest first, on its third
# whitespace-separated field.  That field is the "N%" figure as long as the
# page title contains no whitespace.  The command's output is redirected to
# a file, so there is nothing to print; instead check the exit status so a
# failed sort does not go unnoticed.
system('sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt') == 0
    or warn "Sorting todo/duplicate-chunks.txt failed (wait status $?)\n";