User:ImageTaggingBot/tagbot.pl

The code for ImageTaggingBot. Requires libBot.pm and Pearle.pm.


 * 1) !/usr/bin/perl


 * 1) Tagbot
 * 2) A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.
 * 1) A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.

use strict; use warnings; use lib '/home/mark/perllib'; use lib '/home/mark/Desktop/wikibots/common';

use Date::Calc qw(Month_to_Text Today); use Array::Utils; use utf8;

use Data::Dumper;

use libBot;

binmode STDOUT, ":utf8";

my $permit_interruptions = 0;  # Allow talkpage messages to stop the bot?

my ($cur_y, $cur_m, $cur_d);

my %users_notified;                            # List of users notifed. 0, undef = no; 1 = notified once; 2 = notified and second notice my %notifications;                             # List of user,image pairs, used to ensure that no user is ever notified about an image twice. my %dont_notify = ;                          # List of users to never notify my %banned_users = ;                         # List of users banned from uploading my %exempt_users = ;                         # List of users exempt from inspection

my %unknown_tags;                              # List of tags found that are not in either the "good" or "bad" list my @sourcereq_tags;                            # List of tags that provide copyright information but not source information my $sourcereq_tags; my @nosource_tags;                             # List of tags that provide both source and copyright information my $nosource_tags; my @deletion_tags;                             # List of tags that will eventually lead to the deletion of the image my $deletion_tags; my @forbidden_tags;                            # List of tags that should never be seen my $forbidden_tags; my @deprecated_tags;                           # List of tags that shouldn't be used any more my $deprecated_tags; my @nontags;                                   # List of tags that aren't copyright tags but that appear on image description pages my $nontags; my @source_tags;                               # List of tags that provide source information but not copyright status my $source_tags;

sub loadTagList {       my $filename = shift; my @list = ; open INFILE, "<", $filename; while() {               $_ =~ s/#.*//;          # Remove comments $_ =~ s/^\s*//;        # Remove leading whitespace $_ =~ s/\s*$//;        # Remove trailing whitespace push @list, $_ if($_ !~ /^\s*$/); }       close INFILE; return @list; }

sub processTagList {       my $tags = join "|", @_; $tags =~ s/\(/\\\(/g; $tags =~ s/\)/\\\)/g; $tags =~ s/\./\\\./g; $tags =~ s/\*/.*?/g; return "($tags)"; }

@sourcereq_tags = loadTagList("sourcereq.tags"); $sourcereq_tags = processTagList(@sourcereq_tags); print "Sourcereq: Loaded\n\n";

@nosource_tags = loadTagList("nosource.tags"); $nosource_tags = processTagList(@nosource_tags); print "Nosource: Loaded\n\n";

@forbidden_tags = loadTagList("forbidden.tags"); $forbidden_tags = processTagList(@forbidden_tags); print "Forbid: Loaded\n\n";

@deletion_tags = loadTagList("deletion.tags"); $deletion_tags = processTagList(@deletion_tags); print "Deletion: Loaded\n\n";

@deprecated_tags = loadTagList("deprecated.tags"); $deprecated_tags = processTagList(@deprecated_tags); print "Deprecated: Loaded\n\n";

@nontags = loadTagList("nontags.tags"); $nontags = processTagList(@nontags); print "Nontags: Loaded\n\n";

@source_tags = loadTagList("source.tags"); $source_tags = processTagList(@source_tags); print "Sourcetags: Loaded\n\n";

sub tokenSubst {       my $string = shift; my $image = shift;

$string =~ s//$image/g if(defined($image)); $string =~ s//$cur_d/g; $string =~ s//$cur_m/g; $string =~ s//$cur_y/g;

return $string; }

sub loadUserList {       my $file = shift; my %notelist; my $i = 0; Pearle::myLog(4, "File: $file\n"); open INFILE, "<", $file; while() {               my ($user, $reason); $_ =~ s/\s*#.*$//g; chomp; ($user, $reason) = $_ =~ /([^\t]*)\t+(.*)/; next if(!defined($user) or !defined($reason)); $notelist{$user} = $reason; $i++; }       close INFILE; Pearle::myLog(3, "$i notifications loaded\n"); return %notelist;

}

($cur_y, $cur_m, $cur_d) = Today(1);   # Today in GMT $cur_m = Month_to_Text($cur_m);
 * 1) Initialize

Pearle::init("<>", "<>", "tagbot.log","cookies.tagbot.txt"); Pearle::config(nullOK => 1, sanityCheck => 1, loglevel => 3, printlevel => 4); config(username => '<>');

Pearle::myLog(2, "Beginning execution\n");

%dont_notify = loadNotificationList("orphanbot.whitelist"); %banned_users = loadUserList("banneduser.list"); %exempt_users = loadUserList("exemptuser.list");

if(!Pearle::login) {       exit; }

my @articles; @articles = Pearle::getLogArticles(log => 'upload', limit => 150); splice @articles, 0, 20; Pearle::myLog(3, scalar(@articles) . " images found\n");
 * 1) Get the day's uploads
 * 1) Chop off the 20 most recent log entries

foreach my $log_entry (@articles) {       my $image = $log_entry->[0]; my $uploader = $log_entry->[1]; my $summary = $log_entry->[2] || "";

print "$image\n"; print "$uploader\n";

Pearle::myLog(2, "Processing image $image\n");

# Basic checks that can be done from the log alone

# Non-terminating check: Was the image uploaded by a blacklisted user? if($banned_users{$uploader}) {               botwarnlog("*Image $image uploaded by blacklisted user User:$uploader\n"); Pearle::myLog(3, "Upload by banned user $uploader\n"); }

# Terminating check: Is the user on the whitelist? if($exempt_users{$uploader}) {               Pearle::myLog(2, "Upload by exempt user $uploader found.\n"); next; }

# Terminating check: Is the upload a modification? if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i) {               Pearle::myLog(2, "Optimize upload found for image $image\n"); next; }       if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i) {               Pearle::myLog(2, "Tweak found for image $image\n"); next; }

# Terminating check: Is the upload a revert? if($summary =~ /Reverted to earlier revision|Reverted to version/) {               Pearle::myLog(2, "Revert upload found for image $image\n"); next; }

# Get page data my $image_data = Pearle::APIQuery(titles => $image, prop => ['templates', 'revisions'],                                         tllimit => 500,                                               # All the templates                                          rvprop => ['content'],                                        # Article body                                          meta => 'userinfo', uiprop => ['hasmsg'],                     # Check for talkpage messages                                          redirects => 1,                                               # Resolve redirects                                          );

if(!defined($image_data)) {               Pearle::myLog(1, "Server did not return an appropriate response.\n"); next; }       my $parsed_xml = Pearle::getXMLParser->XMLin($image_data, ForceArray => ['tl'] ); Pearle::myLog(4, Dumper($parsed_xml));

my $page_text = GetPageText($parsed_xml); my @templates = GetPageTemplates($parsed_xml);

# Remove non-tags from template list # TODO: Remove redlinks @templates = grep {$_ !~ /:$nontags$/i} @templates;

my $stripped_page_text = $page_text || ""; $stripped_page_text =~ s/^==.*?==//gm;         # Remove section headers $stripped_page_text =~ s/\n//g;                # Remove newlines $stripped_page_text =~ s/}//g;       # Remove template parameters $stripped_page_text =~ s/{{[^}]+}}//gi;        # Remove templates

Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n"); Pearle::myLog(4, "Stripped text: $stripped_page_text\n"); print "=============================================================================\n";

# Check for interruptions if($permit_interruptions and DoIHaveMessages($image_data)) {               Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n"); last; }

# Sanity check: Does the image still exist? if(defined($parsed_xml->{query}->{pages}->{page}->{missing})) {               Pearle::myLog(2, "Image $image has already been deleted\n"); next; }

# Sanity check: Is the image marked for deletion? if(grep {$_ =~ /:$deletion_tags$/i} @templates) {               # We don't do anything with images already marked for deletion. There are just too many corner cases and wasted-effort conditions. Pearle::myLog(2, "Deletion tag found\n"); next; }

######### Check for source, license, and tag ###################

# Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes my $has_source = undef; my $has_license = undef; my $has_tag = undef;

# Does the image lack a description page? if($page_text =~ /^\s*$/) {               Pearle::myLog(3, "Empty IDP\n"); $has_source = 0; $has_license = 0; $has_tag = 0; }

# Does the image have a source-providing template? if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates) {               Pearle::myLog(3, "Source-providing template @tags found\n"); $has_source = 1;

# Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags @templates = Array::Utils::array_diff(@templates, @tags); }

# Does the image have a self-sourcing tag? if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates) {               Pearle::myLog(3, "Self-sourcing tag @tags found\n"); $has_source = 1; $has_license = 1; $has_tag = 1; }

# Does the image have a sourcereq tag? if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates) {               Pearle::myLog(3, "Sourcereq tag @tags found\n"); $has_license = 1; $has_tag = 1; }

# Handle "Information" and "Non-free use rationale" tags if(grep {$_ =~ /:Information$/} @templates) {               Pearle::myLog(3, "Has an Information template\n"); # Remove the template from the list @templates = grep {$_ !~ /:Information$/} @templates; # Attempt to parse an "information" template if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "source" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }

if($page_text =~/\|\s*author\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "author" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }

if($page_text =~/\|\s*permission\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "permission" parameter, assume a license (but not a tag) Pearle::myLog(3, "Assuming license in \n"); $has_license = 1; }

if($page_text =~/\|\s*flickr_url\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "flickr_url" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }       }

if(grep {$_ =~ /:Non-free use rationale$/} @templates) {               Pearle::myLog(3, "Has a non-free use rationale template\n"); # Remove the template from the list @templates = grep {$_ !~ /:Non-free use rationale$/} @templates; # Attempt to parse an "non-free use rationale" template if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "source" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }

$has_license = 1;      # Assume that it's licensed as "fair use" }

if(grep {$_ =~ /:Non-free media rationale$/} @templates) {               Pearle::myLog(3, "Has a non-free use rationale template\n"); # Remove the template from the list @templates = grep {$_ !~ /:Non-free media rationale$/} @templates; # Attempt to parse an "non-free use rationale" template if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "source" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }               if($page_text =~ /\|\s*publisher\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "publisher" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }               if($page_text =~ /\|\s*owner\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "owner" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }

$has_license = 1;      # Assume that it's licensed as "fair use" }

if(grep {$_ =~ /:Non-free image data$/} @templates) {               Pearle::myLog(3, "Has a non-free image data template\n"); # Remove the template from the list @templates = grep {$_ !~ /:Non-free image data$/} @templates; # Attempt to parse an "non-free use rationale" template if($page_text =~ /\|\s*source\s*=\s*[^|}]{4,}/i) {                       # If there's a filled-in "source" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }

$has_license = 1;      # Assume that it's licensed as "fair use" }       if(grep {$_ =~ /:spoken article entry$/i} @templates) {               Pearle::myLog(3, "Has a Spoken Article template\n"); # Remove the template from the list @templates = grep {$_ !~ /:Spoken article entry$/i} @templates; # Attempt to parse if($page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i) {                       # If there's a filled-in "user_name" parameter, assume a source Pearle::myLog(3, "Assuming source in \n"); $has_source = 1; }               $has_license = 1;       # Assume that it's GFDL }

# Is the image description page lacking in tags? # This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier # We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list # here so we can say for sure that the page is untagged. if(scalar(@templates) == 0) {               Pearle::myLog(3, "No templates found\n"); $has_tag = 0; }

# Does it have source information outside of the templates? if(length($stripped_page_text) >= 7) # Page text with headers, newlines and templates stripped is at least seven bytes ("my work") {               # TODO: Better source checking if(!defined($has_source) or $has_source == 0) {                       Pearle::myLog(3, "Assuming page has source\n"); $has_source = 1; }               if(!defined($has_license) or $has_license == 0) {                       Pearle::myLog(3, "Assuming page has license\n"); $has_license = 1; }       }        else {               if(!defined($has_source)) {                       # If we still don't know if it has a source, it's safe to assume it doesn't.                        Pearle::myLog(3, "Assuming page doesn't have source\n"); $has_source = 0; }               if(!defined($has_license)) {                       # If we still don't know if it has a license, we'll assume it doesn't                        Pearle::myLog(3, "Assuming page doesn't have license information\n"); $has_license = 0; }       }        ########## Check for exceptional conditions ##########

# Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag? if(($has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates)) {               Pearle::myLog(2, "Image has deprecated tag\n"); # Mark as no-license wikilog($image, tokenSubst("\n"), "Obsolete or deprecated tag"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                       Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", "\n$image --~", "Image with obsolete or deprecated license"); }               Pearle::limit; next; }

# Terminating check: Does the image have a forbidden tag? if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates) {               # Doesn't matter what else is on the page, the image requires human handling Pearle::myLog(2, "Forbidden tag $tags[0] found on image $image\n"); botwarnlog("*Forbidden tag $tags[0] found on image $image\n"); Pearle::limit; next; }

# TODO: Terminating check: Malformed fair-use rationale

########## Process #####################################

# We've found an image with only unknown templates if(!defined($has_tag)) {               # We don't know if it has any tags or not, and so we cannot deduce the license or source status # We know an image doesn't have tags if: # * It has no templates # * or all templates are on the "nontags" list # We know an image has tags if: # * We found a tag we know about print "Has unknown tags\n";

my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates; if(scalar(@new_unknown_tags) > 0) {                       foreach my $unknown_tag (@new_unknown_tags) {                               Pearle::myLog(2, "Unknown tag  found\n"); botwarnlog("* Unknown tag $unknown_tag found\n"); $unknown_tags{$unknown_tag} = 1; }                       Pearle::limit; }       }        elsif($has_tag == 0) {               if($has_source == 0) {                       if($has_license == 0) {                               # Tag as "no source" and "no license" wikilog($image, tokenSubst("\n\n"), "Image has no source or license information"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                                       Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", tokenSubst(" --~\n", $image), "You've uploaded an image with no source or license information"); }                               Pearle::myLog(2, "No source, no license\n"); Pearle::limit; }                       else {                               # No license tag, and it either has a license or we don't know if it has a license # Tag as "no source" and "untagged" wikilog($image, tokenSubst("\n\n"), "Image has no source or license tag"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                                       Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", tokenSubst(" --~\n", $image), "You've uploaded an image with no source or license tag"); }                               Pearle::myLog(2, "No source, untagged\n"); Pearle::limit; }               }                else {                       if($has_license == 0) {                               # Tag as "no license" wikilog($image, tokenSubst("\n"), "Image has no license information"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                                       Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", tokenSubst(" --~\n", $image), "You've uploaded an image with no license information"); }                               Pearle::myLog(2, "No license\n"); Pearle::limit; }                       else {                               # Tag as "untagged" wikilog($image, tokenSubst("\n"), "Image has no license tag"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                                       Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", tokenSubst(" --~\n", $image), "You've uploaded an image with no license tag"); }                               Pearle::myLog(2, "Untagged\n"); Pearle::limit; }               }        }        else {               # If it has a tag, it has a license if($has_source == 0) {                       # Tag as "no source" wikilog($image, tokenSubst("\n"), "Image has no source information"); if(!IsNotified($uploader, undef, $image, undef, \%dont_notify)) {                               Pearle::myLog(2, "Warning user $uploader\n"); wikilog("User talk:$uploader", tokenSubst(" --~\n", $image), "You've uploaded an image with no source information"); }                       Pearle::myLog(2, "No source\n"); Pearle::limit; }               else {                       # Everything's fine Pearle::myLog(2, "Image has no problems\n"); }       }

sleep(2); }

Pearle::myLog(2, "Finished with upload set\n");