#!/usr/bin/perl -w use strict; # Check a TREC 2010 entity track ELC submission for various # common errors: # * extra fields # * multiple run tags # * missing or extraneous topics # * invalid retrieved documents # * duplicate retrieved documents in a single topic # * too many documents retrieved for a topic # * fewer than maximum allowed retrieved for a topic (warning) # Messages regarding submission are printed to an error log # Change these variable values to the directory in which the error log # should be put my $errlog_dir = "."; # If more than 25 errors, then stop processing; something drastically # wrong with the file. my $MAX_ERRORS = 25; my @topics; my $MAX_RET = 100; my %docnos; # hash of all valid docnos my %numret; # number of docs retri6eved per topic my %rec; # fields in current record my $results_file; # input file to be checked my $errlog; # file name of error log my ($q0warn, $num_errors); # flags for errors detected my $d; # current docid my $line; # current input line my ($topic_string,$q0,$docno,$rank,$sim,$tag,$name, $rest); my $line_num; # current input line number my ($topic, $old_topic, $state); my $what; my $run_id; my $found; my ($i,$t,$col1,$col2,$last_i); my $usage = "Usage: $0 resultsfile\n"; $results_file = shift @ARGV or die $usage; # Initialize data structures used in checks @topics = (1, 2, 3, 4, 5, 6, 7, 11, 12, 15, 16, 17, 19, 20); # number retrived per topic foreach $t (@topics) { $numret{$t} = 0; } open RESULTS, "<$results_file" || die "Unable to open results file $results_file: $!\n"; my @path = split "/", $results_file; my $base = pop @path; $errlog = $errlog_dir . "/" . $base . ".errlog"; open ERRLOG, ">$errlog" || die "Cannot open error log for writing\n"; open INPUT, ">input" || die "Cannot create `input' file: $!\n"; $q0warn = 0; $num_errors = 0; $line_num = 0; $old_topic = "-1"; $run_id = ""; while ($line = ) { chomp $line; next if ($line =~ /^\s*$/); undef $tag; my @fields = split " ", $line; $line_num++; if (scalar(@fields) == 7) { ($topic_string, $q0, $docno, $rank, $sim, $tag, $name) = @fields; } else { &error("Wrong number of fields"); exit 255; } # make sure runtag is ok if (! $run_id) { # first line --- remember tag $run_id = $tag; if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) { &error("Run tag `$run_id' is malformed"); next; } } else { # otherwise just make sure one tag used if ($tag ne $run_id) { &error("Run tag inconsistent (`$tag' and `$run_id')"); next; } } $topic = $topic_string; $topic =~ s/^0*//; if (!exists($numret{$topic})) { &error("Unknown topic ($topic)"); $topic = 0; next; } if ($topic != $old_topic) { $old_topic = $topic; %docnos = (); } # make sure second field is "Q0" if ($q0 ne "Q0" && ! $q0warn) { $q0warn = 1; &error("Field 2 is `$q0' not `Q0'"); } # make sure DOCNO known and not duplicated if (validateUrl($docno)) { if (exists $docnos{$docno}) { &error("Document `$docno' retrieved more than once for topic $topic"); next; } $docnos{$docno} = $topic; } else { # invalid DOCNO &error("`$docno' is an invalid URI"); next; } # remove leading 0's from rank (but keep final 0!) $rank =~ s/^0*//; if (! $rank) { $rank = "0"; } $numret{$topic}++; print INPUT "$topic Q0 $docno $rank $sim $run_id $name\n"; } # Do global checks: # error if some topic has no (or too many) documents retrieved for it # warning if too few documents retrieved for a topic foreach $t (@topics) { if ($numret{$t} == 0) { &error("No documents retrieved for topic $t"); } elsif ($numret{$t} > $MAX_RET) { &error("Too many documents ($numret{$t}) retrieved for topic $t"); } elsif ($numret{$t} < $MAX_RET) { print ERRLOG "$0 of $results_file: WARNING: only $numret{$t} documents retrieved for topic $t\n" } } close INPUT || die "Close failed for input: $!\n"; print ERRLOG "Finished processing $results_file\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; if ($num_errors) { exit 255; } exit 0; # print error message, keeping track of total number of errors # line numbers refer to SORTED file since that is the actual input file sub error { my ($msg_string, $at_line) = @_; $at_line = $line_num unless defined $at_line; print ERRLOG "$0 of $results_file: Error on line $at_line --- $msg_string\n"; $num_errors++; if ($num_errors > $MAX_ERRORS) { print ERRLOG "$0 of $results_file: Quit. Too many errors!\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; exit 255; } } sub validateUrl { my($strUrl) = shift; return $strUrl =~ m!(http:|https:|ftp:)//([A-z\d]+)\:([A-z\d]+)\@([A-z\d\-\.]+\.)+[A-z]!i || $strUrl =~ m!^(http:|https:|ftp:)//([A-z\d\-\.]+\.)+[A-z]!i || $strUrl=~ m!^(http:|https:|ftp:)//(\d){1,3}\.(\d){1,3}\.(\d){1,3}\.(\d){1,3}!i ? 1 : 0; }