#!/usr/bin/perl # Inter-Annotator Agreement # # This scripts takes several .anvil files of the same # timeline as arugments and calculates the amount of # agreement between them. # # Craig Martell # cmartell@gradient.cis.upenn.edu # # FORM Project, Linguistic Data Consortium # University of Pennsylvania # # http://www.ldc.upenn.edu/Projects/FORM/ # use Graph; $timeFormat = "%.0f"; ### This is to easily adjust the time-stamp tolerance. $timeIgnore = 1; $timeStart = 73; $timeLimit = 75; ### This is to insure that all files are read to the same point. ### Set to the ceiling of the highest time stamp of the shortest ### file. ### This reads in the files and create the annotation graphs for each annotator. $graphCounter = 0; foreach $file (@ARGV){ open (FILE, "./$file"); $graphCounter++; $annotationGraph{$graphCounter} = Graph->new(); while () { $line = $_; ### The following extracts the trackname, which will be part of the edge name if ($line =~ /track name/) { ($trackName) = ($line =~ /track name="(.*?)"/); $trackName =~ s/ //g; ($highLevel,$midLevel,$lowLevel) = (split(/\./, $trackName)); } ### This sets the nodes if ($line =~ //); $start = sprintf("$timeFormat",$start); $end = sprintf("$timeFormat",$end); if ($start >= $timeStart && $end <= $timeLimit){ if ($timeIgnore) { $start = 0; $end = 0; } $annotationGraph{$graphCounter}->add_edge( "$start", "$end"); } } ### This sets the names of the edges if ($line =~ /(.*?)set_attribute($totalAttribute,$start,$end,$value); } } close (FILE); } ### This creates the labeled-edge hashes for each file $arcCounter = 0; foreach $graph (keys %annotationGraph){ my $u, $v; my @vertices = sort {$a <=> $b} $annotationGraph{$graph}->vertices; print $#vertices+1 . "\n"; foreach $u (@vertices){ my @successors = $annotationGraph{$graph}->successors($u); undef %singular; foreach $v (@successors){ if (!$singular{$v}){ $singular{$v} = 1; %attributes = $annotationGraph{$graph}->get_attributes($u, $v); foreach $totalAttribute (sort keys %attributes){ $arcKey = "$u" . "::$v" . "::$totalAttribute" . "::$attributes{$totalAttribute}"; # print "$arcKey\n"; $arcCounter++; $graphList[$graph]{$arcKey}++; } } } } } print "\nTotal number of arcs: $arcCounter\n"; ### Find the intersection and remove it from each graph foreach $key (sort {$a <=> $b } keys %{$graphList[1]}){ undef $keyNotFound; for (my $i=2;$i<=$graphCounter;$i++){ if (!exists $graphList[$i]{$key}){ $keyNotFound = 1; } } if (!$keyNotFound){ push @intersection, $key; } } foreach $key (@intersection) { for (my $i=1;$i<=$graphCounter;$i++){ undef $graphList[$i]{$key}; } } $intersectionPercent = (($graphCounter*($#intersection + 1))/$arcCounter)*100; $intersectionPercent = sprintf("%.2f", $intersectionPercent); print "Exact Agreement $intersectionPercent\%\n" . "=================================================================================\n"; #foreach $member (@intersection){print "$member\n";} ### Now find and remove the intersection if we drop the value of attribute. ### This checks to see if we are getting the same properties, and ### times, and are just disagreeing on the value. for (my $i=1;$i<=$graphCounter;$i++){ foreach $key (keys %{$graphList[$i]}){ $key =~ s/^(.*)::.*?$/$1/; $graphListNoValue[$i]{$key}++; } } foreach $key (sort {$a <=> $b} keys %{$graphListNoValue[1]}){ undef $keyNotFound; for (my $i=2;$i<=$graphCounter;$i++){ if (!exists $graphListNoValue[$i]{$key}){ $keyNotFound = 1; } } if (!$keyNotFound){ push @intersectionNoValue, $key; } } foreach $key (@intersectionNoValue) { for (my $i=1;$i<=$graphCounter;$i++){ undef $graphList[$i]{$key}; } } $intersectionNoValuePercent = (($graphCounter*($#intersectionNoValue + 1))/$arcCounter)*100; $intersectionNoValuePercent = sprintf("%.2f", $intersectionNoValuePercent); print "\nNo-Value Agreement $intersectionNoValuePercent\%\n" . "=================================================================================\n"; #foreach $member (@intersectionNoValue){print "$member\n";} #undef @intersection; #$intersectionCounter = 0; #foreach $file (keys %annotationGraph){ # $intersectionCounter++; # if ($intersectionCounter == 1) { # @intersection{keys %file} = (); # } else { # @intersection{ @intersection, keys %intersection} = (); # } #} #$intersectionSize = 0; #foreach $member (@intersection){ # print "$intersectionSize -- $member\n"; #}