#!/usr/local/bin/perl -w
# wsj2rm: convert a part of the Wall Street Journal corpus to the
#         NP Chunking format used by Ramshaw and Marcus WVLC95
# usage:  wsj2rm files
#         (when no files are given, the input is read from standard input)
# notes:  1. input format: as used in the combined section of the Penn
#            Treebank Project: Release 2 CDROM
#         2. output format: word, POS tag, IOB tag (separated by spaces)
#         3. the POS tag in the output is the tag assigned by WSJ corpus.
#            the tags used in the WVLC95 experiments were assigned by
#            the Brill tagger: ftp://ftp.cs.jhu.edu/pub/brill/Programs
#            file RULE_BASED_TAGGER_V.1.14.tar.Z
#         4. the IOB tags mean: I: inside a baseNP; O: outside a baseNP;
#            B: the first word in a base NP
#         5. this is not the software as used by Ramshaw and Marcus but
#            an independently written version based on their data files.
# reference: Lance A. Ramshaw and Mitchell P. Marcus, Text Chunking Using
#            Transformation-Based Learning, "Proceedings of the Third ACL
#            Workshop on Very Large Corpora", 1995.
#            ftp://ftp.cis.upenn.edu/pub/chunker/wvlcbook.ps.gz
# 981022 Erik Tjong Kim Sang, University of Antwerp, erikt@uia.ua.ac.be
# 981214-981219 revised to get 100% NP accuracy
# 981230 removed bug
# 990223 added extra patches in processTag

use strict;
use warnings;

# Boolean values used for every flag below.
my $true  = 1;
my $false = 0;

# Parser state shared by all subroutines.
my @tags          = ();      # stack of the labels of the currently open brackets
my @npDiscarded   = ();      # one flag per open NP/WHNP (pushed on open, popped on close)
my $chunkTag      = "O";     # chunk tag given to the most recent word
my $lastChunkTag  = "O";     # chunk tag of the last word printed or buffered
my $buffer        = "";      # pending output lines ("word tag chunk\n") of the current chunk
my $lastField     = "";      # previous non-blank token; "(" means the next token is a label
my $ignoreCloseNP = $false;  # when set, the next NP close does not flush the buffer
my $sentenceSeen  = $false;  # set once a label has been read since the last sentence end
my $SClosedNP     = $false;  # set when an S-type label flushed a buffer ending in I/B
my $POSLevel      = -1;      # nesting level of the POS label that restarted a chunk, or -1
my $level         = 0;       # current bracket nesting depth
my $NPStart       = $false;  # when set, the next word starts a new buffered chunk
my $tag           = "";      # label currently being opened or closed
my $field         = "";      # token currently being processed by processLine

# printBuffer: write the buffered lines to standard output and clear the
# buffer. $lastChunkTag becomes "O" when the buffer ends in an O line and
# "I" when it ends in an I or B line; $POSLevel is reset to -1.
# Exits with status 1 when the buffer does not end in a chunk tag line.
sub printBuffer {
    print $buffer;
    if ( $buffer =~ /O\n$/ ) {
        $lastChunkTag = "O";
    }
    elsif ( $buffer =~ /[IB]\n$/ ) {
        $lastChunkTag = "I";
    }
    else {
        print STDERR "cannot happen: buffer=$buffer\n";
        exit(1);
    }
    $buffer   = "";
    $POSLevel = -1;
}

# processClosingBracket: handle a ")" token. Pops the label of the bracket
# being closed; for NP/WHNP labels the buffered chunk is flushed (unless
# $ignoreCloseNP asks to keep it open) and the matching @npDiscarded entry
# is removed. A buffer kept open by a POS label is flushed once the nesting
# level has dropped far enough below $POSLevel.
sub processClosingBracket {
    $level--;
    $tag = pop(@tags);
    if ( ( $tag =~ /^NP/ || $tag =~ /^WHNP/ ) ) {
        $NPStart = $false;
        # NP continues after POS tag
        if ( $ignoreCloseNP ) {
            $ignoreCloseNP = $false;
        }
        # no single SYM in NP
        elsif ( $buffer =~ /^[^ ]* SYM [IB]$/ ) {
            $buffer =~ s/[IB]$/O/;
            printBuffer();
        }
        elsif ( $buffer ne "" ) {
            printBuffer();
        }
        # if S closed NP then do not cause bug
        elsif ( $SClosedNP ) {
            $lastChunkTag = "I";
            $SClosedNP    = $false;
        }
        pop(@npDiscarded);
    }
    if ( $POSLevel > $level+2 ) {
        if ( $buffer eq "" ) {
            # print, not printf: the message is data, not a format string
            print STDERR "cannot happen: $POSLevel $level $tag\n";
            exit(1);
        }
        else {
            # a buffer holding only one line: a lone "POS B" becomes "POS I"
            if ( $buffer !~ /\n.*\n/ ) {
                $buffer =~ s/POS B\n/POS I\n/g;
            }
            printBuffer();
        }
    }
}

# processTag: handle a bracket label (the token that follows "("), taken
# from $field. Pushes the label on @tags and updates the chunking state for
# NP/WHNP labels, for POS labels whose grandparent is an NP, and for labels
# starting with S (except SYM and SBAR-ADV).
# The $tags[$#tags-N] lookups deliberately keep Perl's negative-index
# behaviour for shallow stacks; the output was tuned with it in place.
sub processTag {
    $level++;
    $sentenceSeen = $true;
    $tag = $field;
    push(@tags,($tag));
    if ( $tag =~ /^NP/ || $tag =~ /^WHNP/ ) {
        $SClosedNP = $false;
        if ( $buffer ne "" ) {
            if ( $tags[$#tags-2] =~ /^SBAR-ADV/ ) {
                # fix bug file wsj_1829.mrg line 49
                pop(@npDiscarded);
                push(@npDiscarded,($true));
                push(@npDiscarded,($true));
            }
            else {
                # print non-empty buffer
                $buffer =~ s/[IB]\n/O\n/g;
                $buffer =~ s/POS O\n/POS I\n/g;
                if ( $POSLevel < 0 || $buffer =~ /\n.*\n/ ) {
                    $npDiscarded[$#npDiscarded] = $true;
                }
                printBuffer();
                $NPStart = $true;
                push(@npDiscarded,($false));
            }
        }
        else {
            # buffer was empty
            $NPStart = $true;
            push(@npDiscarded,($false));
        }
    }
    if ( $tag eq "POS" && $tags[$#tags-2] =~ /^NP/ ) {
        if ( $#npDiscarded>0 ) {
            if ( ! $npDiscarded[$#npDiscarded-1] ) {
                if ( $buffer ne "" ) {
                    printBuffer();
                }
                $ignoreCloseNP = $true;
                $POSLevel      = $level;
                $NPStart       = $true;
            }
        }
    }
    if ( $tag =~ /^S/ && $tag !~ /^SYM/ && $tag !~ /^SBAR-ADV/ ) {
        if ( $NPStart ) {
            $NPStart = $false;
            pop(@npDiscarded);
            push(@npDiscarded,($true));
        }
        # the next ten lines contain some hacks in order to get
        # exactly the same output as RM95 for sections 15-18+20
        # of the WSJ corpus. Ideally they should be replaced by
        # if ( $buffer ne "" ) {
        if ( ( $buffer ne "" ) &&
             ( $tag ne "SBAR" || $tags[$#tags-1] =~ /^NP/ ) ) {
            if ( $tags[$#tags-3] !~ /P$/ && $tags[$#tags-3] !~ /P-/ &&
                 $tags[$#tags-2] !~ /P$/ && $tags[$#tags-2] !~ /P-/ &&
                 $tags[$#tags-1] !~ /P$/ && $tags[$#tags-1] !~ /P-/ ) {
                $buffer =~ s/[IB]\n/O\n/g;
            }
            if ( $tags[$#tags-2] ne "PP-CLR" ) {
                $buffer =~ s/^`` `` [IB]\n$/`` `` O\n/;
            }
            printBuffer();
            if ( $lastChunkTag ne "O" ) {
                $lastChunkTag = "O"; # bug in RM95 files: should be "I"
                $SClosedNP    = $true;
            }
        }
    }
}

# processWord: handle a word token taken from $field; its POS tag is the
# label on top of @tags. The word is appended to a non-empty buffer with
# chunk tag I, starts a new buffer when $NPStart is set (I after an O word,
# B otherwise), or is printed immediately with chunk tag O.
sub processWord {
    if ( $buffer ne "" ) {
        $chunkTag = "I";
        $buffer   = $buffer . "$field $tags[$#tags] $chunkTag\n";
    }
    elsif ( $NPStart ) {
        if ( $lastChunkTag eq "O" ) {
            $chunkTag = "I";
        }
        else {
            $chunkTag = "B";
        }
        $buffer  = $buffer . "$field $tags[$#tags] $chunkTag\n";
        $NPStart = $false;
    }
    else {
        $chunkTag = "O";
        print("$field $tags[$#tags] $chunkTag\n");
        if ( $#tags>0 ) {
            # an O word directly inside an NP/WHNP flags that NP
            if ( $tags[$#tags-1] =~ /^NP/ || $tags[$#tags-1] =~ /^WHNP/ ) {
                $npDiscarded[$#npDiscarded] = $true;
            }
        }
        $SClosedNP = $false;
    }
    $lastChunkTag = $chunkTag;
}

# processLine: process one line of treebank input (passed as the only
# argument). The line is split into "(", ")", blanks and other tokens, and
# each non-blank token is dispatched to processClosingBracket, processTag
# or processWord. When all brackets are closed after at least one label
# was seen, an empty line is printed to end the sentence.
sub processLine {
    my ($line) = @_;
    # chomp rather than chop: a final line without a newline must not
    # lose its last character
    chomp($line);
    # do not process lines with empty NPs
    if ( $line !~ /^[^\)]*NP[^\)]*-NONE[^\)]*\) \)[^\)]*$/ ) {
        my @fields = split(/([() ])/,$line);
        foreach my $piece (@fields) {
            $field = "$piece";
            next if ( $field eq " " || $field eq "" );
            if ( $field eq ")" ) {
                processClosingBracket();
            }
            elsif ( $lastField eq "(" ) {
                processTag();
            }
            elsif ( "$field" ne "(" && "$tags[$#tags]" ne "-NONE-" ) {
                # tokens below a -NONE- label are never output
                processWord();
            }
            $lastField = $field;
        }
        if ( $#tags<0 && $sentenceSeen ) {
            # processing end of sentence
            print("\n");
            $lastChunkTag = "O";
            $sentenceSeen = $false;
            # every NP opened in the sentence must have been closed again;
            # the former test "! $#npDiscarded<0" parsed as
            # "(!$#npDiscarded) < 0" and therefore could never be true
            if ( $#npDiscarded >= 0 ) {
                print STDERR "cannot happen: npDiscarded $#npDiscarded\n";
                exit(1);
            }
        }
    }
}

# main: process every file named on the command line, or standard input
# when no file names were given.
if ( @ARGV ) {
    foreach my $fileName (@ARGV) {
        open( my $fh, '<', $fileName )
            or die "wsj2rm: cannot open file $fileName: $!\n";
        while ( my $inputLine = <$fh> ) {
            processLine($inputLine);
        }
        close($fh);
    }
}
else {
    while ( my $inputLine = <STDIN> ) {
        processLine($inputLine);
    }
}
exit(0);