#!/usr/local/bin/perl -w
# wsj2conll: convert a part of the Wall Street Journal corpus to the
#            NP bracketing format for the CoNLL-99 shared task
# usage:     wsj2conll files
# notes:     - related url: http://lcg-www.uia.ac.be/conll99/npb/
#            - in order to obtain complete data files you have to combine
#              the output of this script with word and POS tag information.
#              For sections 15-18 and 20 this information can be found on
#              ftp://ftp.cis.upenn.edu/pub/chunker/
#            - input format: as used in the combined section of the Penn
#              Treebank Project: Release 2 CDROM
#            - brackets are put in the same positions as in the WSJ corpus
#              and thus they differ for baseNP brackets from Ramshaw &
#              Marcus WVLP95 (notable example: genitive 's)
#            - earlier version names: wsj2rm, wsj2np
# 990412 Erik Tjong Kim Sang, University of Antwerp, erikt@uia.ua.ac.be

use strict;
use warnings;

# Parser state shared by the subs below.  It deliberately persists across
# calls to processLine because one bracketed sentence may span several
# input lines.
my @tags      = ();    # stack of labels of the constituents currently open
my $buffer    = "";    # "(", ")" and "*" symbols collected for the sentence
my $lastField = "";    # previous non-blank token; "(" means a label follows

# isNounPhrase($tag): true (1) when the label starts with NP or WHNP,
# false (0) otherwise.  These are the only constituents that get brackets.
sub isNounPhrase {
    my ($tag) = @_;
    return ( $tag =~ /^NP/ || $tag =~ /^WHNP/ ) ? 1 : 0;
}

# processWord(): record one word of the sentence as "*".
sub processWord {
    $buffer .= "*";
    return;
}

# processClosingBracket(): close the innermost open constituent and emit
# ")" when it was a noun phrase.
sub processClosingBracket {
    my $tag = pop(@tags);

    # an unbalanced ")" pops nothing; ignore it instead of matching undef
    if ( defined($tag) && isNounPhrase($tag) ) {
        $buffer .= ")";
    }
    return;
}

# processTag($tag): open a constituent labelled $tag and emit "(" when it
# is a noun phrase.  The token following an opening bracket is always
# treated as a label, so a second "(" directly after a "(" is pushed here
# as a label as well and is popped again by its matching ")".
sub processTag {
    my ($tag) = @_;
    push( @tags, $tag );
    if ( isNounPhrase($tag) ) {
        $buffer .= "(";
    }
    return;
}

# formatBuffer($text): turn a bracket/word string into the output text for
# one sentence and return it.
#   - empty bracket pairs "()" are removed until none are left
#   - a doubled pair around a single word, "((*))", is reduced to "(*)"
#   - a newline is inserted before every symbol except one that follows
#     "(" and except ")", so each "*" starts its own line with its
#     brackets attached; the sentence ends with an empty line
sub formatBuffer {
    my ($text) = @_;

    # removing a pair can expose a new one, hence repeat until stable
    1 while $text =~ s/\(\)//g;
    1 while $text =~ s/\(\(\*\)\)/(*)/g;

    my $out   = "";
    my $lastC = "(";    # suppresses the newline before the first symbol
    foreach my $c ( split( //, $text ) ) {
        if ( $lastC ne "(" && $c ne ")" ) {
            $out .= "\n";
        }
        $out .= $c;
        $lastC = $c;
    }
    return $out . "\n\n";
}

# printBuffer(): write the formatted current sentence to the selected
# output handle.
sub printBuffer {
    print formatBuffer($buffer);
    return;
}

# processLine($line): consume one input line, updating the parser state,
# and print the sentence as soon as all of its brackets are closed.
sub processLine {
    my ($line) = @_;

    # chomp, not chop: a last line without a newline must keep its final ")"
    chomp($line);

    # the capturing group makes split return the delimiters as tokens too
    foreach my $field ( split( /([() ])/, $line ) ) {
        next if $field eq " " || $field eq "";

        if ( $field eq ")" ) {
            processClosingBracket();
        }
        elsif ( $lastField eq "(" ) {
            processTag($field);
        }
        elsif ( $field ne "(" && ( !@tags || $tags[-1] ne "-NONE-" ) ) {
            # tokens directly under a -NONE- label are skipped
            processWord();
        }
        $lastField = $field;
    }

    if ( !@tags && $buffer ne "" ) {
        # processing end of sentence
        printBuffer();
        $buffer = "";
    }
    return;
}

# processHandle($fh): feed every line read from the handle to processLine.
sub processHandle {
    my ($fh) = @_;
    while ( my $line = <$fh> ) {
        processLine($line);
    }
    return;
}

# processFile($fileName): convert one file; dies when it cannot be opened.
sub processFile {
    my ($fileName) = @_;

    # three-argument open: the name is never interpreted as an open mode
    open( my $fh, '<', $fileName )
        or die "wsj2conll: cannot open file $fileName\n";
    processHandle($fh);
    close($fh);
    return;
}

# main routine: convert the files named on the command line, or standard
# input when no file is given.
sub main {
    if ( !@ARGV ) {
        processHandle( \*STDIN );
    }
    else {
        foreach my $fileName (@ARGV) {
            processFile($fileName);
        }
    }
    return;
}

# run only when executed as a script, so the subs above can also be loaded
# from another file without reading any input
main() unless caller;

1;