-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexpand.pl
114 lines (109 loc) · 3.19 KB
/
expand.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/perl
use strict;
use warnings;
my %rev = (
'@' => '~',
'~' => '@',
'@i' => '~i',
'~i' => '@i',
'#m' => '%m',
'%m' => '#m',
'#s' => '%s',
'%s' => '#s',
'#p' => '%p',
'%p' => '#p',
';c' => '-c',
'-c' => ';c',
';r' => '-r',
'-r' => ';r',
';u' => '-u',
'-u' => ';u',
);
sub process_data_file
{
(my $file) = @_;
my $breisfile = $file;
$breisfile =~ s/^data/breis/;
my $outputfile = $file;
$outputfile =~ s/\./plus./;
my %toinsert; # reversed relations to insert to actual PWN
# Princeton WordNet data.* files are ASCII only
my @breislines;
open(BREISFILE, "<", $breisfile) or die "Could not open $breisfile: $!\n";
while (<BREISFILE>) {
next if (/^ /);
push @breislines, $_;
chomp;
(my $synset_offset, my $lex_filenum, my $ss_type, my $w_cnt, my $rest) = /^([0-9]{8}) ([0-9][0-9]) ([nvasr]) ([0-9a-f][0-9a-f]) (.+)$/;
my $decimal_words = hex($w_cnt);
for (my $i=0; $i < $decimal_words; $i++) {
$rest =~ s/^([^ ]+) ([0-9a-z]) //;
}
$rest =~ s/^([0-9]{3}) //;
my $p_cnt = $1;
for (my $i=0; $i < $p_cnt; $i++) {
$rest =~ s/^([^ ]+) ([0-9]{8}) ([nvasr]) ([0-9a-f]{4}) //;
my $pointer_symbol=$1;
my $offset=$2;
my $pos=$3;
my $sourcetarget=$4;
push @{$toinsert{"$offset $pos"}}, $rev{$pointer_symbol}." $synset_offset $ss_type $sourcetarget";
}
unless ($rest =~ m/^\| /) {
print STDERR "Warning: line $. malformed in $breisfile: $rest\n";
}
}
close BREISFILE;
open(OUTPUTFILE, ">:utf8", $outputfile) or die "Could not open $outputfile: $!\n";
# Princeton WordNet data.* files are ASCII only
open(DATAFILE, "<", $file) or die "Could not open $file: $!\n";
while (<DATAFILE>) {
chomp;
next if (/^ /);
if ($file eq 'data.verb' and m/^02423762/) {
# fix bug in upstream WordNet files; see email from
# Francis Bond 29 Mar 2019 re: Some cycles in wordnets
# This is the fix employed by NLTK, cf
# https://github.com/nltk/nltk/issues/1230
s/@ 02422663 v/@ 00612841 v/;
}
my $line = $_;
(my $synset_offset, my $lex_filenum, my $ss_type, my $w_cnt, my $rest) = /^([0-9]{8}) ([0-9][0-9]) ([nvasr]) ([0-9a-f][0-9a-f]) (.+)$/;
if (exists($toinsert{"$synset_offset $ss_type"})) {
print OUTPUTFILE "$synset_offset $lex_filenum $ss_type $w_cnt ";
my $decimal_words = hex($w_cnt);
for (my $i=0; $i < $decimal_words; $i++) {
$rest =~ s/^([^ ]+) ([0-9a-z]) //;
print OUTPUTFILE "$1 $2 ";
}
$rest =~ s/^([0-9]{3}) //;
my $p_cnt = $1;
my $new_p_cnt = $p_cnt + scalar(@{$toinsert{"$synset_offset $ss_type"}});
print OUTPUTFILE sprintf("%03d", $new_p_cnt).' ';
for (my $i=0; $i < $p_cnt; $i++) {
$rest =~ s/^([^ ]+) ([0-9]{8}) ([nvasr]) ([0-9a-f]{4}) //;
my $pointer_symbol=$1;
my $offset=$2;
my $pos=$3;
my $sourcetarget=$4;
print OUTPUTFILE "$pointer_symbol $offset $pos $sourcetarget ";
}
for my $newp (@{$toinsert{"$synset_offset $ss_type"}}) {
print OUTPUTFILE "$newp ";
}
print OUTPUTFILE "$rest\n";
}
else { # no change needed
print OUTPUTFILE "$line\n";
}
}
close DATAFILE;
for my $l (@breislines) {
print OUTPUTFILE $l;
}
close OUTPUTFILE;
}
process_data_file('data.adj');
process_data_file('data.adv');
process_data_file('data.noun');
process_data_file('data.verb');